On Wed, 28 Sep 2005, Jay Lan wrote:
>
> I am not concerned about do_munmap any more from updating
> hiwater_vm's concern since the total_vm gets decreased, not increased.
Many thanks for that remark, it inspired me to do the exact opposite!
> In case you are going to turn hiwater_vm update into a macro and hide it
> in a config flag, here is what i have that might be useful.
And thanks for your patch, which I've factored in with Frank's, and come
come up with the one below, against 2.6.14-rc2-mm2. Which uses no CONFIG
flag: I think we're enough out of the fast paths that it's not needed.
See comment in fs/proc/task_mmu.c for the principle. Could maintain
hiwater_vm straightforwardly, but I think it's easier to remember if
we handle them both in the same way.
I did look into doing the total_vm increment and calling vm_stat_account
in insert_vm_struct, but concluded it solved no particular problem, and
raised some questions (where architectures, notably ia64, have special
vmas which they may have good reason to leave out of total_vm).
I haven't cross-checked the mm_struct cacheline rearrangement yet,
it looks plausible, but could easily turn out to straddle boundaries.
Christoph, Frank, Jay: does this patch look like it fits your needs?
Hugh
--- 2.6.14-rc2-mm2/fs/compat.c 2005-09-21 12:16:40.000000000 +0100
+++ linux/fs/compat.c 2005-10-03 19:42:49.000000000 +0100
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
/* execve success */
security_bprm_free(bprm);
acct_update_integrals(current);
- update_mem_hiwater(current);
kfree(bprm);
return retval;
}
--- 2.6.14-rc2-mm2/fs/exec.c 2005-09-30 11:59:08.000000000 +0100
+++ linux/fs/exec.c 2005-10-03 19:42:49.000000000 +0100
@@ -1207,7 +1207,6 @@ int do_execve(char * filename,
/* execve success */
security_bprm_free(bprm);
acct_update_integrals(current);
- update_mem_hiwater(current);
kfree(bprm);
return retval;
}
--- 2.6.14-rc2-mm2/fs/proc/task_mmu.c 2005-09-30 11:59:09.000000000 +0100
+++ linux/fs/proc/task_mmu.c 2005-10-03 19:42:49.000000000 +0100
@@ -14,22 +14,41 @@
char *task_mem(struct mm_struct *mm, char *buffer)
{
unsigned long data, text, lib;
+ unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+ /*
+ * Note: to minimize their overhead, mm maintains hiwater_vm and
+ * hiwater_rss only when about to *lower* total_vm or rss. Any
+ * collector of these hiwater stats must therefore get total_vm
+ * and rss too, which will usually be the higher. Barriers? not
+ * worth the effort, such snapshots can always be inconsistent.
+ */
+ hiwater_vm = total_vm = mm->total_vm;
+ if (hiwater_vm < mm->hiwater_vm)
+ hiwater_vm = mm->hiwater_vm;
+ hiwater_rss = total_rss = get_mm_rss(mm);
+ if (hiwater_rss < mm->hiwater_rss)
+ hiwater_rss = mm->hiwater_rss;
+
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
buffer += sprintf(buffer,
+ "VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
"VmLck:\t%8lu kB\n"
+ "VmHWM:\t%8lu kB\n"
"VmRSS:\t%8lu kB\n"
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
"VmPTE:\t%8lu kB\n",
- (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+ hiwater_vm << (PAGE_SHIFT-10),
+ (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
- get_mm_rss(mm) << (PAGE_SHIFT-10),
+ hiwater_rss << (PAGE_SHIFT-10),
+ total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
--- 2.6.14-rc2-mm2/include/linux/mm.h 2005-09-30 11:59:11.000000000 +0100
+++ linux/include/linux/mm.h 2005-10-03 19:42:49.000000000 +0100
@@ -945,9 +945,6 @@ static inline void vm_stat_account(struc
}
#endif /* CONFIG_PROC_FS */
-/* update per process rss and vm hiwater data */
-extern void update_mem_hiwater(struct task_struct *tsk);
-
#ifndef CONFIG_DEBUG_PAGEALLOC
static inline void
kernel_map_pages(struct page *page, int numpages, int enable)
--- 2.6.14-rc2-mm2/include/linux/sched.h 2005-09-30 11:59:11.000000000 +0100
+++ linux/include/linux/sched.h 2005-10-03 19:42:49.000000000 +0100
@@ -245,6 +245,16 @@ extern void arch_unmap_area_topdown(stru
#define dec_mm_counter(mm, member) (mm)->_##member--
#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
+#define update_hiwater_rss(mm) do { \
+ unsigned long _rss = get_mm_rss(mm); \
+ if ((mm)->hiwater_rss < _rss) \
+ (mm)->hiwater_rss = _rss; \
+} while (0)
+#define update_hiwater_vm(mm) do { \
+ if ((mm)->hiwater_vm < (mm)->total_vm) \
+ (mm)->hiwater_vm = (mm)->total_vm; \
+} while (0)
+
typedef unsigned long mm_counter_t;
struct mm_struct {
@@ -270,16 +280,19 @@ struct mm_struct {
* by mmlist_lock
*/
- unsigned long start_code, end_code, start_data, end_data;
- unsigned long start_brk, brk, start_stack;
- unsigned long arg_start, arg_end, env_start, env_end;
- unsigned long total_vm, locked_vm, shared_vm;
- unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
-
/* Special counters protected by the page_table_lock */
mm_counter_t _file_rss;
mm_counter_t _anon_rss;
+ unsigned long hiwater_rss; /* High-watermark of RSS usage */
+ unsigned long hiwater_vm; /* High-water virtual memory usage */
+
+ unsigned long total_vm, locked_vm, shared_vm, exec_vm;
+ unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
+ unsigned long start_code, end_code, start_data, end_data;
+ unsigned long start_brk, brk, start_stack;
+ unsigned long arg_start, arg_end, env_start, env_end;
+
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
unsigned dumpable:2;
@@ -299,11 +312,7 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
-
struct kioctx default_kioctx;
-
- unsigned long hiwater_rss; /* High-water RSS usage */
- unsigned long hiwater_vm; /* High-water virtual memory usage */
};
struct sighand_struct {
--- 2.6.14-rc2-mm2/kernel/exit.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/kernel/exit.c 2005-10-03 19:42:49.000000000 +0100
@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long co
preempt_count());
acct_update_integrals(tsk);
- update_mem_hiwater(tsk);
+ if (tsk->mm) {
+ update_hiwater_rss(tsk->mm);
+ update_hiwater_vm(tsk->mm);
+ }
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
del_timer_sync(&tsk->signal->real_timer);
--- 2.6.14-rc2-mm2/kernel/sched.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/kernel/sched.c 2005-10-03 19:42:49.000000000 +0100
@@ -2603,8 +2603,6 @@ void account_system_time(struct task_str
cpustat->idle = cputime64_add(cpustat->idle, tmp);
/* Account for system time used */
acct_update_integrals(p);
- /* Update rss highwater mark */
- update_mem_hiwater(p);
}
/*
--- 2.6.14-rc2-mm2/mm/fremap.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/mm/fremap.c 2005-10-03 19:42:49.000000000 +0100
@@ -20,13 +20,12 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
pte_t pte = *ptep;
+ int did_zap_file_pte = 0;
- if (pte_none(pte))
- return;
if (pte_present(pte)) {
unsigned long pfn = pte_pfn(pte);
@@ -39,7 +38,7 @@ static inline void zap_pte(struct mm_str
set_page_dirty(page);
page_remove_rmap(page);
page_cache_release(page);
- dec_mm_counter(mm, file_rss);
+ did_zap_file_pte = 1;
}
}
} else {
@@ -47,6 +46,8 @@ static inline void zap_pte(struct mm_str
free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear(mm, addr, ptep);
}
+
+ return did_zap_file_pte;
}
/*
@@ -90,9 +91,9 @@ int install_page(struct mm_struct *mm, s
if (!page->mapping || page->index >= size)
goto err_unlock;
- zap_pte(mm, vma, addr, pte);
+ if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
+ inc_mm_counter(mm, file_rss);
- inc_mm_counter(mm, file_rss);
flush_icache_page(vma, page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
page_add_file_rmap(page);
@@ -137,7 +138,10 @@ int install_file_pte(struct mm_struct *m
if (!pte)
goto err_unlock;
- zap_pte(mm, vma, addr, pte);
+ if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
+ update_hiwater_rss(mm);
+ dec_mm_counter(mm, file_rss);
+ }
set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
pte_val = *pte;
--- 2.6.14-rc2-mm2/mm/hugetlb.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/mm/hugetlb.c 2005-10-03 19:42:49.000000000 +0100
@@ -309,6 +309,9 @@ void unmap_hugepage_range(struct vm_area
BUG_ON(start & ~HPAGE_MASK);
BUG_ON(end & ~HPAGE_MASK);
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (! ptep)
--- 2.6.14-rc2-mm2/mm/memory.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/mm/memory.c 2005-10-03 19:42:49.000000000 +0100
@@ -800,6 +800,7 @@ unsigned long zap_page_range(struct vm_a
lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
+ update_hiwater_rss(mm);
end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
tlb_finish_mmu(tlb, address, end);
spin_unlock(&mm->page_table_lock);
@@ -2205,22 +2206,6 @@ unsigned long vmalloc_to_pfn(void * vmal
EXPORT_SYMBOL(vmalloc_to_pfn);
-/*
- * update_mem_hiwater
- * - update per process rss and vm high water data
- */
-void update_mem_hiwater(struct task_struct *tsk)
-{
- if (tsk->mm) {
- unsigned long rss = get_mm_rss(tsk->mm);
-
- if (tsk->mm->hiwater_rss < rss)
- tsk->mm->hiwater_rss = rss;
- if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
- tsk->mm->hiwater_vm = tsk->mm->total_vm;
- }
-}
-
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
--- 2.6.14-rc2-mm2/mm/mmap.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/mm/mmap.c 2005-10-03 19:42:49.000000000 +0100
@@ -1625,6 +1625,8 @@ find_extend_vma(struct mm_struct * mm, u
*/
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
+ /* Update high watermark before we lower total_vm */
+ update_hiwater_vm(mm);
do {
long nrpages = vma_pages(vma);
@@ -1653,6 +1655,7 @@ static void unmap_region(struct mm_struc
lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
+ update_hiwater_rss(mm);
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
@@ -1938,6 +1941,7 @@ void exit_mmap(struct mm_struct *mm)
flush_cache_mm(mm);
tlb = tlb_gather_mmu(mm, 1);
+ /* Don't update_hiwater_rss(mm) here, do_exit already did */
/* Use -1 here to ensure all VMAs in the mm are unmapped */
end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
--- 2.6.14-rc2-mm2/mm/mremap.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/mm/mremap.c 2005-10-03 19:42:49.000000000 +0100
@@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_
unsigned long new_pgoff;
unsigned long moved_len;
unsigned long excess = 0;
+ unsigned long hiwater_vm;
int split = 0;
/*
@@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_
}
/*
- * if we failed to move page tables we still do total_vm increment
- * since do_munmap() will decrement it by old_len == new_len
+ * If we failed to move page tables we still do total_vm increment
+ * since do_munmap() will decrement it by old_len == new_len.
+ *
+ * Since total_vm is about to be raised artificially high for a
+ * moment, we need to restore high watermark afterwards: if stats
+ * are taken meanwhile, total_vm and hiwater_vm appear too high.
+ * If this were a serious issue, we'd add a flag to do_munmap().
*/
+ hiwater_vm = mm->hiwater_vm;
mm->total_vm += new_len >> PAGE_SHIFT;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
@@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
+ mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
if (excess) {
--- 2.6.14-rc2-mm2/mm/nommu.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/mm/nommu.c 2005-10-03 19:42:49.000000000 +0100
@@ -239,6 +239,7 @@ void vunmap(void *addr)
asmlinkage unsigned long sys_brk(unsigned long brk)
{
struct mm_struct *mm = current->mm;
+ long grow;
if (brk < mm->start_brk || brk > mm->context.end_brk)
return mm->brk;
@@ -246,6 +247,12 @@ asmlinkage unsigned long sys_brk(unsigne
if (mm->brk == brk)
return mm->brk;
+ /* Update high watermark before we may lower total_vm */
+ update_hiwater_vm(mm);
+
+ grow = PAGE_ALIGN(brk) - PAGE_ALIGN(mm->brk);
+ mm->total_vm += grow / PAGE_SIZE;
+
/*
* Always allow shrinking brk
*/
@@ -679,6 +686,7 @@ unsigned long do_mmap_pgoff(struct file
unsigned long flags,
unsigned long pgoff)
{
+ struct mm_struct *mm = current->mm;
struct vm_list_struct *vml = NULL;
struct vm_area_struct *vma = NULL;
struct rb_node *rb;
@@ -813,7 +821,7 @@ unsigned long do_mmap_pgoff(struct file
realalloc += kobjsize(vma);
askedalloc += sizeof(*vma);
- current->mm->total_vm += len >> PAGE_SHIFT;
+ mm->total_vm += len >> PAGE_SHIFT;
add_nommu_vma(vma);
@@ -821,8 +829,8 @@ unsigned long do_mmap_pgoff(struct file
realalloc += kobjsize(vml);
askedalloc += sizeof(*vml);
- vml->next = current->mm->context.vmlist;
- current->mm->context.vmlist = vml;
+ vml->next = mm->context.vmlist;
+ mm->context.vmlist = vml;
up_write(&nommu_vma_sem);
@@ -1075,19 +1083,6 @@ void arch_unmap_area(struct mm_struct *m
{
}
-void update_mem_hiwater(struct task_struct *tsk)
-{
- unsigned long rss;
-
- if (likely(tsk->mm)) {
- rss = get_mm_rss(tsk->mm);
- if (tsk->mm->hiwater_rss < rss)
- tsk->mm->hiwater_rss = rss;
- if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
- tsk->mm->hiwater_vm = tsk->mm->total_vm;
- }
-}
-
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen,
int even_cows)
--- 2.6.14-rc2-mm2/mm/rmap.c 2005-09-30 11:59:12.000000000 +0100
+++ linux/mm/rmap.c 2005-10-03 19:42:49.000000000 +0100
@@ -543,6 +543,9 @@ static int try_to_unmap_one(struct page
if (pte_dirty(pteval))
set_page_dirty(page);
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
if (PageAnon(page)) {
swp_entry_t entry = { .val = page->private };
/*
@@ -633,6 +636,9 @@ static void try_to_unmap_cluster(unsigne
if (!pmd_present(*pmd))
goto out_unlock;
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
for (original_pte = pte = pte_offset_map(pmd, address);
address < end; pte++, address += PAGE_SIZE) {
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]