Support migration of tasks across groups. Migration uses the accounting
information tracked in the mm_struct to add/delete RSS from the container as
a process migrates from one container to the next.
This patch also adds a /proc/<tid>/memacct interface for debugging purposes.
/proc/<tid>/memacct prints the RSS of the task in two ways:
1. As accounted by these patches (the counter kept in the mm_struct)
2. As counted by walking the page tables of the process
Signed-off-by: Balbir Singh <[email protected]>
---
fs/proc/base.c | 4
include/linux/memctlr.h | 9 +
include/linux/rmap.h | 6 -
kernel/res_group/memctlr.c | 228 ++++++++++++++++++++++++++++++++++++++++++---
mm/filemap_xip.c | 2
mm/fremap.c | 2
mm/memory.c | 6 -
mm/rmap.c | 6 -
8 files changed, 236 insertions(+), 27 deletions(-)
diff -puN kernel/res_group/memctlr.c~container-memctlr-task-migration kernel/res_group/memctlr.c
--- linux-2.6.19-rc2/kernel/res_group/memctlr.c~container-memctlr-task-migration 2006-11-09 21:56:49.000000000 +0530
+++ linux-2.6.19-rc2-balbir/kernel/res_group/memctlr.c 2006-11-09 21:56:49.000000000 +0530
@@ -31,10 +31,12 @@
#include <linux/module.h>
#include <linux/res_group_rc.h>
#include <linux/memctlr.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
static const char res_ctlr_name[] = "memctlr";
static struct resource_group *root_rgroup;
-static const char version[] = "0.01";
+static const char version[] = "0.05";
static struct memctlr *memctlr_root;
#define MEMCTLR_MAGIC 0xdededede
@@ -52,6 +54,7 @@ struct memctlr {
int successes;
int failures;
int magic;
+ spinlock_t lock;
};
struct res_controller memctlr_rg;
@@ -95,7 +98,7 @@ void mm_assign_container(struct mm_struc
rcu_read_unlock();
}
-static inline struct memctlr *get_memctlr_from_page(struct page *page)
+static inline struct memctlr *get_task_memctlr(struct task_struct *p)
{
struct resource_group *rgroup;
struct memctlr *res;
@@ -107,7 +110,7 @@ static inline struct memctlr *get_memctl
return NULL;
rcu_read_lock();
- rgroup = (struct resource_group *)rcu_dereference(current->container);
+ rgroup = (struct resource_group *)rcu_dereference(p->container);
rcu_read_unlock();
res = get_memctlr(rgroup);
@@ -119,31 +122,54 @@ static inline struct memctlr *get_memctl
}
-void memctlr_inc_rss(struct page *page)
+void memctlr_inc_rss_mm(struct page *page, struct mm_struct *mm)
{
struct memctlr *res;
- res = get_memctlr_from_page(page);
- if (!res)
+ res = get_task_memctlr(current);
+ if (!res) {
+ printk(KERN_INFO "inc_rss no res set *---*\n");
return;
+ }
- atomic_long_inc(&current->mm->counter->rss);
+ spin_lock(&res->lock);
+ atomic_long_inc(&mm->counter->rss);
atomic_long_inc(&res->counter.rss);
+ spin_unlock(&res->lock);
}
-void memctlr_dec_rss(struct page *page)
+/*
+ * Charge one page of RSS to the current task: bump both the counter kept
+ * in the task's mm and the counter of the memory controller returned by
+ * get_task_memctlr(current).
+ *
+ * get_task_mm() may return NULL, so that case returns before mm is
+ * dereferenced, and the reference it took is dropped on every exit path.
+ */
+void memctlr_inc_rss(struct page *page)
{
struct memctlr *res;
+	struct mm_struct *mm = get_task_mm(current);
+
+	if (!mm)
+		return;
- res = get_memctlr_from_page(page);
- if (!res)
+	res = get_task_memctlr(current);
+	if (!res) {
+		printk(KERN_INFO "inc_rss no res set *---*\n");
+		/* drop the reference taken by get_task_mm() above */
+		mmput(mm);
return;
+	}
- atomic_long_dec(&res->counter.rss);
+	spin_lock(&res->lock);
+	atomic_long_inc(&mm->counter->rss);
+	atomic_long_inc(&res->counter.rss);
+	spin_unlock(&res->lock);
+	mmput(mm);
+}
- if ((current->flags & PF_EXITING) && !current->mm)
+void memctlr_dec_rss(struct page *page, struct mm_struct *mm)
+{
+ struct memctlr *res;
+
+ res = get_task_memctlr(current);
+ if (!res) {
+ printk(KERN_INFO "dec_rss no res set *---*\n");
return;
- atomic_long_dec(&current->mm->counter->rss);
+ }
+
+ spin_lock(&res->lock);
+ atomic_long_dec(&res->counter.rss);
+ atomic_long_dec(&mm->counter->rss);
+ spin_unlock(&res->lock);
}
static void memctlr_init_new(struct memctlr *res)
@@ -154,6 +180,7 @@ static void memctlr_init_new(struct memc
res->shares.unused_min_shares = SHARE_DEFAULT_DIVISOR;
memctlr_init_mem_counter(&res->counter);
+ spin_lock_init(&res->lock);
}
static struct res_shares *memctlr_alloc_instance(struct resource_group *rgroup)
@@ -188,6 +215,122 @@ static void memctlr_free_instance(struct
kfree(res);
}
+/*
+ * Count the present ptes covering [addr, end) under @pmd.
+ *
+ * The pte page is mapped once before the walk and unmapped once after it,
+ * so pte_offset_map() and pte_unmap() stay paired even when non-present
+ * entries are skipped, and the pte++ in the loop condition actually
+ * advances the cursor.
+ */
+static long count_pte_rss(struct vm_area_struct *vma, pmd_t *pmd,
+				unsigned long addr, unsigned long end)
+{
+	pte_t *start_pte, *pte;
+	long count = 0;
+
+	start_pte = pte_offset_map(pmd, addr);
+	pte = start_pte;
+	do {
+		if (pte_present(*pte))
+			count++;
+	} while (pte++, addr += PAGE_SIZE, (addr != end));
+	pte_unmap(start_pte);
+
+	return count;
+}
+
+/*
+ * Add up the count_pte_rss() results for every pmd entry covering
+ * [addr, end) under @pud. Entries for which pmd_none_or_clear_bad()
+ * returns true are skipped.
+ */
+static long count_pmd_rss(struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end)
+{
+	pmd_t *entry = pmd_offset(pud, addr);
+	long total = 0;
+
+	do {
+		unsigned long boundary = pmd_addr_end(addr, end);
+
+		if (!pmd_none_or_clear_bad(entry))
+			total += count_pte_rss(vma, entry, addr, boundary);
+
+		entry++;
+		addr = boundary;
+	} while (addr != end);
+
+	return total;
+}
+
+/*
+ * Add up the count_pmd_rss() results for every pud entry covering
+ * [addr, end) under @pgd. Entries for which pud_none_or_clear_bad()
+ * returns true are skipped.
+ */
+static long count_pud_rss(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end)
+{
+	pud_t *entry = pud_offset(pgd, addr);
+	long total = 0;
+
+	do {
+		unsigned long boundary = pud_addr_end(addr, end);
+
+		if (!pud_none_or_clear_bad(entry))
+			total += count_pmd_rss(vma, entry, addr, boundary);
+
+		entry++;
+		addr = boundary;
+	} while (addr != end);
+
+	return total;
+}
+
+/*
+ * Walk the page tables of vma->vm_mm over [vma->vm_start, vma->vm_end)
+ * and return the sum of count_pud_rss() over every pgd entry in that
+ * range. Entries for which pgd_none_or_clear_bad() returns true are
+ * skipped.
+ */
+static long count_pgd_rss(struct vm_area_struct *vma)
+{
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+	pgd_t *entry = pgd_offset(vma->vm_mm, addr);
+	long total = 0;
+
+	do {
+		unsigned long boundary = pgd_addr_end(addr, end);
+
+		if (!pgd_none_or_clear_bad(entry))
+			total += count_pud_rss(vma, entry, addr, boundary);
+
+		entry++;
+		addr = boundary;
+	} while (addr != end);
+
+	return total;
+}
+
+/*
+ * Count the resident pages of task @p by running count_pgd_rss() over
+ * every vma on its mm's vma list. Returns 0 when get_task_mm() finds no
+ * mm for the task.
+ *
+ * The walk runs with mmap_sem held for read and page_table_lock held;
+ * the mm reference taken here is dropped before returning.
+ */
+static long count_rss(struct task_struct *p)
+{
+	long count = 0;	/* long, to match the helpers and the return type */
+	struct mm_struct *mm = get_task_mm(p);
+	struct vm_area_struct *vma;
+
+	if (!mm)
+		return 0;
+
+	down_read(&mm->mmap_sem);
+	spin_lock(&mm->page_table_lock);
+
+	/* mm->mmap is read only after the NULL check and under mmap_sem */
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		count += count_pgd_rss(vma);
+
+	spin_unlock(&mm->page_table_lock);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+	return count;
+}
+
+/*
+ * Format the memacct text for task @p into @buf and return the number of
+ * characters written.
+ *
+ * Two lines are produced: the RSS read from the counter in the task's mm,
+ * and the RSS obtained from count_rss(). If the task has no mm, a single
+ * explanatory line is written instead.
+ */
+int proc_memacct(struct task_struct *p, char *buf)
+{
+	struct mm_struct *mm = get_task_mm(p);
+	int len;
+
+	if (!mm)
+		return sprintf(buf, "no mm associated with the task\n");
+
+	len = sprintf(buf, "rss pages %ld\n",
+			atomic_long_read(&mm->counter->rss));
+	len += sprintf(buf + len, "pg table walk rss pages %ld\n",
+			count_rss(p));
+
+	mmput(mm);
+	return len;
+}
+
static ssize_t memctlr_show_stats(struct res_shares *shares, char *buf,
size_t len)
{
@@ -206,12 +349,69 @@ static ssize_t memctlr_show_stats(struct
return j;
}
+/*
+ * Take the locks of two distinct controllers. The lock at the higher
+ * address is always taken first, so the order does not depend on which
+ * controller is passed as @old and which as @new.
+ */
+static void double_res_lock(struct memctlr *old, struct memctlr *new)
+{
+	spinlock_t *outer, *inner;
+
+	BUG_ON(old == new);
+
+	if (&old->lock > &new->lock) {
+		outer = &old->lock;
+		inner = &new->lock;
+	} else {
+		outer = &new->lock;
+		inner = &old->lock;
+	}
+
+	spin_lock(outer);
+	spin_lock(inner);
+}
+
+/*
+ * Release the locks of two distinct controllers. The lock at the lower
+ * address is released first and the one at the higher address last.
+ */
+static void double_res_unlock(struct memctlr *old, struct memctlr *new)
+{
+	spinlock_t *outer, *inner;
+
+	BUG_ON(old == new);
+
+	if (&old->lock > &new->lock) {
+		outer = &old->lock;
+		inner = &new->lock;
+	} else {
+		outer = &new->lock;
+		inner = &old->lock;
+	}
+
+	spin_unlock(inner);
+	spin_unlock(outer);
+}
+
+/*
+ * move_task callback: transfer the RSS charge of task @p from the
+ * controller behind @old to the one behind @new.
+ *
+ * The RSS recorded in the task's mm is subtracted from the old
+ * controller and added to the new one while both controller locks are
+ * held; the mm is re-assigned to its container in between.
+ *
+ * Only tasks with pid == tgid are processed -- presumably so that a
+ * shared mm is moved once, by the thread group leader (TODO confirm).
+ */
+static void memctlr_move_task(struct task_struct *p, struct res_shares *old,
+				struct res_shares *new)
+{
+	struct memctlr *oldres, *newres;
+	struct mm_struct *mm;
+	long rss_pages;
+
+	if (old == new)
+		return;
+
+	/* Without both a source and a destination there is nothing to move */
+	if (!old || !new)
+		return;
+
+	if (p->pid != p->tgid)
+		return;
+
+	/*
+	 * If a task has no mm structure associated with it we have
+	 * nothing to do
+	 */
+	mm = p->mm;
+	if (!mm)
+		return;
+
+	oldres = get_memctlr_from_shares(old);
+	newres = get_memctlr_from_shares(new);
+
+	double_res_lock(oldres, newres);
+
+	rss_pages = atomic_long_read(&mm->counter->rss);
+	atomic_long_sub(rss_pages, &oldres->counter.rss);
+
+	mm_assign_container(mm, p);
+	atomic_long_add(rss_pages, &newres->counter.rss);
+
+	double_res_unlock(oldres, newres);
+}
+
struct res_controller memctlr_rg = {
.name = res_ctlr_name,
.ctlr_id = NO_RES_ID,
.alloc_shares_struct = memctlr_alloc_instance,
.free_shares_struct = memctlr_free_instance,
- .move_task = NULL,
+ .move_task = memctlr_move_task,
.shares_changed = NULL,
.show_stats = memctlr_show_stats,
};
diff -puN fs/proc/base.c~container-memctlr-task-migration fs/proc/base.c
--- linux-2.6.19-rc2/fs/proc/base.c~container-memctlr-task-migration 2006-11-09 21:56:49.000000000 +0530
+++ linux-2.6.19-rc2-balbir/fs/proc/base.c 2006-11-09 21:56:49.000000000 +0530
@@ -72,6 +72,7 @@
#include <linux/audit.h>
#include <linux/poll.h>
#include <linux/nsproxy.h>
+#include <linux/memctlr.h>
#include "internal.h"
/* NOTE:
@@ -1759,6 +1760,9 @@ static struct pid_entry tgid_base_stuff[
#ifdef CONFIG_NUMA
REG("numa_maps", S_IRUGO, numa_maps),
#endif
+#ifdef CONFIG_RES_GROUPS_MEMORY
+ INF("memacct", S_IRUGO, memacct),
+#endif
REG("mem", S_IRUSR|S_IWUSR, mem),
#ifdef CONFIG_SECCOMP
REG("seccomp", S_IRUSR|S_IWUSR, seccomp),
diff -puN include/linux/memctlr.h~container-memctlr-task-migration include/linux/memctlr.h
--- linux-2.6.19-rc2/include/linux/memctlr.h~container-memctlr-task-migration 2006-11-09 21:56:49.000000000 +0530
+++ linux-2.6.19-rc2-balbir/include/linux/memctlr.h 2006-11-09 21:56:49.000000000 +0530
@@ -30,15 +30,20 @@
extern int mm_init_mem_counter(struct mm_struct *mm);
extern void mm_assign_container(struct mm_struct *mm, struct task_struct *p);
extern void memctlr_inc_rss(struct page *page);
-extern void memctlr_dec_rss(struct page *page);
+extern void memctlr_inc_rss_mm(struct page *page, struct mm_struct *mm);
+extern void memctlr_dec_rss(struct page *page, struct mm_struct *mm);
extern void mm_free_mem_counter(struct mm_struct *mm);
+extern int proc_memacct(struct task_struct *task, char *buffer);
#else /* CONFIG_RES_GROUPS_MEMORY */
void memctlr_inc_rss(struct page *page)
{}
-void memctlr_dec_rss(struct page *page)
+void memctlr_inc_rss_mm(struct page *page, struct mm_struct *mm)
+{}
+
+void memctlr_dec_rss(struct page *page, struct mm_struct *mm)
{}
int mm_init_mem_counter(struct mm_struct *mm)
diff -puN mm/filemap_xip.c~container-memctlr-task-migration mm/filemap_xip.c
--- linux-2.6.19-rc2/mm/filemap_xip.c~container-memctlr-task-migration 2006-11-09 21:56:49.000000000 +0530
+++ linux-2.6.19-rc2-balbir/mm/filemap_xip.c 2006-11-09 21:56:49.000000000 +0530
@@ -189,7 +189,7 @@ __xip_unmap (struct address_space * mapp
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
pteval = ptep_clear_flush(vma, address, pte);
- page_remove_rmap(page);
+ page_remove_rmap(page, mm);
dec_mm_counter(mm, file_rss);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
diff -puN mm/fremap.c~container-memctlr-task-migration mm/fremap.c
--- linux-2.6.19-rc2/mm/fremap.c~container-memctlr-task-migration 2006-11-09 21:56:49.000000000 +0530
+++ linux-2.6.19-rc2-balbir/mm/fremap.c 2006-11-09 21:56:49.000000000 +0530
@@ -33,7 +33,7 @@ static int zap_pte(struct mm_struct *mm,
if (page) {
if (pte_dirty(pte))
set_page_dirty(page);
- page_remove_rmap(page);
+ page_remove_rmap(page, mm);
page_cache_release(page);
}
} else {
diff -puN mm/memory.c~container-memctlr-task-migration mm/memory.c
--- linux-2.6.19-rc2/mm/memory.c~container-memctlr-task-migration 2006-11-09 21:56:49.000000000 +0530
+++ linux-2.6.19-rc2-balbir/mm/memory.c 2006-11-09 21:56:49.000000000 +0530
@@ -481,7 +481,7 @@ copy_one_pte(struct mm_struct *dst_mm, s
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
- page_dup_rmap(page);
+ page_dup_rmap(page, dst_mm);
rss[!!PageAnon(page)]++;
}
@@ -681,7 +681,7 @@ static unsigned long zap_pte_range(struc
mark_page_accessed(page);
file_rss--;
}
- page_remove_rmap(page);
+ page_remove_rmap(page, mm);
tlb_remove_page(tlb, page);
continue;
}
@@ -1575,7 +1575,7 @@ gotten:
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, mm);
if (!PageAnon(old_page)) {
dec_mm_counter(mm, file_rss);
inc_mm_counter(mm, anon_rss);
diff -puN mm/rmap.c~container-memctlr-task-migration mm/rmap.c
--- linux-2.6.19-rc2/mm/rmap.c~container-memctlr-task-migration 2006-11-09 21:56:49.000000000 +0530
+++ linux-2.6.19-rc2-balbir/mm/rmap.c 2006-11-09 21:56:49.000000000 +0530
@@ -576,7 +576,7 @@ void page_add_file_rmap(struct page *pag
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, struct mm_struct *mm)
{
if (atomic_add_negative(-1, &page->_mapcount)) {
if (unlikely(page_mapcount(page) < 0)) {
@@ -689,7 +689,7 @@ static int try_to_unmap_one(struct page
dec_mm_counter(mm, file_rss);
- page_remove_rmap(page);
+ page_remove_rmap(page, mm);
page_cache_release(page);
out_unmap:
@@ -779,7 +779,7 @@ static void try_to_unmap_cluster(unsigne
if (pte_dirty(pteval))
set_page_dirty(page);
- page_remove_rmap(page);
+ page_remove_rmap(page, mm);
page_cache_release(page);
dec_mm_counter(mm, file_rss);
(*mapcount)--;
diff -puN include/linux/rmap.h~container-memctlr-task-migration include/linux/rmap.h
--- linux-2.6.19-rc2/include/linux/rmap.h~container-memctlr-task-migration 2006-11-09 21:56:49.000000000 +0530
+++ linux-2.6.19-rc2-balbir/include/linux/rmap.h 2006-11-09 21:56:49.000000000 +0530
@@ -73,7 +73,7 @@ void __anon_vma_link(struct vm_area_stru
void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_file_rmap(struct page *);
-void page_remove_rmap(struct page *);
+void page_remove_rmap(struct page *, struct mm_struct *);
/**
* page_dup_rmap - duplicate pte mapping to a page
@@ -82,10 +82,10 @@ void page_remove_rmap(struct page *);
* For copy_page_range only: minimal extract from page_add_rmap,
* avoiding unnecessary tests (already checked) so it's quicker.
*/
-static inline void page_dup_rmap(struct page *page)
+static inline void page_dup_rmap(struct page *page, struct mm_struct *mm)
{
atomic_inc(&page->_mapcount);
- memctlr_inc_rss(page);
+ memctlr_inc_rss_mm(page, mm);
}
/*
_
--
Balbir Singh,
Linux Technology Center,
IBM Software Labs
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]