[patch] oom lca -- fork bombing killer

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




							   March 28, 2005
							  Coywolf Qi Hunt
							[email protected]


			Linux OOM LCA (Least Common Ancestor) Patch

				An anti Fork Bombing Solution


Introduction

   People complain that linux is still vulnerable to the ancient fork bombing.
   A quick scratch <http://freeforge.net/~coywolf/oom-bomb/> simply running
   as normal user could make a linux box totally unusable.

   Another example is in the recent thread: oom with 2.6.11
   <http://lkml.org/lkml/2005/3/17/142>

   The only way currently to deal with fork bombing is to limit the number of
   running processes or the forking rate. This patch brings a new way to find
   out and kill the possible fork bombing processes or any misbehaving processes
   taking up too much memory.

   The patch applies to 2.6.12-rc1-mm3 and seems able to kill various fork 
   bombings. 
   

Simple Experiment Result

   It is tested on an 256M desktop box. With swapoff and in console, it works
   pretty well.  After the fork bomber has forked into about 1500 instances,

	RES = 1036k
	SHR = 876k
	1500*(1036k-876k)=234M

   it got killed completely.

   When the fork bomber is running as root, kernel first kills the X server and 
   XFce4 el al, immediately kernel kills the bomb totally too.

   I haven't tested with swapon yet. The hard disk would keep swapping badly 
   then.  
   
   When in X, the screen got corrupted, but the box can still reboot through
   ctrl+alt+del. 


The oom LCA algorithm details

   1. New elements add to struct task_struct :
      struct task_struct *bio_parent
      struct list_head bio_children;
      struct list_head bio_sibling;

      Adds struct task_struct *bio_parent to struct task_struct.
      Unlike struct task_struct *parent, user programs can not change it
      by daemon(3).  It is describing the facts except when it is a kernel
      thread. We don't plan to oom kill any kernel threads.

   2. If a process is malicious, we'll consider its children malicious too. 
      So we kill the choose process and all its descendants. This way we'll be
      far faster than the tumor forking. LCA always grows upwards.

   3. LCA(Least Common Ancestor Algorithm, I choose the term LCA for it is more 
      widely used than NCA(Nearest Common Ancestor) according to google.)
      We remember the last killed process' parent, next time we'll kill
      the remembered last_chosen_parent and the new worst process(the highest
      badness)'s least common ancestor. NOTE if find_lca() fails, we just kill 
      the worst process and we must _update_ the last_kill_process to a new 
      value.

   4. Calculates oom badness
      bio_children is considered instead of children.


Not quite related changes

   I also include some other obvious cleanups. They are:

      1. hides reparent_to_init(), reparent_to_init() should be called only by
         daemonize()
      2. removes obsolete code comments in task_struct
      3. removes an obsolete parameter from choose_new_parent() and cleanups
      4. in reparent_thread(), ptrace should be type long, not int.
      5. cleanups in mm/oom_kill.c


Signed-off-by: Coywolf Qi Hunt <[email protected]>
---

 arch/i386/mach-voyager/voyager_thread.c |    1 
 include/linux/init_task.h               |    3 
 include/linux/sched.h                   |   10 --
 kernel/exit.c                           |   34 ++++++-
 kernel/fork.c                           |    4 
 mm/oom_kill.c                           |  141 ++++++++++++++++++++------------
 6 files changed, 129 insertions(+), 64 deletions(-)

diff -pruN 2.6.12-rc1-mm3/arch/i386/mach-voyager/voyager_thread.c 2.6.12-rc1-mm3-cy/arch/i386/mach-voyager/voyager_thread.c
--- 2.6.12-rc1-mm3/arch/i386/mach-voyager/voyager_thread.c	2004-08-20 14:39:58.000000000 +0800
+++ 2.6.12-rc1-mm3-cy/arch/i386/mach-voyager/voyager_thread.c	2005-03-28 10:18:24.000000000 +0800
@@ -126,7 +126,6 @@ thread(void *unused)
 
 	kvoyagerd_running = 1;
 
-	reparent_to_init();
 	daemonize(THREAD_NAME);
 
 	set_timeout = 0;
diff -pruN 2.6.12-rc1-mm3/include/linux/init_task.h 2.6.12-rc1-mm3-cy/include/linux/init_task.h
--- 2.6.12-rc1-mm3/include/linux/init_task.h	2005-03-26 13:21:10.000000000 +0800
+++ 2.6.12-rc1-mm3-cy/include/linux/init_task.h	2005-03-28 10:18:24.000000000 +0800
@@ -90,6 +90,9 @@ extern struct group_info init_groups;
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
 	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.bio_parent	= &tsk,						\
+	.bio_children	= LIST_HEAD_INIT(tsk.bio_children),		\
+	.bio_sibling	= LIST_HEAD_INIT(tsk.bio_sibling),		\
 	.group_leader	= &tsk,						\
 	.group_info	= &init_groups,					\
 	.cap_effective	= CAP_INIT_EFF_SET,				\
diff -pruN 2.6.12-rc1-mm3/include/linux/sched.h 2.6.12-rc1-mm3-cy/include/linux/sched.h
--- 2.6.12-rc1-mm3/include/linux/sched.h	2005-03-26 13:21:11.000000000 +0800
+++ 2.6.12-rc1-mm3-cy/include/linux/sched.h	2005-03-28 10:18:24.000000000 +0800
@@ -656,11 +656,7 @@ struct task_struct {
 	unsigned did_exec:1;
 	pid_t pid;
 	pid_t tgid;
-	/* 
-	 * pointers to (original) parent process, youngest child, younger sibling,
-	 * older sibling, respectively.  (p->father can be replaced with 
-	 * p->parent->pid)
-	 */
+
 	struct task_struct *real_parent; /* real parent process (when being debugged) */
 	struct task_struct *parent;	/* parent process */
 	/*
@@ -669,6 +665,9 @@ struct task_struct {
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
+	struct task_struct *bio_parent;	
+	struct list_head bio_children;
+	struct list_head bio_sibling;
 	struct task_struct *group_leader;	/* threadgroup leader */
 
 	/* PID/PID hash table linkage. */
@@ -1068,7 +1067,6 @@ extern void exit_itimers(struct signal_s
 
 extern NORET_TYPE void do_group_exit(int);
 
-extern void reparent_to_init(void);
 extern void daemonize(const char *, ...);
 extern int allow_signal(int);
 extern int disallow_signal(int);
diff -pruN 2.6.12-rc1-mm3/kernel/exit.c 2.6.12-rc1-mm3-cy/kernel/exit.c
--- 2.6.12-rc1-mm3/kernel/exit.c	2005-03-26 13:21:12.000000000 +0800
+++ 2.6.12-rc1-mm3-cy/kernel/exit.c	2005-03-28 10:18:24.000000000 +0800
@@ -52,6 +52,7 @@ static void __unhash_process(struct task
 	}
 
 	REMOVE_LINKS(p);
+	list_del_init(&p->bio_sibling);
 }
 
 void release_task(struct task_struct * p)
@@ -221,8 +222,11 @@ static inline int has_stopped_jobs(int p
  * been inherited from a user process, so we reset them to sane values here.
  *
  * NOTE that reparent_to_init() gives the caller full capabilities.
+ *
+ * We change kernel threads' current->bio_*, thus find_lca()
+ * will not catch it.  -- coywolf
  */
-void reparent_to_init(void)
+static inline void reparent_to_init(void)
 {
 	write_lock_irq(&tasklist_lock);
 
@@ -232,6 +236,9 @@ void reparent_to_init(void)
 	current->parent = child_reaper;
 	current->real_parent = child_reaper;
 	SET_LINKS(current);
+	list_del_init(&current->bio_sibling);
+	current->bio_parent = child_reaper;
+	list_add_tail(&current->bio_sibling, &child_reaper->bio_children);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
@@ -511,16 +518,15 @@ void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+static inline void choose_new_parent(task_t *p, task_t *reaper)
 {
 	/*
 	 * Make sure we're not reparenting to ourselves and that
 	 * the parent is not a zombie.
 	 */
 	BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE);
+	BUG_ON(p->parent == reaper);
 	p->real_parent = reaper;
-	if (p->parent == p->real_parent)
-		BUG();
 }
 
 static inline void reparent_thread(task_t *p, task_t *father, int traced)
@@ -590,6 +596,7 @@ static inline void reparent_thread(task_
 static inline void forget_original_parent(struct task_struct * father,
 					  struct list_head *to_release)
 {
+	extern struct task_struct *last_chosen_parent;
 	struct task_struct *p, *reaper = father;
 	struct list_head *_p, *_n;
 
@@ -610,7 +617,7 @@ static inline void forget_original_paren
 	 * Search them and reparent children.
 	 */
 	list_for_each_safe(_p, _n, &father->children) {
-		int ptrace;
+		long ptrace;
 		p = list_entry(_p,struct task_struct,sibling);
 
 		ptrace = p->ptrace;
@@ -620,7 +627,7 @@ static inline void forget_original_paren
 
 		if (father == p->real_parent) {
 			/* reparent with a reaper, real father it's us */
-			choose_new_parent(p, reaper, child_reaper);
+			choose_new_parent(p, reaper);
 			reparent_thread(p, father, 0);
 		} else {
 			/* reparent ptraced task to its real parent */
@@ -641,9 +648,22 @@ static inline void forget_original_paren
 	}
 	list_for_each_safe(_p, _n, &father->ptrace_children) {
 		p = list_entry(_p,struct task_struct,ptrace_list);
-		choose_new_parent(p, reaper, child_reaper);
+		choose_new_parent(p, reaper);
 		reparent_thread(p, father, 1);
 	}
+
+	list_for_each_safe(_p, _n,  &father->bio_children) {
+		p = list_entry(_p, struct task_struct, bio_sibling);
+		list_del_init(&p->bio_sibling);
+		p->bio_parent = father->bio_parent;
+		list_add_tail(&p->bio_sibling, &(p->bio_parent)->bio_children);
+	}
+
+	if (unlikely(father == last_chosen_parent)) {
+		last_chosen_parent = father->bio_parent;
+		if (last_chosen_parent == &init_task)
+			last_chosen_parent = NULL;
+	}
 }
 
 /*
diff -pruN 2.6.12-rc1-mm3/kernel/fork.c 2.6.12-rc1-mm3-cy/kernel/fork.c
--- 2.6.12-rc1-mm3/kernel/fork.c	2005-03-26 13:21:12.000000000 +0800
+++ 2.6.12-rc1-mm3-cy/kernel/fork.c	2005-03-28 10:18:24.000000000 +0800
@@ -915,6 +915,8 @@ static task_t *copy_process(unsigned lon
 
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+	INIT_LIST_HEAD(&p->bio_children);
+	INIT_LIST_HEAD(&p->bio_sibling);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 	spin_lock_init(&p->proc_lock);
@@ -1044,6 +1046,7 @@ static task_t *copy_process(unsigned lon
 	else
 		p->real_parent = current;
 	p->parent = p->real_parent;
+	p->bio_parent = current;
 
 	if (clone_flags & CLONE_THREAD) {
 		spin_lock(&current->sighand->siglock);
@@ -1094,6 +1097,7 @@ static task_t *copy_process(unsigned lon
 	p->ioprio = current->ioprio;
 
 	SET_LINKS(p);
+	list_add_tail(&p->bio_sibling, &current->bio_children);
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
 
diff -pruN 2.6.12-rc1-mm3/mm/oom_kill.c 2.6.12-rc1-mm3-cy/mm/oom_kill.c
--- 2.6.12-rc1-mm3/mm/oom_kill.c	2005-03-03 17:12:18.000000000 +0800
+++ 2.6.12-rc1-mm3-cy/mm/oom_kill.c	2005-03-28 10:34:37.000000000 +0800
@@ -21,7 +21,9 @@
 #include <linux/timex.h>
 #include <linux/jiffies.h>
 
-/* #define DEBUG */
+#define DEBUG
+
+struct task_struct *last_chosen_parent = NULL;
 
 /**
  * oom_badness - calculate a numeric value for how bad this task has been
@@ -56,22 +58,22 @@ unsigned long badness(struct task_struct
 	points = p->mm->total_vm;
 
 	/*
-	 * Processes which fork a lot of child processes are likely
-	 * a good choice. We add the vmsize of the childs if they
+	 * Processes which have a lot of bio_children processes are likely
+	 * a good choice. We add the vmsize of the bio_children if they
 	 * have an own mm. This prevents forking servers to flood the
-	 * machine with an endless amount of childs
+	 * machine with an endless amount of childs.
 	 */
-	list_for_each(tsk, &p->children) {
+	list_for_each(tsk, &p->bio_children) {
 		struct task_struct *chld;
-		chld = list_entry(tsk, struct task_struct, sibling);
+		chld = list_entry(tsk, struct task_struct, bio_sibling);
 		if (chld->mm != p->mm && chld->mm)
 			points += chld->mm->total_vm;
 	}
 
 	/*
 	 * CPU time is in tens of seconds and run time is in thousands
-         * of seconds. There is no particular reason for this other than
-         * that it turned out to work very well in practice.
+	 * of seconds. There is no particular reason for this other than
+	 * that it turned out to work very well in practice.
 	 */
 	cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
 		>> (SHIFT_HZ + 3);
@@ -122,14 +124,34 @@ unsigned long badness(struct task_struct
 			points >>= -(p->oomkilladj);
 	}
 
-#ifdef DEBUG
-	printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
-	p->pid, p->comm, points);
-#endif
+	pr_debug("OOMkill: task %d (%s) got %d points\n",
+			p->pid, p->comm, points);
+
 	return points;
 }
 
 /*
+ * LCA: Least Common Ancestor
+ *
+ * Returns NULL if no LCA (LCA is init_task)
+ *
+ * I prefer to use task_t for function parameters, and
+ * struct task_struct elsewhere.  -- coywolf
+ */
+static struct task_struct * find_lca(task_t *p1, task_t *p2)
+{
+	struct task_struct *tsk1, *tsk2;
+
+	for (tsk1 = p1; tsk1 != &init_task; tsk1 = tsk1->bio_parent) {
+		for (tsk2 = p2; tsk2 != &init_task; tsk2 = tsk2->bio_parent)
+			if (tsk1 == tsk2)
+				return tsk1;
+	}
+
+	return NULL;
+}
+
+/*
  * Simple selection loop. We chose the process with the highest
  * number of 'points'. We expect the caller will lock the tasklist.
  *
@@ -138,7 +160,7 @@ unsigned long badness(struct task_struct
 static struct task_struct * select_bad_process(void)
 {
 	unsigned long maxpoints = 0;
-	struct task_struct *g, *p;
+	struct task_struct *g, *p, *lca;
 	struct task_struct *chosen = NULL;
 	struct timespec uptime;
 
@@ -165,9 +187,42 @@ static struct task_struct * select_bad_p
 			}
 		}
 	while_each_thread(g, p);
+
+	if (!chosen)
+		goto out;
+
+	if (!last_chosen_parent) {
+		last_chosen_parent = chosen->bio_parent;
+		goto out1;
+	}
+
+	lca = find_lca(last_chosen_parent, chosen);
+	if (lca) {
+		last_chosen_parent = lca->bio_parent;
+		chosen = lca;
+	} else {
+		last_chosen_parent = chosen->bio_parent;
+	}
+
+out1:
+	if (last_chosen_parent == &init_task)
+		last_chosen_parent = NULL;
+
+out:
 	return chosen;
 }
 
+static int is_ancestor(task_t *ancestor, task_t *p)
+{
+	struct task_struct *tsk;
+
+	for (tsk = p; tsk != &init_task; tsk = tsk->bio_parent)
+		if (tsk == ancestor)
+			return 1;
+
+	return 0;
+}
+
 /**
  * We must be careful though to never send SIGKILL a process with
  * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
@@ -184,7 +239,7 @@ static void __oom_kill_task(task_t *p)
 	task_lock(p);
 	if (!p->mm || p->mm == &init_mm) {
 		WARN_ON(1);
-		printk(KERN_WARNING "tried to kill an mm-less task!\n");
+		printk(KERN_WARNING "tried to kill mm-less task %d (%s)!\n", p->pid, p->comm);
 		task_unlock(p);
 		return;
 	}
@@ -202,47 +257,37 @@ static void __oom_kill_task(task_t *p)
 	force_sig(SIGKILL, p);
 }
 
-static struct mm_struct *oom_kill_task(task_t *p)
+static int oom_kill_task(task_t *p)
 {
 	struct mm_struct *mm = get_task_mm(p);
-	task_t * g, * q;
+	struct task_struct *q;
 
 	if (!mm)
-		return NULL;
+		return 0;
 	if (mm == &init_mm) {
 		mmput(mm);
-		return NULL;
+		return 0;
 	}
 
-	__oom_kill_task(p);
-	/*
-	 * kill all processes that share the ->mm (i.e. all threads),
-	 * but are in a different thread group
-	 */
-	do_each_thread(g, q)
-		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
-	while_each_thread(g, q);
-
-	return mm;
-}
-
-static struct mm_struct *oom_kill_process(struct task_struct *p)
-{
- 	struct mm_struct *mm;
-	struct task_struct *c;
-	struct list_head *tsk;
+	if (list_empty(&p->bio_children)) {
+		mmput(mm);
+		__oom_kill_task(p);
+		return 1;
+	}
 
-	/* Try to kill a child first */
-	list_for_each(tsk, &p->children) {
-		c = list_entry(tsk, struct task_struct, sibling);
-		if (c->mm == p->mm)
-			continue;
-		mm = oom_kill_task(c);
-		if (mm)
-			return mm;
+	/* Kill all descendants */
+	for_each_process(q) {
+		if (q->pid > 1) {
+			if (test_tsk_thread_flag(q, TIF_MEMDIE) ||
+				(q->flags & PF_EXITING) || (q->flags & PF_DEAD))
+				continue;
+			if (is_ancestor(p, q))
+				__oom_kill_task(q);
+		}
 	}
-	return oom_kill_task(p);
+	
+	mmput(mm);
+	return 1;
 }
 
 /**
@@ -255,7 +300,6 @@ static struct mm_struct *oom_kill_proces
  */
 void out_of_memory(int gfp_mask)
 {
-	struct mm_struct *mm = NULL;
 	task_t * p;
 
 	read_lock(&tasklist_lock);
@@ -274,14 +318,11 @@ retry:
 
 	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
 	show_free_areas();
-	mm = oom_kill_process(p);
-	if (!mm)
+	if (!oom_kill_task(p))
 		goto retry;
 
  out:
 	read_unlock(&tasklist_lock);
-	if (mm)
-		mmput(mm);
 
 	/*
 	 * Give "p" a good chance of killing itself before we

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux