Below is a patch against 2.6.17-rc4-mm3 that fixes a problem where the OOM
killer was unnecessarily killing system daemons in addition to memory-hogging
user processes. The patch ensures that the following assertion is satisfied:
If a failed attempt to allocate memory triggers the OOM killer, then the
failed attempt must have occurred _after_ any process previously shot by
the OOM killer has cleaned out its mm_struct.
Thus we avoid situations where concurrent invocations of the OOM killer cause
more processes to be shot than necessary to resolve the OOM condition.
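
To make the scheme easier to follow before wading into the diff, here is a
minimal userspace sketch of the handshake the patch implements. This is
purely illustrative, not kernel code: try_alloc(), pick_and_shoot_victim()
and victim_teardown_done() are hypothetical stand-ins for
get_page_from_freelist(), out_of_memory() and mmput(), and a pthread mutex
stands in for the patch's semaphore.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t oom_lock = PTHREAD_MUTEX_INITIALIZER;
static bool oom_kill_in_progress;	/* analogue of the patch's flag */

static bool try_alloc(void)
{
	return false;			/* pretend memory is exhausted */
}

static bool pick_and_shoot_victim(void)
{
	return true;			/* pretend a victim was shot */
}

/* Allocation slow path: at most one OOM kill may be outstanding. */
static bool oom_alloc(void)
{
	pthread_mutex_lock(&oom_lock);
	if (oom_kill_in_progress) {
		/* A previous victim has not finished dying; back off and
		 * retry the allocation instead of shooting another process.
		 */
		pthread_mutex_unlock(&oom_lock);
		return false;
	}
	if (try_alloc()) {
		/* Any previous victim's memory is back; no kill needed. */
		pthread_mutex_unlock(&oom_lock);
		return true;
	}
	oom_kill_in_progress = true;	/* this thread owns the OOM kill */
	pthread_mutex_unlock(&oom_lock);

	if (!pick_and_shoot_victim()) {
		/* Nothing was shot, so cancel the pending OOM kill. */
		pthread_mutex_lock(&oom_lock);
		oom_kill_in_progress = false;
		pthread_mutex_unlock(&oom_lock);
	}
	return false;
}

/* Runs in the victim's exit path once its memory is truly freed --
 * the role mmput() plays in the patch. */
static void victim_teardown_done(void)
{
	pthread_mutex_lock(&oom_lock);
	oom_kill_in_progress = false;
	pthread_mutex_unlock(&oom_lock);
}

int main(void)
{
	if (!oom_alloc())
		puts("allocation failed; at most one OOM kill is pending");
	victim_teardown_done();		/* next OOM kill may now proceed */
	return 0;
}

(In the patch itself the flag is a plain int cleared from mmput() without
taking the semaphore; the sketch takes the mutex there only to keep the
userspace example free of data races.)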
Signed-off-by: David S. Peterson <[email protected]>
---
Changes in this version of the patch:
- restore call to printk_ratelimit() in out_of_memory()
- edit comment above oom_alloc()
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/include/linux/sched.h linux-2.6.17-rc4-mm3-oom/include/linux/sched.h
--- linux-2.6.17-rc4-mm3/include/linux/sched.h 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/include/linux/sched.h 2006-05-22 08:46:17.000000000 -0700
@@ -297,6 +297,9 @@ typedef unsigned long mm_counter_t;
(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)
+/* Bit numbers for flags in mm_struct->flags */
+#define MM_FLAG_OOM_NOTIFY 0
+
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -355,6 +358,8 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+ unsigned long flags;
};
struct sighand_struct {
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/include/linux/swap.h linux-2.6.17-rc4-mm3-oom/include/linux/swap.h
--- linux-2.6.17-rc4-mm3/include/linux/swap.h 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/include/linux/swap.h 2006-05-22 08:46:17.000000000 -0700
@@ -155,7 +155,27 @@ struct swap_list_t {
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
/* linux/mm/oom_kill.c */
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
+extern int oom_kill_in_progress;
+
+/* Return 1 if an OOM kill is in progress, 0 otherwise. */
+static inline int oom_kill_active(void)
+{
+ return oom_kill_in_progress;
+}
+
+/* Start an OOM kill operation. */
+static inline void oom_kill_start(void)
+{
+ oom_kill_in_progress = 1;
+}
+
+/* Terminate an OOM kill operation. */
+static inline void oom_kill_finish(void)
+{
+ oom_kill_in_progress = 0;
+}
+
+extern int out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
/* linux/mm/memory.c */
extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/kernel/fork.c linux-2.6.17-rc4-mm3-oom/kernel/fork.c
--- linux-2.6.17-rc4-mm3/kernel/fork.c 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/kernel/fork.c 2006-05-22 08:46:17.000000000 -0700
@@ -329,6 +329,7 @@ static struct mm_struct * mm_init(struct
mm->ioctx_list = NULL;
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
+ mm->flags = 0;
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -382,6 +383,8 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
+ if (unlikely(test_bit(MM_FLAG_OOM_NOTIFY, &mm->flags)))
+ oom_kill_finish(); /* terminate pending OOM kill */
mmdrop(mm);
}
}
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/mm/oom_kill.c linux-2.6.17-rc4-mm3-oom/mm/oom_kill.c
--- linux-2.6.17-rc4-mm3/mm/oom_kill.c 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/mm/oom_kill.c 2006-05-22 16:02:13.000000000 -0700
@@ -25,6 +25,8 @@
int sysctl_panic_on_oom;
/* #define DEBUG */
+int oom_kill_in_progress;
+
/**
* badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
@@ -260,27 +262,31 @@ static int oom_kill_task(task_t *p, cons
struct mm_struct *mm;
task_t * g, * q;
+ task_lock(p);
mm = p->mm;
- /* WARNING: mm may not be dereferenced since we did not obtain its
- * value from get_task_mm(p). This is OK since all we need to do is
- * compare mm to q->mm below.
+ if (mm == NULL || mm == &init_mm) {
+ task_unlock(p);
+ return 1;
+ }
+
+ set_bit(MM_FLAG_OOM_NOTIFY, &mm->flags);
+ task_unlock(p);
+
+ /* WARNING: mm may no longer be dereferenced since we did not obtain
+ * its value from get_task_mm(p). This is OK since all we need to do
+ * is compare mm to q->mm below.
*
* Furthermore, even if mm contains a non-NULL value, p->mm may
- * change to NULL at any time since we do not hold task_lock(p).
+ * change to NULL at any time since we no longer hold task_lock(p).
* However, this is of no concern to us.
*/
- if (mm == NULL || mm == &init_mm)
- return 1;
-
- __oom_kill_task(p, message);
/*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * kill all processes that share the ->mm (i.e. all threads)
*/
do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
+ if (q->mm == mm)
__oom_kill_task(q, message);
while_each_thread(g, q);
@@ -313,11 +319,15 @@ static int oom_kill_process(struct task_
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
+ *
+ * Return 0 if we actually shot a process; otherwise return 1.
*/
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+int out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
task_t *p;
unsigned long points = 0;
+ const char *msg = NULL;
+ int ret = 1;
if (printk_ratelimit()) {
printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -335,19 +345,19 @@ void out_of_memory(struct zonelist *zone
*/
switch (constrained_alloc(zonelist, gfp_mask)) {
case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, points,
- "No available memory (MPOL_BIND)");
+ p = current;
+ msg = "No available memory (MPOL_BIND)";
break;
case CONSTRAINT_CPUSET:
- oom_kill_process(current, points,
- "No available memory in cpuset");
+ p = current;
+ msg = "No available memory in cpuset";
break;
case CONSTRAINT_NONE:
if (sysctl_panic_on_oom)
panic("out of memory. panic_on_oom is selected\n");
-retry:
+
/*
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
@@ -364,20 +374,17 @@ retry:
panic("Out of memory and no killable processes...\n");
}
- if (oom_kill_process(p, points, "Out of memory"))
- goto retry;
-
+ msg = "Out of memory";
break;
+
+ default:
+ BUG();
}
+ ret = oom_kill_process(p, points, msg);
+
out:
read_unlock(&tasklist_lock);
cpuset_unlock();
-
- /*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory unless "p" is current
- */
- if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
+ return ret;
}
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/mm/page_alloc.c linux-2.6.17-rc4-mm3-oom/mm/page_alloc.c
--- linux-2.6.17-rc4-mm3/mm/page_alloc.c 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/mm/page_alloc.c 2006-05-22 16:04:28.000000000 -0700
@@ -992,6 +992,62 @@ static inline void set_page_owner(struct
}
#endif /* CONFIG_PAGE_OWNER */
+/* If an OOM kill is not already in progress, try once more to allocate
+ * memory. If allocation fails this time, invoke the OOM killer.
+ */
+static struct page * oom_alloc(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist)
+{
+ static DECLARE_MUTEX(sem);
+ struct page *page;
+
+ down(&sem);
+
+ /* Prevent parallel OOM kill operations. This fixes a problem where
+ * the OOM killer was observed shooting system daemons in addition to
+ * memory-hogging user processes.
+ */
+ if (oom_kill_active()) {
+ up(&sem);
+ goto out_sleep;
+ }
+
+ /* If we get here, we _know_ that any previous OOM killer victim has
+ * cleaned out its mm_struct. Therefore we should pick a victim to
+ * shoot if this allocation fails.
+ */
+ page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
+ zonelist, ALLOC_WMARK_HIGH | ALLOC_CPUSET);
+
+ if (page) {
+ up(&sem);
+ return page;
+ }
+
+ oom_kill_start();
+ up(&sem);
+
+ /* Try to shoot a process. Call oom_kill_finish() only if the OOM
+ * killer did not shoot anything. If the OOM killer shot something,
+ * mmput() will call oom_kill_finish() once the mm_users count of the
+ * victim's mm_struct has reached 0 and the mm_struct has been cleaned
+ * out.
+ */
+ if (out_of_memory(zonelist, gfp_mask, order))
+ oom_kill_finish(); /* cancel OOM kill */
+
+out_sleep:
+ /* Did we get shot by the OOM killer? If not, sleep for a while to
+ * avoid burning lots of CPU cycles looping in the memory allocator.
+ * If the OOM killer shot a process, this gives the victim a good
+ * chance to die before we retry allocation.
+ */
+ if (!test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_uninterruptible(1);
+
+ return NULL;
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -1103,18 +1159,9 @@ rebalance:
if (page)
goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- /*
- * Go through the zonelist yet one more time, keep
- * very high watermark here, this is only to catch
- * a parallel oom killing, we must fail if we're still
- * under heavy pressure.
- */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+ page = oom_alloc(gfp_mask, order, zonelist);
if (page)
goto got_pg;
-
- out_of_memory(zonelist, gfp_mask, order);
goto restart;
}
-