* [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Dave Peterson @ 2006-04-27 20:08 UTC (permalink / raw)
To: linux-kernel; +Cc: linux-mm, riel, nickpiggin, ak, akpm
The patch below modifies the OOM killer so that only one OOM kill
operation can be in progress at a time. While running a test program
that eats lots of memory, I observed the OOM killer getting impatient
and shooting one or more system daemons in addition to the program
that was eating the memory. This patch fixes that behavior.
Signed-off-by: David S. Peterson <dsp@llnl.gov>
---
This is a repost of a previous patch. It applies to kernel
2.6.17-rc3.
Index: linux-2.6.17-rc3-oom/include/linux/sched.h
===================================================================
--- linux-2.6.17-rc3-oom.orig/include/linux/sched.h 2006-04-27 11:00:32.000000000 -0700
+++ linux-2.6.17-rc3-oom/include/linux/sched.h 2006-04-27 12:08:36.000000000 -0700
@@ -292,6 +292,9 @@ typedef unsigned long mm_counter_t;
(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)
+/* bit #s for flags in mm_struct->flags... */
+#define MM_FLAG_OOM_NOTIFY 0
+
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -350,6 +353,8 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+ unsigned long flags;
};
struct sighand_struct {
Index: linux-2.6.17-rc3-oom/include/linux/swap.h
===================================================================
--- linux-2.6.17-rc3-oom.orig/include/linux/swap.h 2006-04-27 11:00:32.000000000 -0700
+++ linux-2.6.17-rc3-oom/include/linux/swap.h 2006-04-27 12:08:36.000000000 -0700
@@ -147,6 +147,7 @@ struct swap_list_t {
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
/* linux/mm/oom_kill.c */
+extern spinlock_t oom_kill_lock;
extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
/* linux/mm/memory.c */
Index: linux-2.6.17-rc3-oom/kernel/fork.c
===================================================================
--- linux-2.6.17-rc3-oom.orig/kernel/fork.c 2006-04-27 11:00:32.000000000 -0700
+++ linux-2.6.17-rc3-oom/kernel/fork.c 2006-04-27 12:08:36.000000000 -0700
@@ -328,6 +328,7 @@ static struct mm_struct * mm_init(struct
mm->ioctx_list = NULL;
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
+ mm->flags = 0;
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -379,6 +380,15 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
+
+ if (unlikely(test_bit(MM_FLAG_OOM_NOTIFY, &mm->flags)))
+ /* Terminate a pending OOM kill operation. No tasks
+ * actually spin on the lock. Tasks only do
+ * spin_trylock() (and abort OOM kill operation if
+ * lock is already taken).
+ */
+ spin_unlock(&oom_kill_lock);
+
mmdrop(mm);
}
}
Index: linux-2.6.17-rc3-oom/mm/oom_kill.c
===================================================================
--- linux-2.6.17-rc3-oom.orig/mm/oom_kill.c 2006-04-27 11:00:32.000000000 -0700
+++ linux-2.6.17-rc3-oom/mm/oom_kill.c 2006-04-27 12:08:36.000000000 -0700
@@ -21,9 +21,13 @@
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
+#include <linux/spinlock.h>
+#include <linux/bitops.h>
/* #define DEBUG */
+spinlock_t oom_kill_lock = SPIN_LOCK_UNLOCKED;
+
/**
* oom_badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
@@ -259,27 +263,31 @@ static int oom_kill_task(task_t *p, cons
struct mm_struct *mm;
task_t * g, * q;
+ task_lock(p);
mm = p->mm;
- /* WARNING: mm may not be dereferenced since we did not obtain its
- * value from get_task_mm(p). This is OK since all we need to do is
- * compare mm to q->mm below.
+ if (mm == NULL || mm == &init_mm) {
+ task_unlock(p);
+ return 1;
+ }
+
+ set_bit(MM_FLAG_OOM_NOTIFY, &mm->flags);
+ task_unlock(p);
+
+ /* WARNING: mm may no longer be dereferenced since we did not obtain
+ * its value from get_task_mm(p). This is OK since all we need to do
+ * is compare mm to q->mm below.
*
* Furthermore, even if mm contains a non-NULL value, p->mm may
- * change to NULL at any time since we do not hold task_lock(p).
+ * change to NULL at any time since we no longer hold task_lock(p).
* However, this is of no concern to us.
*/
- if (mm == NULL || mm == &init_mm)
- return 1;
-
- __oom_kill_task(p, message);
/*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * kill all processes that share the ->mm (i.e. all threads)
*/
do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
+ if (q->mm == mm)
__oom_kill_task(q, message);
while_each_thread(g, q);
@@ -317,13 +325,27 @@ void out_of_memory(struct zonelist *zone
{
task_t *p;
unsigned long points = 0;
+ int cancel = 0;
- if (printk_ratelimit()) {
- printk("oom-killer: gfp_mask=0x%x, order=%d\n",
- gfp_mask, order);
- dump_stack();
- show_mem();
- }
+ /* Return immediately if an OOM kill is already in progress. We want
+ * to avoid the following behavior:
+ *
+ * 1. Two processes (A and B) race for oom_kill_lock. Let's say
+ * A wins and B is delayed.
+ *
+ * 2. Process A shoots some process and releases oom_kill_lock.
+ *
+ * 3. Process B now acquires oom_kill_lock and shoots another
+ * process. However this isn't really what we want to do if
+ * the OOM kill done by A above freed enough memory to resolve
+ * the OOM condition.
+ */
+ if (!spin_trylock(&oom_kill_lock))
+ return;
+
+ printk("oom-killer: gfp_mask=0x%x, order=%d\n", gfp_mask, order);
+ dump_stack();
+ show_mem();
cpuset_lock();
read_lock(&tasklist_lock);
@@ -334,12 +356,12 @@ void out_of_memory(struct zonelist *zone
*/
switch (constrained_alloc(zonelist, gfp_mask)) {
case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, points,
+ cancel = oom_kill_process(current, points,
"No available memory (MPOL_BIND)");
break;
case CONSTRAINT_CPUSET:
- oom_kill_process(current, points,
+ cancel = oom_kill_process(current, points,
"No available memory in cpuset");
break;
@@ -351,8 +373,10 @@ retry:
*/
p = select_bad_process(&points);
- if (PTR_ERR(p) == -1UL)
+ if (PTR_ERR(p) == -1UL) {
+ cancel = 1;
goto out;
+ }
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
@@ -371,6 +395,9 @@ out:
read_unlock(&tasklist_lock);
cpuset_unlock();
+ if (cancel)
+ spin_unlock(&oom_kill_lock); /* cancel OOM kill operation */
+
/*
* Give "p" a good chance of killing itself before we
* retry to allocate memory unless "p" is current
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Paul Jackson @ 2006-04-27 20:44 UTC (permalink / raw)
To: Dave Peterson; +Cc: linux-kernel, linux-mm, riel, nickpiggin, ak, akpm
Dave wrote:
> @@ -350,6 +353,8 @@ struct mm_struct {
> /* aio bits */
> rwlock_t ioctx_list_lock;
> struct kioctx *ioctx_list;
> +
> + unsigned long flags;
I see Andi didn't reply to your question concerning which
struct he saw a 'flags' field in.
Adding a flags word still costs a slot in mm_struct.
Adding an 'oom_notify' bitfield after the existing 'dumpable'
bitfield in mm_struct would save that slot:
unsigned dumpable:2;
unsigned oom_notify:1;
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Andrew Morton @ 2006-04-27 21:09 UTC (permalink / raw)
To: Paul Jackson; +Cc: dsp, linux-kernel, linux-mm, riel, nickpiggin, ak
Paul Jackson <pj@sgi.com> wrote:
>
> Adding an 'oom_notify' bitfield after the existing 'dumpable'
> bitfield in mm_struct would save that slot:
>
> unsigned dumpable:2;
> unsigned oom_notify:1;
Note that these will occupy the same machine word. So they'll need
locking. (Good luck trying to demonstrate the race though!)
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Dave Peterson @ 2006-04-27 21:32 UTC (permalink / raw)
To: Andrew Morton, Paul Jackson; +Cc: linux-kernel, linux-mm, riel, nickpiggin, ak
On Thursday 27 April 2006 14:09, Andrew Morton wrote:
> Paul Jackson <pj@sgi.com> wrote:
> > Adding an 'oom_notify' bitfield after the existing 'dumpable'
> > bitfield in mm_struct would save that slot:
> >
> > unsigned dumpable:2;
> > unsigned oom_notify:1;
>
> Note that these will occupy the same machine word. So they'll need
> locking. (Good luck trying to demonstrate the race though!)
Taking a quick look at the code, I think a race could happen like this:
Task A is executing a system call handler such as sys_setgid(),
sys_setuid(), etc. that assigns a value to the 'dumpable' field (i.e.
current->mm->dumpable = suid_dumpable; ). At the same time the OOM
killer decides to shoot task A. The following sequence of events could
take place:
1. Task A reads the machine word containing 'dumpable' into a
register and modifies the value in the register so that
'dumpable' is set to the desired value.
2. Task B, which is executing the OOM killer code, has decided to
shoot task A. B reads the machine word containing 'dumpable'
into a register, sets the bit corresponding to 'oom_notify',
and writes the machine word back to memory.
3. Task A writes its machine word back to memory, which wipes out
the setting of the 'oom_notify' bit by task B. Now the OOM
kill operation will never terminate, and all future OOM kill
operations will be prevented.
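To make the interleaving concrete, here's a minimal sketch (the struct
is hypothetical; the point is just that both fields live in one
machine word):

	struct mm_like {
		unsigned dumpable:2;	/* bits 0-1 */
		unsigned oom_notify:1;	/* bit 2, same word as dumpable */
	};

	/* "mm.dumpable = value" compiles to load-word, merge, store-word,
	 * and "mm.oom_notify = 1" is the same non-atomic sequence on the
	 * same word:
	 *
	 *   A: tmp = word;                    (load)
	 *   B: word |= 1 << 2;                (read-modify-write)
	 *   A: word = (tmp & ~3) | value;     (store wipes out B's bit)
	 */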
I guess it would be kind of nice if we had some sort of primitive for
atomically modifying multiple bits within a machine word.
Oh well :-/
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Andrew Morton @ 2006-04-27 22:56 UTC (permalink / raw)
To: Dave Peterson; +Cc: linux-kernel, linux-mm, riel, nickpiggin, ak
Dave Peterson <dsp@llnl.gov> wrote:
>
> The patch below modifies the OOM killer so that only one OOM kill
> operation can be in progress at a time. While running a test program
> that eats lots of memory, I observed the OOM killer getting impatient
> and shooting one or more system daemons in addition to the program
> that was eating the memory. This patch fixes that behavior.
>
> ...
>
> @@ -379,6 +380,15 @@ void mmput(struct mm_struct *mm)
> spin_unlock(&mmlist_lock);
> }
> put_swap_token(mm);
> +
> + if (unlikely(test_bit(MM_FLAG_OOM_NOTIFY, &mm->flags)))
> + /* Terminate a pending OOM kill operation. No tasks
> + * actually spin on the lock. Tasks only do
> + * spin_trylock() (and abort OOM kill operation if
> + * lock is already taken).
> + */
> + spin_unlock(&oom_kill_lock);
> +
Gad. I guess if we're going to do this then a better implementation would
be to use test_and_set_bit(some_unsigned_long). And perhaps call some
oom_kill.c interface function here rather than directly accessing
oom-killer data structures (could be an inlined function).
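Something like this, say (rough sketch, untested; the names follow
Dave's earlier posting):

	/* mm/oom_kill.c */
	static unsigned long oom_kill_in_progress;

	/* Return 0 if we now own the OOM kill operation, nonzero if
	 * one is already in progress.
	 */
	static inline int oom_kill_start(void)
	{
		return test_and_set_bit(0, &oom_kill_in_progress);
	}

	/* Entry point for mmput() so it doesn't poke oom-killer data
	 * structures directly.
	 */
	void oom_kill_finish(void)
	{
		clear_bit(0, &oom_kill_in_progress);
	}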
But the broader question is "what do we want to do here".
If we've picked a task and we've signalled it then the right thing to do
would appear to be just to block all tasks as they enter the oom-killer.
Send them to sleep until the killed task actually exits. But
a) memory can become free (or reclaimable) for other reasons, so those
now-sleeping tasks shouldn't be sleeping any more (this is a minor
problem).
b) one of the sleeping tasks may be holding a lock which prevents the
killed task from reaching do_exit(). This is a showstopper.
But I think b) stops your show as well:
- task A enters the oom-killer, decides to kill task Z.
- task A holds a lock which is preventing task Z from exiting
- but oom_kill_lock is now held. Task A just keeps on trying to reclaim
memory and trying (and failing) to kill tasks.
- user hits reset button.
IOW: we just have to keep killing more tasks until something happens.
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Paul Jackson @ 2006-04-27 23:02 UTC (permalink / raw)
To: Andrew Morton; +Cc: dsp, linux-kernel, linux-mm, riel, nickpiggin, ak
Andrew wrote:
> Note that these will occupy the same machine word.
That's why I did it. Yup.
> So they'll need
> locking. (Good luck trying to demonstrate the race though!)
Oops. Good catch. Thanks, Andrew.
Probably solvable (lockable) but this line of thought is
getting to be more trouble than I suspect it's worth.
I'm still a little surprised that this per-mm 'oom_notify' bit
was needed to implement what I thought was a single, global
system wide oom killer serializer.
But I'm too ignorant and lazy, and too distracted by other
tasks, to actually think that surprise through.
Good luck with it, Dave.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Dave Peterson @ 2006-04-28 21:59 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, linux-mm, riel, nickpiggin, ak, pj
> Gad. I guess if we're going to do this then a better implementation would
> be to use test_and_set_bit(some_unsigned_long). And perhaps call some
> oom_kill.c interface function here rather than directly accessing
> oom-killer data structures (could be an inlined function).
This is the way I originally had things
(see http://lkml.org/lkml/2006/4/25/327). If you prefer, I can change
it back to the way it was.
> But the broader question is "what do we want to do here".
>
> If we've picked a task and we've signalled it then the right thing to do
> would appear to be just to block all tasks as they enter the oom-killer.
> Send them to sleep until the killed task actually exits. But
>
> a) memory can become free (or reclaimable) for other reasons, so those
> now-sleeping tasks shouldn't be sleeping any more (this is a minor
> problem).
This is one reason to have out_of_memory() return to its caller if an
OOM kill is in progress. Since out_of_memory() is called from
__alloc_pages(), this causes the task to retry the memory allocation
(assuming it's a type of allocation where retrying is allowed).
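For reference, the relevant path in __alloc_pages() looks roughly
like this (condensed from 2.6.17-rc; not verbatim):

	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
		out_of_memory(zonelist, gfp_mask, order);
		goto restart;	/* retry the allocation */
	}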
> b) one of the sleeping tasks may be holding a lock which prevents the
> killed task from reaching do_exit(). This is a showstopper.
Yes I am familiar with this sort of problem. :-) The same holds true
if you replace "one of the sleeping tasks" above with "tasks stuck
inside __alloc_pages()". Or if there is a task stuck inside
__alloc_pages() that holds a mm_users reference to the mm_struct used
by the OOM killer's chosen victim. I've actually seen this type of
behavior occur in practice. In fact, I can reproduce it quite easily
with the modified 2.6.9-based Red Hat kernel that I've mostly been
working in: turn off swap and run several instances of my "memory hog"
program concurrently, and the result is a deadlock more often than not.
I've been tinkering with the OOM killer lately because we're moving
toward diskless clusters (i.e. no swap) where I work, and we want the
machines to be able to recover gracefully if users get too aggressive
with their memory allocations. So far I've made the following OOM
killer changes to our RHEL4 2.6.9-based kernel:
- Backported the latest OOM killer code from 2.6.16 to our kernel
(I see much has changed in the OOM killer since 2.6.9). This
gives us the /proc/<pid>/oom_score and /proc/<pid>/oom_adj
functionality and other OOM killer improvements present in more
recent kernels.
- Made some of my own changes on top of the backported code.
Lately I've been taking my code changes, cleaning them up a bit, making
them into patches that apply cleanly to the latest 2.6.17-rcX kernel,
and posting them to LKML. However, I haven't posted everything. The
code I haven't posted is aimed specifically at addressing problem b)
you mention above. My basic approach is as follows:
- Change the OOM killer so we avoid looping in there. It's better
to return to the caller of out_of_memory() than to get stuck
inside the OOM killer (since we may hold a semaphore or something
that prevents the victim we shot from exiting and freeing its
memory).
- Change __alloc_pages() so that if an OOM kill is in progress,
_all_ tasks looping inside __alloc_pages() are allowed to
disregard all watermarks - thus satisfying their requests so they
avoid getting stuck in __alloc_pages() (I know this sounds
dangerous; see below).
- Put a little piece of code at the start of the page fault handler
(and also near the start of sys_mlock()) that will immediately put
the process to sleep if an OOM kill is in progress, we just entered
the kernel from userspace, and we have not been chosen as an OOM
killer victim. Thus the victim gets all the memory it needs to run
itself out of the system and expire. Other tasks that continue to
need memory will go to sleep at a place where they just entered the
kernel from userspace and therefore don't yet hold any semaphores
that may prevent the victim from exiting.
I have had excellent results with the above approach in our Red Hat
kernel. The deadlocks completely disappear, and I can run 16 or 32
instances of my "memory hog" program (I think I even did 64 instances
once) without problems. The OOM killer shoots the memory hog processes
one by one, avoids shooting any other processes (even without any
user-provided hints from /proc/<pid>/oom_adj), and the system
recovers fully.
After the above success I started making my code changes into
2.6.17-rcX patches and testing them in that kernel. Interestingly I
found that the 2.6.17-rcX OOM killer appears to work well in practice
with just the stuff I've posted so far (excluding the stuff described
above). Therefore I've held off (at least for now) posting a patch that
does the stuff above. I made such a patch for 2.6.17-rcX just for
myself to tinker with. Surprisingly I see system hangs with the patch.
I haven't yet had time to debug this. It's possible I made some obvious
blunder when porting the code from our Red Hat kernel. The patch appears
below (just to better illustrate what I am doing). It is definitely
_not_ ready for actual use.
Index: git5-oom/arch/i386/mm/fault.c
===================================================================
--- git5-oom.orig/arch/i386/mm/fault.c 2006-04-25 13:25:26.000000000 -0700
+++ git5-oom/arch/i386/mm/fault.c 2006-04-25 14:06:47.000000000 -0700
@@ -22,6 +22,7 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/kprobes.h>
+#include <linux/swap.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -337,9 +338,13 @@ fastcall void __kprobes do_page_fault(st
/* It's safe to allow irq's after cr2 has been saved and the vmalloc
fault has been handled. */
- if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) {
local_irq_enable();
+ if (oom_kill_active() && user_mode(regs))
+ oom_kill_wait();
+ }
+
mm = tsk->mm;
/*
Index: git5-oom/arch/x86_64/mm/fault.c
===================================================================
--- git5-oom.orig/arch/x86_64/mm/fault.c 2006-04-25 13:25:26.000000000 -0700
+++ git5-oom/arch/x86_64/mm/fault.c 2006-04-25 14:07:10.000000000 -0700
@@ -24,6 +24,7 @@
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/kprobes.h>
+#include <linux/swap.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -362,9 +363,13 @@ asmlinkage void __kprobes do_page_fault(
SIGSEGV) == NOTIFY_STOP)
return;
- if (likely(regs->eflags & X86_EFLAGS_IF))
+ if (likely(regs->eflags & X86_EFLAGS_IF)) {
local_irq_enable();
+ if (oom_kill_active() && user_mode(regs))
+ oom_kill_wait();
+ }
+
if (unlikely(page_fault_trace))
printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
Index: git5-oom/include/linux/swap.h
===================================================================
--- git5-oom.orig/include/linux/swap.h 2006-04-25 13:43:02.000000000 -0700
+++ git5-oom/include/linux/swap.h 2006-04-25 14:02:05.000000000 -0700
@@ -147,6 +147,17 @@ struct swap_list_t {
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
/* linux/mm/oom_kill.c */
+extern volatile unsigned long oom_kill_in_progress;
+
+static inline int oom_kill_active(void)
+{
+ if (unlikely(oom_kill_in_progress))
+ return 1;
+
+ return 0;
+}
+
+extern void oom_kill_wait(void);
extern void oom_kill_finish(void);
extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
Index: git5-oom/mm/mlock.c
===================================================================
--- git5-oom.orig/mm/mlock.c 2006-04-25 13:25:26.000000000 -0700
+++ git5-oom/mm/mlock.c 2006-04-25 14:02:05.000000000 -0700
@@ -10,7 +10,7 @@
#include <linux/mm.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
-
+#include <linux/swap.h>
static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, unsigned int newflags)
@@ -130,6 +130,14 @@ asmlinkage long sys_mlock(unsigned long
if (!can_do_mlock())
return -EPERM;
+ /* If an OOM kill operation starts just after we pass this point, we
+ * will continue eating memory anyway. In this case, hopefully we are
+ * not about to mlock() a huge chunk of pages. At least for now we
+ * will ignore this potential problem.
+ */
+ if (oom_kill_active())
+ oom_kill_wait();
+
down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
Index: git5-oom/mm/oom_kill.c
===================================================================
--- git5-oom.orig/mm/oom_kill.c 2006-04-25 14:01:13.000000000 -0700
+++ git5-oom/mm/oom_kill.c 2006-04-25 14:02:05.000000000 -0700
@@ -21,12 +21,15 @@
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
+#include <linux/wait.h>
#include <asm/bitops.h>
/* #define DEBUG */
volatile unsigned long oom_kill_in_progress = 0;
+static DECLARE_WAIT_QUEUE_HEAD(oom_wait);
+
/*
* Attempt to start an OOM kill operation. Return 0 on success, or 1 if an
* OOM kill is already in progress.
@@ -37,6 +40,44 @@ static inline int oom_kill_start(void)
}
/*
+ * A task calls this function in one of two cases:
+ *
+ * - It just page faulted from userspace and noticed that the
+ * oom_kill_in_progress flag was set.
+ *
+ * - It just started an mlock() system call and noticed that the
+ * oom_kill_in_progress flag was set.
+ *
+ * Once an OOM kill starts, tasks that continue to demand memory while
+ * executing in userspace and are _not_ OOM killer victims will sleep here.
+ * This accomplishes the following:
+ *
+ * - Once asleep, they no longer eat memory. Therefore they do not
+ * compete with the victim for memory.
+ *
+ * - Non-victim tasks that would have otherwise gone to sleep inside the
+ * memory allocator sleep here instead. This is desirable because here
+ * we just entered the kernel from userspace. Therefore we don't hold
+ * resources such as semaphores that may prevent the victim from exiting
+ * and freeing its memory.
+ */
+void oom_kill_wait(void)
+{
+ /* Each time a zombie is cleaned up, a little bit of memory is freed.
+ * Therefore it's probably best to let init continue running.
+ */
+ if (current->pid == 1)
+ return;
+
+ /* Sleep until OOM kill operation has finished or we have been shot by
+ * OOM killer.
+ */
+ wait_event(oom_wait,
+ (!oom_kill_in_progress ||
+ test_tsk_thread_flag(current, TIF_MEMDIE)));
+}
+
+/*
* Terminate an OOM kill operation.
*
* When the OOM killer chooses a victim, it sets the oom_notify flag of the
@@ -47,6 +88,7 @@ static inline int oom_kill_start(void)
void oom_kill_finish(void)
{
clear_bit(0, &oom_kill_in_progress);
+ wake_up(&oom_wait);
}
/**
@@ -312,6 +354,11 @@ static int oom_kill_task(task_t *p, cons
__oom_kill_task(q, message);
while_each_thread(g, q);
+ /* One or more tasks we just shot may be sleeping on oom_wait. Wake
+ * up any such tasks so they may exit.
+ */
+ wake_up(&oom_wait);
+
return 0;
}
@@ -402,11 +449,4 @@ out:
if (cancel)
oom_kill_finish(); /* cancel OOM kill operation */
-
- /*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory unless "p" is current
- */
- if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
}
Index: git5-oom/mm/page_alloc.c
===================================================================
--- git5-oom.orig/mm/page_alloc.c 2006-04-25 13:25:26.000000000 -0700
+++ git5-oom/mm/page_alloc.c 2006-04-25 14:02:05.000000000 -0700
@@ -985,7 +985,7 @@ restart:
/* This allocation should allow future memory freeing. */
- if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+ if (((p->flags & PF_MEMALLOC) || oom_kill_active())
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
@@ -1022,7 +1022,7 @@ rebalance:
cond_resched();
- if (likely(did_some_progress)) {
+ if (likely(did_some_progress) || oom_kill_active()) {
page = get_page_from_freelist(gfp_mask, order,
zonelist, alloc_flags);
if (page)
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Dave Peterson @ 2006-04-28 22:09 UTC (permalink / raw)
To: Paul Jackson, Andrew Morton; +Cc: linux-kernel, linux-mm, riel, nickpiggin, ak
On Thursday 27 April 2006 16:02, Paul Jackson wrote:
> I'm still a little surprised that this per-mm 'oom_notify' bit
> was needed to implement what I thought was a single, global
> system wide oom killer serializer.
I think the title "mm: serialize OOM kill operations" was probably a
poor choice of words. It sounds like all I want to do is make sure
tasks enter the OOM killer one at a time. My goal is actually to
prevent further OOM kill operations until the OOM kill in progress
has caused the victim task to free its address space (i.e. clean out
its mm_struct). That way we don't shoot more processes than necessary
to resolve the OOM condition.
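Condensed from the patch, the intended lifecycle looks like this
(sketch only; details as in the posted code):

	out_of_memory()
		spin_trylock(&oom_kill_lock);	/* start OOM kill */
		oom_kill_task(victim)
			set_bit(MM_FLAG_OOM_NOTIFY, &mm->flags);
	...
	mmput()		/* victim drops its last mm_users reference */
		if (test_bit(MM_FLAG_OOM_NOTIFY, &mm->flags))
			spin_unlock(&oom_kill_lock);	/* allow next kill */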
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Andrew Morton @ 2006-04-28 22:16 UTC (permalink / raw)
To: Dave Peterson; +Cc: linux-kernel, linux-mm, riel, nickpiggin, ak, pj
Dave Peterson <dsp@llnl.gov> wrote:
>
> Yes I am familiar with this sort of problem. :-)
Andrea has long advocated that the memory allocator shouldn't
infinitely loop for small __GFP_WAIT allocations, i.e. ultimately we
should return NULL to the caller.
Usually this will cause the correct process to exit. Sometimes it won't.
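Roughly (illustrative only, not an actual patch -- 'tries' and
MAX_TRIES are made up):

	/* In __alloc_pages(): give up after a bounded number of
	 * reclaim/OOM retries instead of looping forever on a small
	 * __GFP_WAIT allocation.
	 */
	if ((gfp_mask & __GFP_WAIT) && order <= 3 && ++tries > MAX_TRIES)
		goto nopage;	/* caller gets NULL */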
Did you try it?
* Re: [PATCH 1/2 (repost)] mm: serialize OOM kill operations
From: Dave Peterson @ 2006-04-28 22:24 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, linux-mm, riel, nickpiggin, ak, pj
On Friday 28 April 2006 15:16, Andrew Morton wrote:
> Dave Peterson <dsp@llnl.gov> wrote:
> > Yes I am familiar with this sort of problem. :-)
>
> Andrea has long advocated that the memory allocator shouldn't infinitely
> loop for small __GFP_WAIT allocations. ie: ultimately we should return
> NULL back to the caller.
>
> Usually this will cause the correct process to exit. Sometimes it won't.
>
> Did you try it?
Haven't tried it yet. Sounds like a good idea.