* [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks.
@ 2016-02-19 14:33 Tetsuo Handa
2016-02-19 15:10 ` Michal Hocko
0 siblings, 1 reply; 8+ messages in thread
From: Tetsuo Handa @ 2016-02-19 14:33 UTC (permalink / raw)
To: rientjes, hannes, mhocko, vdavydov; +Cc: linux-mm, Tetsuo Handa
Currently, oom_unkillable_task() is called for twice for each thread,
once at oom_scan_process_thread() and again at oom_badness().
The reason oom_scan_process_thread() needs to call oom_unkillable_task()
is to skip TIF_MEMDIE test and oom_task_origin() test if that thread is
not OOM-killable.
But there is a problem with this ordering, for oom_task_origin() == true
will unconditionally select that thread regardless of oom_score_adj.
When we merge the OOM reaper, the OOM reaper will mark already reaped
process as OOM-unkillable by updating oom_score_adj. In order to avoid
falling into infinite loop, oom_score_adj needs to be checked before
doing oom_task_origin() test.
This patch merges oom_scan_process_thread() into oom_badness() in order
to check oom_score_adj before oom_task_origin() and in order to avoid
duplicated oom_unkillable_task(), by passing a flag for telling whether
to do TIF_MEMDIE test and oom_task_origin() test.
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
fs/proc/base.c | 2 +-
include/linux/oom.h | 16 +++-------
mm/memcontrol.c | 22 +++-----------
mm/oom_kill.c | 88 ++++++++++++++++++++++++-----------------------------
4 files changed, 50 insertions(+), 78 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d7c51ca..3020aa2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -581,7 +581,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
read_lock(&tasklist_lock);
if (pid_alive(task))
- points = oom_badness(task, NULL, NULL, totalpages) *
+ points = oom_badness(task, NULL, NULL, totalpages, false) *
1000 / totalpages;
read_unlock(&tasklist_lock);
seq_printf(m, "%lu\n", points);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 45993b8..b31467e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -43,13 +43,6 @@ enum oom_constraint {
CONSTRAINT_MEMCG,
};
-enum oom_scan_t {
- OOM_SCAN_OK, /* scan thread and find its badness */
- OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */
- OOM_SCAN_ABORT, /* abort the iteration and return */
- OOM_SCAN_SELECT, /* always select this thread first */
-};
-
/* Thread is the potential origin of an oom condition; kill first on oom */
#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1)
@@ -73,8 +66,10 @@ static inline bool oom_task_origin(const struct task_struct *p)
extern void mark_oom_victim(struct task_struct *tsk);
extern unsigned long oom_badness(struct task_struct *p,
- struct mem_cgroup *memcg, const nodemask_t *nodemask,
- unsigned long totalpages);
+ struct mem_cgroup *memcg,
+ struct oom_control *oc,
+ unsigned long totalpages,
+ bool check_exceptions);
extern int oom_kills_count(void);
extern void note_oom_kill(void);
@@ -86,9 +81,6 @@ extern void check_panic_on_oom(struct oom_control *oc,
enum oom_constraint constraint,
struct mem_cgroup *memcg);
-extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
- struct task_struct *task, unsigned long totalpages);
-
extern bool out_of_memory(struct oom_control *oc);
extern void exit_oom_victim(struct task_struct *tsk);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3c96dd3..4cc210c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1248,7 +1248,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
- unsigned int points = 0;
+ unsigned long points = 0;
struct task_struct *chosen = NULL;
mutex_lock(&oom_lock);
@@ -1271,28 +1271,16 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(&oc, task, totalpages)) {
- case OOM_SCAN_SELECT:
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = ULONG_MAX;
- get_task_struct(chosen);
- /* fall through */
- case OOM_SCAN_CONTINUE:
+ points = oom_badness(task, NULL, &oc, totalpages, true);
+ if (!points || points < chosen_points)
continue;
- case OOM_SCAN_ABORT:
+ if (points == ULONG_MAX) {
css_task_iter_end(&it);
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
goto unlock;
- case OOM_SCAN_OK:
- break;
- };
- points = oom_badness(task, NULL, NULL, totalpages);
- if (!points || points < chosen_points)
- continue;
+ }
/* Prefer thread group leaders for display purposes */
if (points == chosen_points &&
thread_group_leader(chosen))
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 28d6a32..f426ce8 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -155,21 +155,40 @@ static bool oom_unkillable_task(struct task_struct *p,
/**
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
+ * @memcg: memory cgroup. NULL unless memcg OOM case.
+ * @oc: oom_control struct. NULL if /proc/pid/oom_score_adj case.
* @totalpages: total present RAM allowed for page allocation
+ * @check_exceptions: whether to check for TIF_MEMDIE and oom_task_origin().
+ *
+ * Returns ULONG_MAX if @p is marked as OOM-victim.
+ * Returns ULONG_MAX - 1 if @p is marked as oom_task_origin().
+ * Returns 0 if @p is marked as OOM-unkillable.
+ * Returns integer between 1 and ULONG_MAX - 2 otherwise.
*
* The heuristic for determining which task to kill is made to be as simple and
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
- const nodemask_t *nodemask, unsigned long totalpages)
+ struct oom_control *oc, unsigned long totalpages,
+ bool check_exceptions)
{
long points;
long adj;
- if (oom_unkillable_task(p, memcg, nodemask))
+ if (oom_unkillable_task(p, memcg, oc ? oc->nodemask : NULL))
return 0;
+ /*
+ * This task already has access to memory reserves and is being killed.
+ * Don't allow any other task to have access to the reserves.
+ */
+ if (check_exceptions)
+ if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
+ if (!is_sysrq_oom(oc))
+ return ULONG_MAX;
+ }
+
p = find_lock_task_mm(p);
if (!p)
return 0;
@@ -181,6 +200,16 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
}
/*
+ * If task is allocating a lot of memory and has been marked to be
+ * killed first if it triggers an oom, then select it.
+ */
+ if (check_exceptions)
+ if (oom_task_origin(p)) {
+ task_unlock(p);
+ return ULONG_MAX - 1;
+ }
+
+ /*
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
@@ -268,33 +297,6 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
}
#endif
-enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
- struct task_struct *task, unsigned long totalpages)
-{
- if (oom_unkillable_task(task, NULL, oc->nodemask))
- return OOM_SCAN_CONTINUE;
-
- /*
- * This task already has access to memory reserves and is being killed.
- * Don't allow any other task to have access to the reserves.
- */
- if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!is_sysrq_oom(oc))
- return OOM_SCAN_ABORT;
- }
- if (!task->mm)
- return OOM_SCAN_CONTINUE;
-
- /*
- * If task is allocating a lot of memory and has been marked to be
- * killed first if it triggers an oom, then select it.
- */
- if (oom_task_origin(task))
- return OOM_SCAN_SELECT;
-
- return OOM_SCAN_OK;
-}
-
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. Returns -1 on scan abort.
@@ -308,24 +310,14 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
rcu_read_lock();
for_each_process_thread(g, p) {
- unsigned int points;
-
- switch (oom_scan_process_thread(oc, p, totalpages)) {
- case OOM_SCAN_SELECT:
- chosen = p;
- chosen_points = ULONG_MAX;
- /* fall through */
- case OOM_SCAN_CONTINUE:
+ const unsigned long points = oom_badness(p, NULL, oc,
+ totalpages, true);
+ if (!points || points < chosen_points)
continue;
- case OOM_SCAN_ABORT:
+ if (points == ULONG_MAX) {
rcu_read_unlock();
return (struct task_struct *)(-1UL);
- case OOM_SCAN_OK:
- break;
- };
- points = oom_badness(p, NULL, oc->nodemask, totalpages);
- if (!points || points < chosen_points)
- continue;
+ }
/* Prefer thread group leaders for display purposes */
if (points == chosen_points && thread_group_leader(chosen))
continue;
@@ -676,7 +668,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
struct task_struct *child;
struct task_struct *t;
struct mm_struct *mm;
- unsigned int victim_points = 0;
+ unsigned long victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
bool can_oom_reap = true;
@@ -709,15 +701,15 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
read_lock(&tasklist_lock);
for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
+ unsigned long child_points;
if (process_shares_mm(child, p->mm))
continue;
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, oc->nodemask,
- totalpages);
+ child_points = oom_badness(child, memcg, oc,
+ totalpages, false);
if (child_points > victim_points) {
put_task_struct(victim);
victim = child;
--
1.8.3.1
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks.
2016-02-19 14:33 [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks Tetsuo Handa
@ 2016-02-19 15:10 ` Michal Hocko
2016-02-19 16:01 ` Tetsuo Handa
0 siblings, 1 reply; 8+ messages in thread
From: Michal Hocko @ 2016-02-19 15:10 UTC (permalink / raw)
To: Tetsuo Handa; +Cc: rientjes, hannes, vdavydov, linux-mm
On Fri 19-02-16 23:33:31, Tetsuo Handa wrote:
> Currently, oom_unkillable_task() is called for twice for each thread,
> once at oom_scan_process_thread() and again at oom_badness().
>
> The reason oom_scan_process_thread() needs to call oom_unkillable_task()
> is to skip TIF_MEMDIE test and oom_task_origin() test if that thread is
> not OOM-killable.
>
> But there is a problem with this ordering, for oom_task_origin() == true
> will unconditionally select that thread regardless of oom_score_adj.
> When we merge the OOM reaper, the OOM reaper will mark already reaped
> process as OOM-unkillable by updating oom_score_adj. In order to avoid
> falling into infinite loop, oom_score_adj needs to be checked before
> doing oom_task_origin() test.
What would be the infinite loop?
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks.
2016-02-19 15:10 ` Michal Hocko
@ 2016-02-19 16:01 ` Tetsuo Handa
2016-02-19 16:17 ` Michal Hocko
2016-02-19 17:15 ` Michal Hocko
0 siblings, 2 replies; 8+ messages in thread
From: Tetsuo Handa @ 2016-02-19 16:01 UTC (permalink / raw)
To: mhocko; +Cc: rientjes, hannes, vdavydov, linux-mm
Michal Hocko wrote:
> On Fri 19-02-16 23:33:31, Tetsuo Handa wrote:
> > Currently, oom_unkillable_task() is called for twice for each thread,
> > once at oom_scan_process_thread() and again at oom_badness().
> >
> > The reason oom_scan_process_thread() needs to call oom_unkillable_task()
> > is to skip TIF_MEMDIE test and oom_task_origin() test if that thread is
> > not OOM-killable.
> >
> > But there is a problem with this ordering, for oom_task_origin() == true
> > will unconditionally select that thread regardless of oom_score_adj.
> > When we merge the OOM reaper, the OOM reaper will mark already reaped
> > process as OOM-unkillable by updating oom_score_adj. In order to avoid
> > falling into infinite loop, oom_score_adj needs to be checked before
> > doing oom_task_origin() test.
>
> What would be the infinite loop?
Sequence until we merge the OOM reaper:
(1) select_bad_process() returns p due to oom_task_origin(p) == true.
(2) oom_kill_process() sends SIGKILL to p and sets TIF_MEMDIE on p.
(3) p gets stuck at down_read(&mm->mmap_sem) in exit_mm().
(4) The OOM killer will ignore TIF_MEMDIE on p after some timeout expires.
(5) select_bad_process() returns p again due to oom_task_origin(p) == true &&
p->mm != NULL.
(6) oom_kill_process() will do only
task_lock(p);
if (p->mm && task_will_free_mem(p)) {
mark_oom_victim(p);
task_unlock(p);
put_task_struct(p);
return;
}
task_unlock(p);
which would fail to continue because p already got stuck.
Sequence after we merge the OOM reaper:
(1) select_bad_process() returns p due to oom_task_origin(p) == true.
(2) oom_kill_process() sends SIGKILL to p and sets TIF_MEMDIE on p.
(3) p gets stuck at down_read(&mm->mmap_sem) in exit_mm().
(4) The OOM reaper will reap p's memory and set
p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN and clear TIF_MEMDIE on p.
(5) select_bad_process() returns p again due to oom_task_origin(p) == true &&
p->mm != NULL.
(6) oom_kill_process() will do only
task_lock(p);
if (p->mm && task_will_free_mem(p)) {
mark_oom_victim(p);
task_unlock(p);
put_task_struct(p);
return;
}
task_unlock(p);
which would fail to continue because p already got stuck.
The difference is "TIF_MEMDIE is cleared by the OOM reaper" and
"TIF_MEMDIE is ignored by some timeout". This trap can be avoided
by not returning p due to p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN.
Well, if the OOM reaper cannot reap p's memory, skipping TIF_MEMDIE by
some timeout will again select p. Maybe we need to add a flag for
avoid selecting the same task again (e.g. PFA_OOM_NO_RECURSION in
http://lkml.kernel.org/r/201601181335.JJD69226.JHVQSMFOFOFtOL@I-love.SAKURA.ne.jp ).
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks.
2016-02-19 16:01 ` Tetsuo Handa
@ 2016-02-19 16:17 ` Michal Hocko
2016-02-19 17:15 ` Michal Hocko
1 sibling, 0 replies; 8+ messages in thread
From: Michal Hocko @ 2016-02-19 16:17 UTC (permalink / raw)
To: Tetsuo Handa; +Cc: rientjes, hannes, vdavydov, linux-mm
On Sat 20-02-16 01:01:36, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Fri 19-02-16 23:33:31, Tetsuo Handa wrote:
> > > Currently, oom_unkillable_task() is called for twice for each thread,
> > > once at oom_scan_process_thread() and again at oom_badness().
> > >
> > > The reason oom_scan_process_thread() needs to call oom_unkillable_task()
> > > is to skip TIF_MEMDIE test and oom_task_origin() test if that thread is
> > > not OOM-killable.
> > >
> > > But there is a problem with this ordering, for oom_task_origin() == true
> > > will unconditionally select that thread regardless of oom_score_adj.
> > > When we merge the OOM reaper, the OOM reaper will mark already reaped
> > > process as OOM-unkillable by updating oom_score_adj. In order to avoid
> > > falling into infinite loop, oom_score_adj needs to be checked before
> > > doing oom_task_origin() test.
> >
> > What would be the infinite loop?
>
> Sequence until we merge the OOM reaper:
>
> (1) select_bad_process() returns p due to oom_task_origin(p) == true.
> (2) oom_kill_process() sends SIGKILL to p and sets TIF_MEMDIE on p.
> (3) p gets stuck at down_read(&mm->mmap_sem) in exit_mm().
How would this happen? oom_task_origin is swapoff resp run_store (which
triggers KSM). While theoretically both can be done from multithreaded
process, does this happen in reality? I seriously doubt so. Doing many
changes in an already complex code for non-realistic cases sounds like a
bad idea to me.
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks.
2016-02-19 16:01 ` Tetsuo Handa
2016-02-19 16:17 ` Michal Hocko
@ 2016-02-19 17:15 ` Michal Hocko
2016-02-19 17:34 ` Tetsuo Handa
1 sibling, 1 reply; 8+ messages in thread
From: Michal Hocko @ 2016-02-19 17:15 UTC (permalink / raw)
To: Tetsuo Handa; +Cc: rientjes, hannes, vdavydov, linux-mm
On Sat 20-02-16 01:01:36, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Fri 19-02-16 23:33:31, Tetsuo Handa wrote:
> > > Currently, oom_unkillable_task() is called for twice for each thread,
> > > once at oom_scan_process_thread() and again at oom_badness().
> > >
> > > The reason oom_scan_process_thread() needs to call oom_unkillable_task()
> > > is to skip TIF_MEMDIE test and oom_task_origin() test if that thread is
> > > not OOM-killable.
> > >
> > > But there is a problem with this ordering, for oom_task_origin() == true
> > > will unconditionally select that thread regardless of oom_score_adj.
> > > When we merge the OOM reaper, the OOM reaper will mark already reaped
> > > process as OOM-unkillable by updating oom_score_adj. In order to avoid
> > > falling into infinite loop, oom_score_adj needs to be checked before
> > > doing oom_task_origin() test.
> >
> > What would be the infinite loop?
>
> Sequence until we merge the OOM reaper:
>
> (1) select_bad_process() returns p due to oom_task_origin(p) == true.
> (2) oom_kill_process() sends SIGKILL to p and sets TIF_MEMDIE on p.
> (3) p gets stuck at down_read(&mm->mmap_sem) in exit_mm().
> (4) The OOM killer will ignore TIF_MEMDIE on p after some timeout expires.
> (5) select_bad_process() returns p again due to oom_task_origin(p) == true &&
> p->mm != NULL.
And one more thing. The task will stop being oom_task_origin right
after it detects signal pending and retuns from try_to_unuse resp.
unmerge_and_remove_all_rmap_items.
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks.
2016-02-19 17:15 ` Michal Hocko
@ 2016-02-19 17:34 ` Tetsuo Handa
2016-02-19 18:40 ` Michal Hocko
0 siblings, 1 reply; 8+ messages in thread
From: Tetsuo Handa @ 2016-02-19 17:34 UTC (permalink / raw)
To: mhocko; +Cc: rientjes, hannes, vdavydov, linux-mm
Michal Hocko wrote:
> On Sat 20-02-16 01:01:36, Tetsuo Handa wrote:
> > Michal Hocko wrote:
> > > On Fri 19-02-16 23:33:31, Tetsuo Handa wrote:
> > > > Currently, oom_unkillable_task() is called for twice for each thread,
> > > > once at oom_scan_process_thread() and again at oom_badness().
> > > >
> > > > The reason oom_scan_process_thread() needs to call oom_unkillable_task()
> > > > is to skip TIF_MEMDIE test and oom_task_origin() test if that thread is
> > > > not OOM-killable.
> > > >
> > > > But there is a problem with this ordering, for oom_task_origin() == true
> > > > will unconditionally select that thread regardless of oom_score_adj.
> > > > When we merge the OOM reaper, the OOM reaper will mark already reaped
> > > > process as OOM-unkillable by updating oom_score_adj. In order to avoid
> > > > falling into infinite loop, oom_score_adj needs to be checked before
> > > > doing oom_task_origin() test.
> > >
> > > What would be the infinite loop?
> >
> > Sequence until we merge the OOM reaper:
> >
> > (1) select_bad_process() returns p due to oom_task_origin(p) == true.
> > (2) oom_kill_process() sends SIGKILL to p and sets TIF_MEMDIE on p.
> > (3) p gets stuck at down_read(&mm->mmap_sem) in exit_mm().
> > (4) The OOM killer will ignore TIF_MEMDIE on p after some timeout expires.
> > (5) select_bad_process() returns p again due to oom_task_origin(p) == true &&
> > p->mm != NULL.
>
> And one more thing. The task will stop being oom_task_origin right
> after it detects signal pending and retuns from try_to_unuse resp.
> unmerge_and_remove_all_rmap_items.
That's good. Then, we don't need to worry about oom_task_origin() case.
So, the purpose of this "[PATCH] mm,oom: kill duplicated oom_unkillable_task()
checks." became only to save oom_unkillable_task() call.
I'm trying to update select_bad_process() to use for_each_process() rather than
for_each_process_thread(). If we can do it, I think we can use is_oom_victim()
and respond to your concern
| This will make the scanning much more time consuming (you will check
| all the threads in the same thread group for each scanned thread!). I
| do not think this is acceptable and it is not really needed for the
| !is_sysrq_oom because we are scanning all the threads anyway.
while excluding TIF_MEMDIE processes from candidates.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks.
2016-02-19 17:34 ` Tetsuo Handa
@ 2016-02-19 18:40 ` Michal Hocko
2016-02-20 7:14 ` Tetsuo Handa
0 siblings, 1 reply; 8+ messages in thread
From: Michal Hocko @ 2016-02-19 18:40 UTC (permalink / raw)
To: Tetsuo Handa; +Cc: rientjes, hannes, vdavydov, linux-mm
On Sat 20-02-16 02:34:43, Tetsuo Handa wrote:
> Michal Hocko wrote:
> > On Sat 20-02-16 01:01:36, Tetsuo Handa wrote:
> > > Michal Hocko wrote:
> > > > On Fri 19-02-16 23:33:31, Tetsuo Handa wrote:
> > > > > Currently, oom_unkillable_task() is called for twice for each thread,
> > > > > once at oom_scan_process_thread() and again at oom_badness().
> > > > >
> > > > > The reason oom_scan_process_thread() needs to call oom_unkillable_task()
> > > > > is to skip TIF_MEMDIE test and oom_task_origin() test if that thread is
> > > > > not OOM-killable.
> > > > >
> > > > > But there is a problem with this ordering, for oom_task_origin() == true
> > > > > will unconditionally select that thread regardless of oom_score_adj.
> > > > > When we merge the OOM reaper, the OOM reaper will mark already reaped
> > > > > process as OOM-unkillable by updating oom_score_adj. In order to avoid
> > > > > falling into infinite loop, oom_score_adj needs to be checked before
> > > > > doing oom_task_origin() test.
> > > >
> > > > What would be the infinite loop?
> > >
> > > Sequence until we merge the OOM reaper:
> > >
> > > (1) select_bad_process() returns p due to oom_task_origin(p) == true.
> > > (2) oom_kill_process() sends SIGKILL to p and sets TIF_MEMDIE on p.
> > > (3) p gets stuck at down_read(&mm->mmap_sem) in exit_mm().
> > > (4) The OOM killer will ignore TIF_MEMDIE on p after some timeout expires.
> > > (5) select_bad_process() returns p again due to oom_task_origin(p) == true &&
> > > p->mm != NULL.
> >
> > And one more thing. The task will stop being oom_task_origin right
> > after it detects signal pending and retuns from try_to_unuse resp.
> > unmerge_and_remove_all_rmap_items.
>
> That's good. Then, we don't need to worry about oom_task_origin() case.
> So, the purpose of this "[PATCH] mm,oom: kill duplicated oom_unkillable_task()
> checks." became only to save oom_unkillable_task() call.
which sounds like hard to justify imnho considering:
4 files changed, 50 insertions(+), 78 deletions(-)
> I'm trying to update select_bad_process() to use for_each_process() rather than
> for_each_process_thread(). If we can do it, I think we can use is_oom_victim()
> and respond to your concern
3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel panics"). I
would really recommend to look through the history (git blame is your
friend) before suggesting any changes in this area. The code is really
subtle and many times for good reasons.
--
Michal Hocko
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks.
2016-02-19 18:40 ` Michal Hocko
@ 2016-02-20 7:14 ` Tetsuo Handa
0 siblings, 0 replies; 8+ messages in thread
From: Tetsuo Handa @ 2016-02-20 7:14 UTC (permalink / raw)
To: mhocko; +Cc: rientjes, hannes, vdavydov, linux-mm
Michal Hocko wrote:
> > I'm trying to update select_bad_process() to use for_each_process() rather than
> > for_each_process_thread(). If we can do it, I think we can use is_oom_victim()
> > and respond to your concern
>
> 3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel panics"). I
> would really recommend to look through the history (git blame is your
> friend) before suggesting any changes in this area. The code is really
> subtle and many times for good reasons.
I think the code is too subtle to touch.
I wish I were able to propose as simple OOM killing as shown below.
(1) Do not clear TIF_MEMDIE until TASK_DEAD.
This should help cases where memory allocations are needed after
exit_mm().
(2) Do not wait TIF_MEMDIE task forever using simple timeout.
This should help cases where the victim task is unable to make
forward progress.
(3) Do not allow ALLOC_NO_WATERMARKS by TIF_MEMDIE tasks.
This should help avoid depletion of memory reserves.
(4) Allow any SIGKILL or PF_EXITING tasks access part of memory reserves.
This should help exiting tasks to make forward progress.
(5) Set TIF_MEMDIE to all threads chosen by the OOM killer.
This should help reduce mmap_sem problem.
(6) Select next task to set TIF_MEMDIE after timeout rather than
give up TIF_MEMDIE task's memory allocations.
This should help reduce possibility of unexpected failures by
e.g. btrfs's BUG_ON() trap.
(7) Allow GFP_KILLABLE. This should help quicker termination.
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 45993b8..87a004e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -70,7 +70,7 @@ static inline bool oom_task_origin(const struct task_struct *p)
return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
}
-extern void mark_oom_victim(struct task_struct *tsk);
+extern bool mark_oom_victim(struct task_struct *tsk);
extern unsigned long oom_badness(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask,
diff --git a/kernel/exit.c b/kernel/exit.c
index fd90195..d4ff477 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -434,8 +434,6 @@ static void exit_mm(struct task_struct *tsk)
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
- if (test_thread_flag(TIF_MEMDIE))
- exit_oom_victim(tsk);
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -825,6 +823,7 @@ void do_exit(long code)
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
+ exit_oom_victim(tsk);
schedule();
BUG();
/* Avoid "noreturn function does return". */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ae8b81c..5be5a85 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1258,10 +1258,9 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
- if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- mark_oom_victim(current);
+ if ((fatal_signal_pending(current) || task_will_free_mem(current)) &&
+ mark_oom_victim(current))
goto unlock;
- }
check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d7bb9c1..6e7f5e4a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -50,6 +50,11 @@ int sysctl_oom_dump_tasks = 1;
DEFINE_MUTEX(oom_lock);
+static void oomkiller_reset(unsigned long arg)
+{
+}
+static DEFINE_TIMER(oomkiller_victim_wait_timer, oomkiller_reset, 0, 0);
+
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
@@ -278,9 +283,10 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
* This task already has access to memory reserves and is being killed.
* Don't allow any other task to have access to the reserves.
*/
- if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
+ if (test_tsk_thread_flag(task, TIF_MEMDIE) && task->state != TASK_DEAD) {
if (!is_sysrq_oom(oc))
- return OOM_SCAN_ABORT;
+ return timer_pending(&oomkiller_victim_wait_timer) ?
+ OOM_SCAN_ABORT : OOM_SCAN_CONTINUE;
}
if (!task->mm)
return OOM_SCAN_CONTINUE;
@@ -292,9 +298,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !is_sysrq_oom(oc))
- return OOM_SCAN_ABORT;
-
return OOM_SCAN_OK;
}
@@ -584,12 +587,12 @@ static void wake_oom_reaper(struct task_struct *mm)
* Has to be called with oom_lock held and never after
* oom has been disabled already.
*/
-void mark_oom_victim(struct task_struct *tsk)
+bool mark_oom_victim(struct task_struct *tsk)
{
WARN_ON(oom_killer_disabled);
/* OOM killer might race with memcg OOM */
if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
- return;
+ return false;
/*
* Make sure that the task is woken up from uninterruptible sleep
* if it is frozen because OOM killer wouldn't be able to free
@@ -598,6 +601,9 @@ void mark_oom_victim(struct task_struct *tsk)
*/
__thaw_task(tsk);
atomic_inc(&oom_victims);
+ /* Make sure that we won't wait for this task forever. */
+ mod_timer(&oomkiller_victim_wait_timer, jiffies + 5 * HZ);
+ return true;
}
/**
@@ -689,8 +695,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
task_lock(p);
- if (p->mm && task_will_free_mem(p)) {
- mark_oom_victim(p);
+ if (task_will_free_mem(p) && mark_oom_victim(p)) {
task_unlock(p);
put_task_struct(p);
return;
@@ -759,20 +764,16 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
task_unlock(victim);
/*
- * Kill all user processes sharing victim->mm in other thread groups, if
- * any. They don't get access to memory reserves, though, to avoid
- * depletion of all memory. This prevents mm->mmap_sem livelock when an
- * oom killed thread cannot exit because it requires the semaphore and
- * its contended by another thread trying to allocate memory itself.
- * That thread will now get access to memory reserves since it has a
- * pending fatal signal.
+ * Kill all user processes sharing victim->mm. We give them access to
+ * memory reserves because they are expected to terminate immediately.
+ * Even if memory reserves were depleted, the OOM reaper takes care of
+ * allowing the OOM killer to choose next victim by marking current
+ * victim as OOM-unkillable and clearing TIF_MEMDIE.
*/
rcu_read_lock();
for_each_process(p) {
if (!process_shares_mm(p, mm))
continue;
- if (same_thread_group(p, victim))
- continue;
if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
/*
@@ -784,6 +785,12 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
}
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+ for_each_thread(p, t) {
+ task_lock(t);
+ if (t->mm)
+ mark_oom_victim(t);
+ task_unlock(t);
+ }
}
rcu_read_unlock();
@@ -863,15 +870,10 @@ bool out_of_memory(struct oom_control *oc)
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
- *
- * But don't select if current has already released its mm and cleared
- * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
*/
- if (current->mm &&
- (fatal_signal_pending(current) || task_will_free_mem(current))) {
- mark_oom_victim(current);
+ if ((fatal_signal_pending(current) || task_will_free_mem(current)) &&
+ mark_oom_victim(current))
return true;
- }
/*
* Check if there were limitations on the allocation (only relevant for
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 85e7588..8c9e707 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3065,10 +3065,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_NO_WATERMARKS;
else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
+ else if (!in_interrupt() && (current->flags & PF_MEMALLOC))
alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (!in_interrupt() && (fatal_signal_pending(current) ||
+ (current->flags & PF_EXITING)))
+ alloc_flags |= ALLOC_HARDER;
}
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -3291,10 +3292,6 @@ retry:
goto nopage;
}
- /* Avoid allocations with no watermarks from looping endlessly */
- if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
-
/*
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related [flat|nested] 8+ messages in thread
end of thread, other threads:[~2016-02-20 7:15 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-02-19 14:33 [PATCH] mm,oom: kill duplicated oom_unkillable_task() checks Tetsuo Handa
2016-02-19 15:10 ` Michal Hocko
2016-02-19 16:01 ` Tetsuo Handa
2016-02-19 16:17 ` Michal Hocko
2016-02-19 17:15 ` Michal Hocko
2016-02-19 17:34 ` Tetsuo Handa
2016-02-19 18:40 ` Michal Hocko
2016-02-20 7:14 ` Tetsuo Handa
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).