diff for duplicates of <20171011161024.GA26974@castle> diff --git a/a/1.txt b/N1/1.txt index 66261f6..fe97bb4 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -136,498 +136,3 @@ Also, I've closed the race, you've pointed on. Thanks! -------------------------------------------------------------------------------- -From 7f51d26be2d2a5b6e4840574f72beb15920c0993 Mon Sep 17 00:00:00 2001 -From: Roman Gushchin <guro@fb.com> -Date: Thu, 25 May 2017 14:18:45 +0100 -Subject: [v12 3/6] mm, oom: cgroup-aware OOM killer - -Traditionally, the OOM killer is operating on a process level. -Under oom conditions, it finds a process with the highest oom score -and kills it. - -This behavior doesn't suit well the system with many running -containers: - -1) There is no fairness between containers. A small container with -few large processes will be chosen over a large one with huge -number of small processes. - -2) Containers often do not expect that some random process inside -will be killed. In many cases much safer behavior is to kill -all tasks in the container. Traditionally, this was implemented -in userspace, but doing it in the kernel has some advantages, -especially in a case of a system-wide OOM. - -To address these issues, the cgroup-aware OOM killer is introduced. - -This patch introduces the core functionality: an ability to select -a memory cgroup as an OOM victim. Under OOM conditions the OOM killer -looks for the biggest leaf memory cgroup and kills the biggest -task belonging to it. - -The following patches will extend this functionality to consider -non-leaf memory cgroups as OOM victims, and also provide an ability -to kill all tasks belonging to the victim cgroup. - -The root cgroup is treated as a leaf memory cgroup, so it's score -is compared with other leaf memory cgroups. -Due to memcg statistics implementation a special approximation -is used for estimating oom_score of root memory cgroup: we sum -oom_score of the belonging processes (or, to be more precise, -tasks owning their mm structures). - -Signed-off-by: Roman Gushchin <guro@fb.com> -Cc: Michal Hocko <mhocko@suse.com> -Cc: Vladimir Davydov <vdavydov.dev@gmail.com> -Cc: Johannes Weiner <hannes@cmpxchg.org> -Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> -Cc: David Rientjes <rientjes@google.com> -Cc: Andrew Morton <akpm@linux-foundation.org> -Cc: Tejun Heo <tj@kernel.org> -Cc: kernel-team@fb.com -Cc: cgroups@vger.kernel.org -Cc: linux-doc@vger.kernel.org -Cc: linux-kernel@vger.kernel.org -Cc: linux-mm@kvack.org ---- - include/linux/memcontrol.h | 17 +++++ - include/linux/oom.h | 12 ++- - mm/memcontrol.c | 181 +++++++++++++++++++++++++++++++++++++++++++++ - mm/oom_kill.c | 72 +++++++++++++----- - 4 files changed, 262 insertions(+), 20 deletions(-) - -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 69966c461d1c..75b63b68846e 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -35,6 +35,7 @@ struct mem_cgroup; - struct page; - struct mm_struct; - struct kmem_cache; -+struct oom_control; - - /* Cgroup-specific page state, on top of universal node page state */ - enum memcg_stat_item { -@@ -342,6 +343,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ - return css ? container_of(css, struct mem_cgroup, css) : NULL; - } - -+static inline void mem_cgroup_put(struct mem_cgroup *memcg) -+{ -+ css_put(&memcg->css); -+} -+ - #define mem_cgroup_from_counter(counter, member) \ - container_of(counter, struct mem_cgroup, member) - -@@ -480,6 +486,8 @@ static inline bool task_in_memcg_oom(struct task_struct *p) - - bool mem_cgroup_oom_synchronize(bool wait); - -+bool mem_cgroup_select_oom_victim(struct oom_control *oc); -+ - #ifdef CONFIG_MEMCG_SWAP - extern int do_swap_account; - #endif -@@ -744,6 +752,10 @@ static inline bool task_in_mem_cgroup(struct task_struct *task, - return true; - } - -+static inline void mem_cgroup_put(struct mem_cgroup *memcg) -+{ -+} -+ - static inline struct mem_cgroup * - mem_cgroup_iter(struct mem_cgroup *root, - struct mem_cgroup *prev, -@@ -936,6 +948,11 @@ static inline - void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) - { - } -+ -+static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc) -+{ -+ return false; -+} - #endif /* CONFIG_MEMCG */ - - /* idx can be of type enum memcg_stat_item or node_stat_item */ -diff --git a/include/linux/oom.h b/include/linux/oom.h -index 76aac4ce39bc..ca78e2d5956e 100644 ---- a/include/linux/oom.h -+++ b/include/linux/oom.h -@@ -9,6 +9,13 @@ - #include <linux/sched/coredump.h> /* MMF_* */ - #include <linux/mm.h> /* VM_FAULT* */ - -+ -+/* -+ * Special value returned by victim selection functions to indicate -+ * that are inflight OOM victims. -+ */ -+#define INFLIGHT_VICTIM ((void *)-1UL) -+ - struct zonelist; - struct notifier_block; - struct mem_cgroup; -@@ -39,7 +46,8 @@ struct oom_control { - - /* Used by oom implementation, do not set */ - unsigned long totalpages; -- struct task_struct *chosen; -+ struct task_struct *chosen_task; -+ struct mem_cgroup *chosen_memcg; - unsigned long chosen_points; - }; - -@@ -101,6 +109,8 @@ extern void oom_killer_enable(void); - - extern struct task_struct *find_lock_task_mm(struct task_struct *p); - -+extern int oom_evaluate_task(struct task_struct *task, void *arg); -+ - /* sysctls */ - extern int sysctl_oom_dump_tasks; - extern int sysctl_oom_kill_allocating_task; -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index df3368734f1c..8f04e1fb9dd9 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -2670,6 +2670,187 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) - return ret; - } - -+static long memcg_oom_badness(struct mem_cgroup *memcg, -+ const nodemask_t *nodemask, -+ unsigned long totalpages) -+{ -+ long points = 0; -+ int nid; -+ pg_data_t *pgdat; -+ -+ for_each_node_state(nid, N_MEMORY) { -+ if (nodemask && !node_isset(nid, *nodemask)) -+ continue; -+ -+ points += mem_cgroup_node_nr_lru_pages(memcg, nid, -+ LRU_ALL_ANON | BIT(LRU_UNEVICTABLE)); -+ -+ pgdat = NODE_DATA(nid); -+ points += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg), -+ NR_SLAB_UNRECLAIMABLE); -+ } -+ -+ points += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) / -+ (PAGE_SIZE / 1024); -+ points += memcg_page_state(memcg, MEMCG_SOCK); -+ points += memcg_page_state(memcg, MEMCG_SWAP); -+ -+ return points; -+} -+ -+/* -+ * Checks if the given memcg is a valid OOM victim and returns a number, -+ * which means the folowing: -+ * -1: there are inflight OOM victim tasks, belonging to the memcg -+ * 0: memcg is not eligible, e.g. all belonging tasks are protected -+ * by oom_score_adj set to OOM_SCORE_ADJ_MIN -+ * >0: memcg is eligible, and the returned value is an estimation -+ * of the memory footprint -+ */ -+static long oom_evaluate_memcg(struct mem_cgroup *memcg, -+ const nodemask_t *nodemask, -+ unsigned long totalpages) -+{ -+ struct css_task_iter it; -+ struct task_struct *task; -+ int eligible = 0; -+ -+ /* -+ * Root memory cgroup is a special case: -+ * we don't have necessary stats to evaluate it exactly as -+ * leaf memory cgroups, so we approximate it's oom_score -+ * by summing oom_score of all belonging tasks, which are -+ * owners of their mm structs. -+ * -+ * If there are inflight OOM victim tasks inside -+ * the root memcg, we return -1. -+ */ -+ if (memcg == root_mem_cgroup) { -+ struct css_task_iter it; -+ struct task_struct *task; -+ long score = 0; -+ -+ css_task_iter_start(&memcg->css, 0, &it); -+ while ((task = css_task_iter_next(&it))) { -+ if (tsk_is_oom_victim(task) && -+ !test_bit(MMF_OOM_SKIP, -+ &task->signal->oom_mm->flags)) { -+ score = -1; -+ break; -+ } -+ -+ task_lock(task); -+ if (!task->mm || task->mm->owner != task) { -+ task_unlock(task); -+ continue; -+ } -+ task_unlock(task); -+ -+ score += oom_badness(task, memcg, nodemask, -+ totalpages); -+ } -+ css_task_iter_end(&it); -+ -+ return score; -+ } -+ -+ /* -+ * Memcg is OOM eligible if there are OOM killable tasks inside. -+ * -+ * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN -+ * as unkillable. -+ * -+ * If there are inflight OOM victim tasks inside the memcg, -+ * we return -1. -+ */ -+ css_task_iter_start(&memcg->css, 0, &it); -+ while ((task = css_task_iter_next(&it))) { -+ if (!eligible && -+ task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) -+ eligible = 1; -+ -+ if (tsk_is_oom_victim(task) && -+ !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) { -+ eligible = -1; -+ break; -+ } -+ } -+ css_task_iter_end(&it); -+ -+ if (eligible <= 0) -+ return eligible; -+ -+ return memcg_oom_badness(memcg, nodemask, totalpages); -+} -+ -+static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc) -+{ -+ struct mem_cgroup *iter; -+ -+ oc->chosen_memcg = NULL; -+ oc->chosen_points = 0; -+ -+ /* -+ * The oom_score is calculated for leaf memory cgroups (including -+ * the root memcg). -+ */ -+ rcu_read_lock(); -+ for_each_mem_cgroup_tree(iter, root) { -+ long score; -+ -+ if (memcg_has_children(iter) && iter != root_mem_cgroup) -+ continue; -+ -+ score = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages); -+ -+ /* -+ * Ignore empty and non-eligible memory cgroups. -+ */ -+ if (score == 0) -+ continue; -+ -+ /* -+ * If there are inflight OOM victims, we don't need -+ * to look further for new victims. -+ */ -+ if (score == -1) { -+ oc->chosen_memcg = INFLIGHT_VICTIM; -+ mem_cgroup_iter_break(root, iter); -+ break; -+ } -+ -+ if (score > oc->chosen_points) { -+ oc->chosen_points = score; -+ oc->chosen_memcg = iter; -+ } -+ } -+ -+ if (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM) -+ css_get(&oc->chosen_memcg->css); -+ -+ rcu_read_unlock(); -+} -+ -+bool mem_cgroup_select_oom_victim(struct oom_control *oc) -+{ -+ struct mem_cgroup *root; -+ -+ if (mem_cgroup_disabled()) -+ return false; -+ -+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) -+ return false; -+ -+ if (oc->memcg) -+ root = oc->memcg; -+ else -+ root = root_mem_cgroup; -+ -+ select_victim_memcg(root, oc); -+ -+ return oc->chosen_memcg; -+} -+ - /* - * Reclaims as many pages from the given memcg as possible. - * -diff --git a/mm/oom_kill.c b/mm/oom_kill.c -index 0b9f36117989..5b670adb850c 100644 ---- a/mm/oom_kill.c -+++ b/mm/oom_kill.c -@@ -309,7 +309,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) - return CONSTRAINT_NONE; - } - --static int oom_evaluate_task(struct task_struct *task, void *arg) -+int oom_evaluate_task(struct task_struct *task, void *arg) - { - struct oom_control *oc = arg; - unsigned long points; -@@ -343,26 +343,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) - goto next; - - /* Prefer thread group leaders for display purposes */ -- if (points == oc->chosen_points && thread_group_leader(oc->chosen)) -+ if (points == oc->chosen_points && thread_group_leader(oc->chosen_task)) - goto next; - select: -- if (oc->chosen) -- put_task_struct(oc->chosen); -+ if (oc->chosen_task) -+ put_task_struct(oc->chosen_task); - get_task_struct(task); -- oc->chosen = task; -+ oc->chosen_task = task; - oc->chosen_points = points; - next: - return 0; - abort: -- if (oc->chosen) -- put_task_struct(oc->chosen); -- oc->chosen = (void *)-1UL; -+ if (oc->chosen_task) -+ put_task_struct(oc->chosen_task); -+ oc->chosen_task = INFLIGHT_VICTIM; - return 1; - } - - /* - * Simple selection loop. We choose the process with the highest number of -- * 'points'. In case scan was aborted, oc->chosen is set to -1. -+ * 'points'. In case scan was aborted, oc->chosen_task is set to -1. - */ - static void select_bad_process(struct oom_control *oc) - { -@@ -923,7 +923,7 @@ static void __oom_kill_process(struct task_struct *victim) - - static void oom_kill_process(struct oom_control *oc, const char *message) - { -- struct task_struct *p = oc->chosen; -+ struct task_struct *p = oc->chosen_task; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; -@@ -984,6 +984,27 @@ static void oom_kill_process(struct oom_control *oc, const char *message) - __oom_kill_process(victim); - } - -+static bool oom_kill_memcg_victim(struct oom_control *oc) -+{ -+ -+ if (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM) -+ return oc->chosen_memcg; -+ -+ /* Kill a task in the chosen memcg with the biggest memory footprint */ -+ oc->chosen_points = 0; -+ oc->chosen_task = NULL; -+ mem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc); -+ -+ if (oc->chosen_task == NULL || oc->chosen_task == INFLIGHT_VICTIM) -+ goto out; -+ -+ __oom_kill_process(oc->chosen_task); -+ -+out: -+ mem_cgroup_put(oc->chosen_memcg); -+ return oc->chosen_task; -+} -+ - /* - * Determines whether the kernel must panic because of the panic_on_oom sysctl. - */ -@@ -1036,6 +1057,7 @@ bool out_of_memory(struct oom_control *oc) - { - unsigned long freed = 0; - enum oom_constraint constraint = CONSTRAINT_NONE; -+ bool delay = false; /* if set, delay next allocation attempt */ - - if (oom_killer_disabled) - return false; -@@ -1080,27 +1102,39 @@ bool out_of_memory(struct oom_control *oc) - current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && - current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { - get_task_struct(current); -- oc->chosen = current; -+ oc->chosen_task = current; - oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); - return true; - } - -+ if (mem_cgroup_select_oom_victim(oc)) { -+ if (oom_kill_memcg_victim(oc)) -+ delay = true; -+ -+ goto out; -+ } -+ - select_bad_process(oc); - /* Found nothing?!?! Either we hang forever, or we panic. */ -- if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { -+ if (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) { - dump_header(oc, NULL); - panic("Out of memory and no killable processes...\n"); - } -- if (oc->chosen && oc->chosen != (void *)-1UL) { -+ if (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) { - oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : - "Memory cgroup out of memory"); -- /* -- * Give the killed process a good chance to exit before trying -- * to allocate memory again. -- */ -- schedule_timeout_killable(1); -+ delay = true; - } -- return !!oc->chosen; -+ -+out: -+ /* -+ * Give the killed process a good chance to exit before trying -+ * to allocate memory again. -+ */ -+ if (delay) -+ schedule_timeout_killable(1); -+ -+ return !!oc->chosen_task; - } - - /* --- -2.13.6 - --- -To unsubscribe, send a message with 'unsubscribe linux-mm' in -the body to majordomo@kvack.org. For more info on Linux MM, -see: http://www.linux-mm.org/ . -Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> diff --git a/a/content_digest b/N1/content_digest index ba7f882..2ff53f1 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -157,501 +157,6 @@ "\n" "Thanks!\n" "\n" - "--------------------------------------------------------------------------------\n" - "From 7f51d26be2d2a5b6e4840574f72beb15920c0993 Mon Sep 17 00:00:00 2001\n" - "From: Roman Gushchin <guro@fb.com>\n" - "Date: Thu, 25 May 2017 14:18:45 +0100\n" - "Subject: [v12 3/6] mm, oom: cgroup-aware OOM killer\n" - "\n" - "Traditionally, the OOM killer is operating on a process level.\n" - "Under oom conditions, it finds a process with the highest oom score\n" - "and kills it.\n" - "\n" - "This behavior doesn't suit well the system with many running\n" - "containers:\n" - "\n" - "1) There is no fairness between containers. A small container with\n" - "few large processes will be chosen over a large one with huge\n" - "number of small processes.\n" - "\n" - "2) Containers often do not expect that some random process inside\n" - "will be killed. In many cases much safer behavior is to kill\n" - "all tasks in the container. Traditionally, this was implemented\n" - "in userspace, but doing it in the kernel has some advantages,\n" - "especially in a case of a system-wide OOM.\n" - "\n" - "To address these issues, the cgroup-aware OOM killer is introduced.\n" - "\n" - "This patch introduces the core functionality: an ability to select\n" - "a memory cgroup as an OOM victim. Under OOM conditions the OOM killer\n" - "looks for the biggest leaf memory cgroup and kills the biggest\n" - "task belonging to it.\n" - "\n" - "The following patches will extend this functionality to consider\n" - "non-leaf memory cgroups as OOM victims, and also provide an ability\n" - "to kill all tasks belonging to the victim cgroup.\n" - "\n" - "The root cgroup is treated as a leaf memory cgroup, so it's score\n" - "is compared with other leaf memory cgroups.\n" - "Due to memcg statistics implementation a special approximation\n" - "is used for estimating oom_score of root memory cgroup: we sum\n" - "oom_score of the belonging processes (or, to be more precise,\n" - "tasks owning their mm structures).\n" - "\n" - "Signed-off-by: Roman Gushchin <guro@fb.com>\n" - "Cc: Michal Hocko <mhocko@suse.com>\n" - "Cc: Vladimir Davydov <vdavydov.dev@gmail.com>\n" - "Cc: Johannes Weiner <hannes@cmpxchg.org>\n" - "Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>\n" - "Cc: David Rientjes <rientjes@google.com>\n" - "Cc: Andrew Morton <akpm@linux-foundation.org>\n" - "Cc: Tejun Heo <tj@kernel.org>\n" - "Cc: kernel-team@fb.com\n" - "Cc: cgroups@vger.kernel.org\n" - "Cc: linux-doc@vger.kernel.org\n" - "Cc: linux-kernel@vger.kernel.org\n" - "Cc: linux-mm@kvack.org\n" - "---\n" - " include/linux/memcontrol.h | 17 +++++\n" - " include/linux/oom.h | 12 ++-\n" - " mm/memcontrol.c | 181 +++++++++++++++++++++++++++++++++++++++++++++\n" - " mm/oom_kill.c | 72 +++++++++++++-----\n" - " 4 files changed, 262 insertions(+), 20 deletions(-)\n" - "\n" - "diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h\n" - "index 69966c461d1c..75b63b68846e 100644\n" - "--- a/include/linux/memcontrol.h\n" - "+++ b/include/linux/memcontrol.h\n" - "@@ -35,6 +35,7 @@ struct mem_cgroup;\n" - " struct page;\n" - " struct mm_struct;\n" - " struct kmem_cache;\n" - "+struct oom_control;\n" - " \n" - " /* Cgroup-specific page state, on top of universal node page state */\n" - " enum memcg_stat_item {\n" - "@@ -342,6 +343,11 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){\n" - " \treturn css ? container_of(css, struct mem_cgroup, css) : NULL;\n" - " }\n" - " \n" - "+static inline void mem_cgroup_put(struct mem_cgroup *memcg)\n" - "+{\n" - "+\tcss_put(&memcg->css);\n" - "+}\n" - "+\n" - " #define mem_cgroup_from_counter(counter, member)\t\\\n" - " \tcontainer_of(counter, struct mem_cgroup, member)\n" - " \n" - "@@ -480,6 +486,8 @@ static inline bool task_in_memcg_oom(struct task_struct *p)\n" - " \n" - " bool mem_cgroup_oom_synchronize(bool wait);\n" - " \n" - "+bool mem_cgroup_select_oom_victim(struct oom_control *oc);\n" - "+\n" - " #ifdef CONFIG_MEMCG_SWAP\n" - " extern int do_swap_account;\n" - " #endif\n" - "@@ -744,6 +752,10 @@ static inline bool task_in_mem_cgroup(struct task_struct *task,\n" - " \treturn true;\n" - " }\n" - " \n" - "+static inline void mem_cgroup_put(struct mem_cgroup *memcg)\n" - "+{\n" - "+}\n" - "+\n" - " static inline struct mem_cgroup *\n" - " mem_cgroup_iter(struct mem_cgroup *root,\n" - " \t\tstruct mem_cgroup *prev,\n" - "@@ -936,6 +948,11 @@ static inline\n" - " void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)\n" - " {\n" - " }\n" - "+\n" - "+static inline bool mem_cgroup_select_oom_victim(struct oom_control *oc)\n" - "+{\n" - "+\treturn false;\n" - "+}\n" - " #endif /* CONFIG_MEMCG */\n" - " \n" - " /* idx can be of type enum memcg_stat_item or node_stat_item */\n" - "diff --git a/include/linux/oom.h b/include/linux/oom.h\n" - "index 76aac4ce39bc..ca78e2d5956e 100644\n" - "--- a/include/linux/oom.h\n" - "+++ b/include/linux/oom.h\n" - "@@ -9,6 +9,13 @@\n" - " #include <linux/sched/coredump.h> /* MMF_* */\n" - " #include <linux/mm.h> /* VM_FAULT* */\n" - " \n" - "+\n" - "+/*\n" - "+ * Special value returned by victim selection functions to indicate\n" - "+ * that are inflight OOM victims.\n" - "+ */\n" - "+#define INFLIGHT_VICTIM ((void *)-1UL)\n" - "+\n" - " struct zonelist;\n" - " struct notifier_block;\n" - " struct mem_cgroup;\n" - "@@ -39,7 +46,8 @@ struct oom_control {\n" - " \n" - " \t/* Used by oom implementation, do not set */\n" - " \tunsigned long totalpages;\n" - "-\tstruct task_struct *chosen;\n" - "+\tstruct task_struct *chosen_task;\n" - "+\tstruct mem_cgroup *chosen_memcg;\n" - " \tunsigned long chosen_points;\n" - " };\n" - " \n" - "@@ -101,6 +109,8 @@ extern void oom_killer_enable(void);\n" - " \n" - " extern struct task_struct *find_lock_task_mm(struct task_struct *p);\n" - " \n" - "+extern int oom_evaluate_task(struct task_struct *task, void *arg);\n" - "+\n" - " /* sysctls */\n" - " extern int sysctl_oom_dump_tasks;\n" - " extern int sysctl_oom_kill_allocating_task;\n" - "diff --git a/mm/memcontrol.c b/mm/memcontrol.c\n" - "index df3368734f1c..8f04e1fb9dd9 100644\n" - "--- a/mm/memcontrol.c\n" - "+++ b/mm/memcontrol.c\n" - "@@ -2670,6 +2670,187 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)\n" - " \treturn ret;\n" - " }\n" - " \n" - "+static long memcg_oom_badness(struct mem_cgroup *memcg,\n" - "+\t\t\t const nodemask_t *nodemask,\n" - "+\t\t\t unsigned long totalpages)\n" - "+{\n" - "+\tlong points = 0;\n" - "+\tint nid;\n" - "+\tpg_data_t *pgdat;\n" - "+\n" - "+\tfor_each_node_state(nid, N_MEMORY) {\n" - "+\t\tif (nodemask && !node_isset(nid, *nodemask))\n" - "+\t\t\tcontinue;\n" - "+\n" - "+\t\tpoints += mem_cgroup_node_nr_lru_pages(memcg, nid,\n" - "+\t\t\t\tLRU_ALL_ANON | BIT(LRU_UNEVICTABLE));\n" - "+\n" - "+\t\tpgdat = NODE_DATA(nid);\n" - "+\t\tpoints += lruvec_page_state(mem_cgroup_lruvec(pgdat, memcg),\n" - "+\t\t\t\t\t NR_SLAB_UNRECLAIMABLE);\n" - "+\t}\n" - "+\n" - "+\tpoints += memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) /\n" - "+\t\t(PAGE_SIZE / 1024);\n" - "+\tpoints += memcg_page_state(memcg, MEMCG_SOCK);\n" - "+\tpoints += memcg_page_state(memcg, MEMCG_SWAP);\n" - "+\n" - "+\treturn points;\n" - "+}\n" - "+\n" - "+/*\n" - "+ * Checks if the given memcg is a valid OOM victim and returns a number,\n" - "+ * which means the folowing:\n" - "+ * -1: there are inflight OOM victim tasks, belonging to the memcg\n" - "+ * 0: memcg is not eligible, e.g. all belonging tasks are protected\n" - "+ * by oom_score_adj set to OOM_SCORE_ADJ_MIN\n" - "+ * >0: memcg is eligible, and the returned value is an estimation\n" - "+ * of the memory footprint\n" - "+ */\n" - "+static long oom_evaluate_memcg(struct mem_cgroup *memcg,\n" - "+\t\t\t const nodemask_t *nodemask,\n" - "+\t\t\t unsigned long totalpages)\n" - "+{\n" - "+\tstruct css_task_iter it;\n" - "+\tstruct task_struct *task;\n" - "+\tint eligible = 0;\n" - "+\n" - "+\t/*\n" - "+\t * Root memory cgroup is a special case:\n" - "+\t * we don't have necessary stats to evaluate it exactly as\n" - "+\t * leaf memory cgroups, so we approximate it's oom_score\n" - "+\t * by summing oom_score of all belonging tasks, which are\n" - "+\t * owners of their mm structs.\n" - "+\t *\n" - "+\t * If there are inflight OOM victim tasks inside\n" - "+\t * the root memcg, we return -1.\n" - "+\t */\n" - "+\tif (memcg == root_mem_cgroup) {\n" - "+\t\tstruct css_task_iter it;\n" - "+\t\tstruct task_struct *task;\n" - "+\t\tlong score = 0;\n" - "+\n" - "+\t\tcss_task_iter_start(&memcg->css, 0, &it);\n" - "+\t\twhile ((task = css_task_iter_next(&it))) {\n" - "+\t\t\tif (tsk_is_oom_victim(task) &&\n" - "+\t\t\t !test_bit(MMF_OOM_SKIP,\n" - "+\t\t\t\t &task->signal->oom_mm->flags)) {\n" - "+\t\t\t\tscore = -1;\n" - "+\t\t\t\tbreak;\n" - "+\t\t\t}\n" - "+\n" - "+\t\t\ttask_lock(task);\n" - "+\t\t\tif (!task->mm || task->mm->owner != task) {\n" - "+\t\t\t\ttask_unlock(task);\n" - "+\t\t\t\tcontinue;\n" - "+\t\t\t}\n" - "+\t\t\ttask_unlock(task);\n" - "+\n" - "+\t\t\tscore += oom_badness(task, memcg, nodemask,\n" - "+\t\t\t\t\t totalpages);\n" - "+\t\t}\n" - "+\t\tcss_task_iter_end(&it);\n" - "+\n" - "+\t\treturn score;\n" - "+\t}\n" - "+\n" - "+\t/*\n" - "+\t * Memcg is OOM eligible if there are OOM killable tasks inside.\n" - "+\t *\n" - "+\t * We treat tasks with oom_score_adj set to OOM_SCORE_ADJ_MIN\n" - "+\t * as unkillable.\n" - "+\t *\n" - "+\t * If there are inflight OOM victim tasks inside the memcg,\n" - "+\t * we return -1.\n" - "+\t */\n" - "+\tcss_task_iter_start(&memcg->css, 0, &it);\n" - "+\twhile ((task = css_task_iter_next(&it))) {\n" - "+\t\tif (!eligible &&\n" - "+\t\t task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN)\n" - "+\t\t\teligible = 1;\n" - "+\n" - "+\t\tif (tsk_is_oom_victim(task) &&\n" - "+\t\t !test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) {\n" - "+\t\t\teligible = -1;\n" - "+\t\t\tbreak;\n" - "+\t\t}\n" - "+\t}\n" - "+\tcss_task_iter_end(&it);\n" - "+\n" - "+\tif (eligible <= 0)\n" - "+\t\treturn eligible;\n" - "+\n" - "+\treturn memcg_oom_badness(memcg, nodemask, totalpages);\n" - "+}\n" - "+\n" - "+static void select_victim_memcg(struct mem_cgroup *root, struct oom_control *oc)\n" - "+{\n" - "+\tstruct mem_cgroup *iter;\n" - "+\n" - "+\toc->chosen_memcg = NULL;\n" - "+\toc->chosen_points = 0;\n" - "+\n" - "+\t/*\n" - "+\t * The oom_score is calculated for leaf memory cgroups (including\n" - "+\t * the root memcg).\n" - "+\t */\n" - "+\trcu_read_lock();\n" - "+\tfor_each_mem_cgroup_tree(iter, root) {\n" - "+\t\tlong score;\n" - "+\n" - "+\t\tif (memcg_has_children(iter) && iter != root_mem_cgroup)\n" - "+\t\t\tcontinue;\n" - "+\n" - "+\t\tscore = oom_evaluate_memcg(iter, oc->nodemask, oc->totalpages);\n" - "+\n" - "+\t\t/*\n" - "+\t\t * Ignore empty and non-eligible memory cgroups.\n" - "+\t\t */\n" - "+\t\tif (score == 0)\n" - "+\t\t\tcontinue;\n" - "+\n" - "+\t\t/*\n" - "+\t\t * If there are inflight OOM victims, we don't need\n" - "+\t\t * to look further for new victims.\n" - "+\t\t */\n" - "+\t\tif (score == -1) {\n" - "+\t\t\toc->chosen_memcg = INFLIGHT_VICTIM;\n" - "+\t\t\tmem_cgroup_iter_break(root, iter);\n" - "+\t\t\tbreak;\n" - "+\t\t}\n" - "+\n" - "+\t\tif (score > oc->chosen_points) {\n" - "+\t\t\toc->chosen_points = score;\n" - "+\t\t\toc->chosen_memcg = iter;\n" - "+\t\t}\n" - "+\t}\n" - "+\n" - "+\tif (oc->chosen_memcg && oc->chosen_memcg != INFLIGHT_VICTIM)\n" - "+\t\tcss_get(&oc->chosen_memcg->css);\n" - "+\n" - "+\trcu_read_unlock();\n" - "+}\n" - "+\n" - "+bool mem_cgroup_select_oom_victim(struct oom_control *oc)\n" - "+{\n" - "+\tstruct mem_cgroup *root;\n" - "+\n" - "+\tif (mem_cgroup_disabled())\n" - "+\t\treturn false;\n" - "+\n" - "+\tif (!cgroup_subsys_on_dfl(memory_cgrp_subsys))\n" - "+\t\treturn false;\n" - "+\n" - "+\tif (oc->memcg)\n" - "+\t\troot = oc->memcg;\n" - "+\telse\n" - "+\t\troot = root_mem_cgroup;\n" - "+\n" - "+\tselect_victim_memcg(root, oc);\n" - "+\n" - "+\treturn oc->chosen_memcg;\n" - "+}\n" - "+\n" - " /*\n" - " * Reclaims as many pages from the given memcg as possible.\n" - " *\n" - "diff --git a/mm/oom_kill.c b/mm/oom_kill.c\n" - "index 0b9f36117989..5b670adb850c 100644\n" - "--- a/mm/oom_kill.c\n" - "+++ b/mm/oom_kill.c\n" - "@@ -309,7 +309,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)\n" - " \treturn CONSTRAINT_NONE;\n" - " }\n" - " \n" - "-static int oom_evaluate_task(struct task_struct *task, void *arg)\n" - "+int oom_evaluate_task(struct task_struct *task, void *arg)\n" - " {\n" - " \tstruct oom_control *oc = arg;\n" - " \tunsigned long points;\n" - "@@ -343,26 +343,26 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)\n" - " \t\tgoto next;\n" - " \n" - " \t/* Prefer thread group leaders for display purposes */\n" - "-\tif (points == oc->chosen_points && thread_group_leader(oc->chosen))\n" - "+\tif (points == oc->chosen_points && thread_group_leader(oc->chosen_task))\n" - " \t\tgoto next;\n" - " select:\n" - "-\tif (oc->chosen)\n" - "-\t\tput_task_struct(oc->chosen);\n" - "+\tif (oc->chosen_task)\n" - "+\t\tput_task_struct(oc->chosen_task);\n" - " \tget_task_struct(task);\n" - "-\toc->chosen = task;\n" - "+\toc->chosen_task = task;\n" - " \toc->chosen_points = points;\n" - " next:\n" - " \treturn 0;\n" - " abort:\n" - "-\tif (oc->chosen)\n" - "-\t\tput_task_struct(oc->chosen);\n" - "-\toc->chosen = (void *)-1UL;\n" - "+\tif (oc->chosen_task)\n" - "+\t\tput_task_struct(oc->chosen_task);\n" - "+\toc->chosen_task = INFLIGHT_VICTIM;\n" - " \treturn 1;\n" - " }\n" - " \n" - " /*\n" - " * Simple selection loop. We choose the process with the highest number of\n" - "- * 'points'. In case scan was aborted, oc->chosen is set to -1.\n" - "+ * 'points'. In case scan was aborted, oc->chosen_task is set to -1.\n" - " */\n" - " static void select_bad_process(struct oom_control *oc)\n" - " {\n" - "@@ -923,7 +923,7 @@ static void __oom_kill_process(struct task_struct *victim)\n" - " \n" - " static void oom_kill_process(struct oom_control *oc, const char *message)\n" - " {\n" - "-\tstruct task_struct *p = oc->chosen;\n" - "+\tstruct task_struct *p = oc->chosen_task;\n" - " \tunsigned int points = oc->chosen_points;\n" - " \tstruct task_struct *victim = p;\n" - " \tstruct task_struct *child;\n" - "@@ -984,6 +984,27 @@ static void oom_kill_process(struct oom_control *oc, const char *message)\n" - " \t__oom_kill_process(victim);\n" - " }\n" - " \n" - "+static bool oom_kill_memcg_victim(struct oom_control *oc)\n" - "+{\n" - "+\n" - "+\tif (oc->chosen_memcg == NULL || oc->chosen_memcg == INFLIGHT_VICTIM)\n" - "+\t\treturn oc->chosen_memcg;\n" - "+\n" - "+\t/* Kill a task in the chosen memcg with the biggest memory footprint */\n" - "+\toc->chosen_points = 0;\n" - "+\toc->chosen_task = NULL;\n" - "+\tmem_cgroup_scan_tasks(oc->chosen_memcg, oom_evaluate_task, oc);\n" - "+\n" - "+\tif (oc->chosen_task == NULL || oc->chosen_task == INFLIGHT_VICTIM)\n" - "+\t\tgoto out;\n" - "+\n" - "+\t__oom_kill_process(oc->chosen_task);\n" - "+\n" - "+out:\n" - "+\tmem_cgroup_put(oc->chosen_memcg);\n" - "+\treturn oc->chosen_task;\n" - "+}\n" - "+\n" - " /*\n" - " * Determines whether the kernel must panic because of the panic_on_oom sysctl.\n" - " */\n" - "@@ -1036,6 +1057,7 @@ bool out_of_memory(struct oom_control *oc)\n" - " {\n" - " \tunsigned long freed = 0;\n" - " \tenum oom_constraint constraint = CONSTRAINT_NONE;\n" - "+\tbool delay = false; /* if set, delay next allocation attempt */\n" - " \n" - " \tif (oom_killer_disabled)\n" - " \t\treturn false;\n" - "@@ -1080,27 +1102,39 @@ bool out_of_memory(struct oom_control *oc)\n" - " \t current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&\n" - " \t current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {\n" - " \t\tget_task_struct(current);\n" - "-\t\toc->chosen = current;\n" - "+\t\toc->chosen_task = current;\n" - " \t\toom_kill_process(oc, \"Out of memory (oom_kill_allocating_task)\");\n" - " \t\treturn true;\n" - " \t}\n" - " \n" - "+\tif (mem_cgroup_select_oom_victim(oc)) {\n" - "+\t\tif (oom_kill_memcg_victim(oc))\n" - "+\t\t delay = true;\n" - "+\n" - "+\t\tgoto out;\n" - "+\t}\n" - "+\n" - " \tselect_bad_process(oc);\n" - " \t/* Found nothing?!?! Either we hang forever, or we panic. */\n" - "-\tif (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {\n" - "+\tif (!oc->chosen_task && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {\n" - " \t\tdump_header(oc, NULL);\n" - " \t\tpanic(\"Out of memory and no killable processes...\\n\");\n" - " \t}\n" - "-\tif (oc->chosen && oc->chosen != (void *)-1UL) {\n" - "+\tif (oc->chosen_task && oc->chosen_task != INFLIGHT_VICTIM) {\n" - " \t\toom_kill_process(oc, !is_memcg_oom(oc) ? \"Out of memory\" :\n" - " \t\t\t\t \"Memory cgroup out of memory\");\n" - "-\t\t/*\n" - "-\t\t * Give the killed process a good chance to exit before trying\n" - "-\t\t * to allocate memory again.\n" - "-\t\t */\n" - "-\t\tschedule_timeout_killable(1);\n" - "+\t\tdelay = true;\n" - " \t}\n" - "-\treturn !!oc->chosen;\n" - "+\n" - "+out:\n" - "+\t/*\n" - "+\t * Give the killed process a good chance to exit before trying\n" - "+\t * to allocate memory again.\n" - "+\t */\n" - "+\tif (delay)\n" - "+\t\tschedule_timeout_killable(1);\n" - "+\n" - "+\treturn !!oc->chosen_task;\n" - " }\n" - " \n" - " /*\n" - "-- \n" - "2.13.6\n" - "\n" - "--\n" - "To unsubscribe, send a message with 'unsubscribe linux-mm' in\n" - "the body to majordomo@kvack.org. For more info on Linux MM,\n" - "see: http://www.linux-mm.org/ .\n" - "Don't email: <a href=mailto:\"dont@kvack.org\"> email@kvack.org </a>" + -------------------------------------------------------------------------------- -a7b18903c07d67f6c781e8766f8ee742491429112ed9b2fbd11e7caec98b458f +106fd69ef631df893ae16fad29b547b50875ffd9024f393cb90861d7df8123bb
diff --git a/a/1.txt b/N2/1.txt index 66261f6..8e1a567 100644 --- a/a/1.txt +++ b/N2/1.txt @@ -136,7 +136,7 @@ Also, I've closed the race, you've pointed on. Thanks! -------------------------------------------------------------------------------- -From 7f51d26be2d2a5b6e4840574f72beb15920c0993 Mon Sep 17 00:00:00 2001 +>From 7f51d26be2d2a5b6e4840574f72beb15920c0993 Mon Sep 17 00:00:00 2001 From: Roman Gushchin <guro@fb.com> Date: Thu, 25 May 2017 14:18:45 +0100 Subject: [v12 3/6] mm, oom: cgroup-aware OOM killer @@ -625,9 +625,3 @@ index 0b9f36117989..5b670adb850c 100644 /* -- 2.13.6 - --- -To unsubscribe, send a message with 'unsubscribe linux-mm' in -the body to majordomo@kvack.org. For more info on Linux MM, -see: http://www.linux-mm.org/ . -Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> diff --git a/a/content_digest b/N2/content_digest index ba7f882..7edbf91 100644 --- a/a/content_digest +++ b/N2/content_digest @@ -7,17 +7,17 @@ "Subject\0Re: [v11 3/6] mm, oom: cgroup-aware OOM killer\0" "Date\0Wed, 11 Oct 2017 17:10:24 +0100\0" "To\0David Rientjes <rientjes@google.com>\0" - "Cc\0linux-mm@kvack.org" + "Cc\0<linux-mm@kvack.org>" Michal Hocko <mhocko@kernel.org> Vladimir Davydov <vdavydov.dev@gmail.com> Johannes Weiner <hannes@cmpxchg.org> Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp> Andrew Morton <akpm@linux-foundation.org> Tejun Heo <tj@kernel.org> - kernel-team@fb.com - cgroups@vger.kernel.org - linux-doc@vger.kernel.org - " linux-kernel@vger.kernel.org\0" + <kernel-team@fb.com> + <cgroups@vger.kernel.org> + <linux-doc@vger.kernel.org> + " <linux-kernel@vger.kernel.org>\0" "\00:1\0" "b\0" "On Tue, Oct 10, 2017 at 02:13:00PM -0700, David Rientjes wrote:\n" @@ -158,7 +158,7 @@ "Thanks!\n" "\n" "--------------------------------------------------------------------------------\n" - "From 7f51d26be2d2a5b6e4840574f72beb15920c0993 Mon Sep 17 00:00:00 2001\n" + ">From 7f51d26be2d2a5b6e4840574f72beb15920c0993 Mon Sep 17 00:00:00 2001\n" "From: Roman Gushchin <guro@fb.com>\n" "Date: Thu, 25 May 2017 14:18:45 +0100\n" "Subject: [v12 3/6] mm, oom: cgroup-aware OOM killer\n" @@ -646,12 +646,6 @@ " \n" " /*\n" "-- \n" - "2.13.6\n" - "\n" - "--\n" - "To unsubscribe, send a message with 'unsubscribe linux-mm' in\n" - "the body to majordomo@kvack.org. For more info on Linux MM,\n" - "see: http://www.linux-mm.org/ .\n" - "Don't email: <a href=mailto:\"dont@kvack.org\"> email@kvack.org </a>" + 2.13.6 -a7b18903c07d67f6c781e8766f8ee742491429112ed9b2fbd11e7caec98b458f +da42d038148a68e63bf775985cddccf828166f3dc994e551d4ee880ef491255d
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.