diff for duplicates of <20121202194544.GA20549@gmail.com> diff --git a/a/1.txt b/N1/1.txt index 35bb9d5..f96b429 100644 --- a/a/1.txt +++ b/N1/1.txt @@ -7,3 +7,163 @@ Thanks, Ingo ---------------------------> +>From 92429d012ddc551626d7e414469e080d4b41a0c9 Mon Sep 17 00:00:00 2001 +From: Ingo Molnar <mingo@kernel.org> +Date: Sun, 2 Dec 2012 15:29:42 +0100 +Subject: [PATCH] sched: Add RSS filter to NUMA-balancing + +NUMA-balancing, combined with NUMA-affine memory migration, +is a relatively long-term process (compared to the typical +time scale of scheduling) that takes time to establish and +converge - on the time scale of of several seconds or more. + +Small, short-lived and don't have much of a NUMA placement +cost to begin with, so don't NUMA-balance them. A task needs +to execute long enough and needs to establish a large enough +user-space memory image to benefit from more intelligent +NUMA balancing. + +We already have a CPU time limit before tasks are affected +by NUMA balancing - this change adds the memory equivalent: +by introducing an RSS limit of 128 MBs. + +In practice this excludes most short-lived tasks - the limit +is in fact probably a bit on the conservative side - but with +intrusive kernel features conservative is good. + +The /proc/sys/kernel/sched_numa_rss_threshold_mb value can be +tuned runtime - setting it to 0 turns off this filter. + +To implement the RSS filter first factor out a clean +task_numa_candidate() function and comment on the various +reasons of why we wouldn't want to begin to NUMA-balance +a particular task (yet). Then add the RSS check. + +Note, we are using the p->hiwater_rss value instead of the +current RSS size. We do this to avoid tasks flipping in and +out of the limit, if their RSS fluctuates around the limit. +The RSS high-water value increases monotonically in the +life-time of a task, so there's a single, precise transition +to NUMA-balancing as the limit is crossed. 
+ +Cc: Linus Torvalds <torvalds@linux-foundation.org> +Cc: Andrew Morton <akpm@linux-foundation.org> +Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> +Cc: Andrea Arcangeli <aarcange@redhat.com> +Cc: Rik van Riel <riel@redhat.com> +Cc: Mel Gorman <mgorman@suse.de> +Cc: Hugh Dickins <hughd@google.com> +Signed-off-by: Ingo Molnar <mingo@kernel.org> +--- + include/linux/sched.h | 1 + + kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------ + kernel/sysctl.c | 7 +++++++ + 3 files changed, 56 insertions(+), 6 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index ce834e7..6a29dfd 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2059,6 +2059,7 @@ extern unsigned int sysctl_sched_numa_scan_period_min; + extern unsigned int sysctl_sched_numa_scan_period_max; + extern unsigned int sysctl_sched_numa_scan_size_min; + extern unsigned int sysctl_sched_numa_scan_size_max; ++extern unsigned int sysctl_sched_numa_rss_threshold; + extern unsigned int sysctl_sched_numa_settle_count; + + #ifdef CONFIG_SCHED_DEBUG +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 9667191..21c10f7 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -812,6 +812,8 @@ unsigned int sysctl_sched_numa_scan_period_max __read_mostly = 100*16; /* ms */ + unsigned int sysctl_sched_numa_scan_size_min __read_mostly = 32; /* MB */ + unsigned int sysctl_sched_numa_scan_size_max __read_mostly = 512; /* MB */ + ++unsigned int sysctl_sched_numa_rss_threshold __read_mostly = 128; /* MB */ ++ + /* + * Wait for the 2-sample stuff to settle before migrating again + */ +@@ -2486,17 +2488,57 @@ static void task_tick_numa_placement(struct rq *rq, struct task_struct *curr) + task_work_add(curr, work, true); + } + +-static void task_tick_numa(struct rq *rq, struct task_struct *curr) ++/* ++ * Is this task worth NUMA-scanning and NUMA-balancing? 
++ */ ++static bool task_numa_candidate(struct task_struct *p) + { ++ unsigned long rss_high; ++ unsigned long rss_limit; ++ ++ /* kthreads don't have any user-space memory to scan: */ ++ if (!p->mm || !p->numa_faults) ++ return false; ++ + /* +- * We don't care about NUMA placement if we don't have memory +- * or are exiting: ++ * Exiting tasks won't touch any user-space memory in the future, ++ * and this also avoids a race with work_exit(): + */ +- if (!curr->mm || (curr->flags & PF_EXITING) || !curr->numa_faults) +- return; ++ if (p->flags & PF_EXITING) ++ return false; + + /* Don't disturb hard-bound tasks: */ +- if (sched_feat(NUMA_EXCLUDE_AFFINE) && (curr->nr_cpus_allowed != num_online_cpus())) { ++ if (sched_feat(NUMA_EXCLUDE_AFFINE)) { ++ if (p->nr_cpus_allowed != num_online_cpus()) ++ return false; ++ } ++ ++ /* ++ * NUMA-balancing, combined with NUMA memory migration, ++ * is a long-term process that takes time to establish ++ * and converge, on the time scale of of several seconds ++ * or more. 
++ * ++ * Small tasks are usually short-lived and don't have much ++ * of a NUMA placement cost to begin with, so don't ++ * NUMA-balance them: ++ */ ++ rss_limit = sysctl_sched_numa_rss_threshold; ++ rss_limit <<= 20 - PAGE_SHIFT; /* MB to pages */ ++ ++ rss_high = get_mm_rss(p->mm); ++ rss_high = max(p->mm->hiwater_rss, rss_high); ++ ++ if (rss_high < rss_limit) ++ return false; ++ ++ return true; ++} ++ ++static void task_tick_numa(struct rq *rq, struct task_struct *curr) ++{ ++ /* Cheap checks first: */ ++ if (!task_numa_candidate(curr)) { + if (curr->numa_shared >= 0) + curr->numa_shared = -1; + return; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6ddfae..75ab895 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -388,6 +388,13 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + { ++ .procname = "sched_numa_rss_threshold_mb", ++ .data = &sysctl_sched_numa_rss_threshold, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { + .procname = "sched_numa_settle_count", + .data = &sysctl_sched_numa_settle_count, + .maxlen = sizeof(unsigned int), diff --git a/a/content_digest b/N1/content_digest index 56f9aee..fd419a0 100644 --- a/a/content_digest +++ b/N1/content_digest @@ -27,6 +27,166 @@ "\n" "\tIngo\n" "\n" - ---------------------------> + "--------------------------->\n" + ">From 92429d012ddc551626d7e414469e080d4b41a0c9 Mon Sep 17 00:00:00 2001\n" + "From: Ingo Molnar <mingo@kernel.org>\n" + "Date: Sun, 2 Dec 2012 15:29:42 +0100\n" + "Subject: [PATCH] sched: Add RSS filter to NUMA-balancing\n" + "\n" + "NUMA-balancing, combined with NUMA-affine memory migration,\n" + "is a relatively long-term process (compared to the typical\n" + "time scale of scheduling) that takes time to establish and\n" + "converge - on the time scale of of several seconds or more.\n" + "\n" + "Small, short-lived and don't have much of a NUMA placement\n" + "cost to begin with, so don't NUMA-balance 
them. A task needs\n" + "to execute long enough and needs to establish a large enough\n" + "user-space memory image to benefit from more intelligent\n" + "NUMA balancing.\n" + "\n" + "We already have a CPU time limit before tasks are affected\n" + "by NUMA balancing - this change adds the memory equivalent:\n" + "by introducing an RSS limit of 128 MBs.\n" + "\n" + "In practice this excludes most short-lived tasks - the limit\n" + "is in fact probably a bit on the conservative side - but with\n" + "intrusive kernel features conservative is good.\n" + "\n" + "The /proc/sys/kernel/sched_numa_rss_threshold_mb value can be\n" + "tuned runtime - setting it to 0 turns off this filter.\n" + "\n" + "To implement the RSS filter first factor out a clean\n" + "task_numa_candidate() function and comment on the various\n" + "reasons of why we wouldn't want to begin to NUMA-balance\n" + "a particular task (yet). Then add the RSS check.\n" + "\n" + "Note, we are using the p->hiwater_rss value instead of the\n" + "current RSS size. 
We do this to avoid tasks flipping in and\n" + "out of the limit, if their RSS fluctuates around the limit.\n" + "The RSS high-water value increases monotonically in the\n" + "life-time of a task, so there's a single, precise transition\n" + "to NUMA-balancing as the limit is crossed.\n" + "\n" + "Cc: Linus Torvalds <torvalds@linux-foundation.org>\n" + "Cc: Andrew Morton <akpm@linux-foundation.org>\n" + "Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>\n" + "Cc: Andrea Arcangeli <aarcange@redhat.com>\n" + "Cc: Rik van Riel <riel@redhat.com>\n" + "Cc: Mel Gorman <mgorman@suse.de>\n" + "Cc: Hugh Dickins <hughd@google.com>\n" + "Signed-off-by: Ingo Molnar <mingo@kernel.org>\n" + "---\n" + " include/linux/sched.h | 1 +\n" + " kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++++++++++++------\n" + " kernel/sysctl.c | 7 +++++++\n" + " 3 files changed, 56 insertions(+), 6 deletions(-)\n" + "\n" + "diff --git a/include/linux/sched.h b/include/linux/sched.h\n" + "index ce834e7..6a29dfd 100644\n" + "--- a/include/linux/sched.h\n" + "+++ b/include/linux/sched.h\n" + "@@ -2059,6 +2059,7 @@ extern unsigned int sysctl_sched_numa_scan_period_min;\n" + " extern unsigned int sysctl_sched_numa_scan_period_max;\n" + " extern unsigned int sysctl_sched_numa_scan_size_min;\n" + " extern unsigned int sysctl_sched_numa_scan_size_max;\n" + "+extern unsigned int sysctl_sched_numa_rss_threshold;\n" + " extern unsigned int sysctl_sched_numa_settle_count;\n" + " \n" + " #ifdef CONFIG_SCHED_DEBUG\n" + "diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c\n" + "index 9667191..21c10f7 100644\n" + "--- a/kernel/sched/fair.c\n" + "+++ b/kernel/sched/fair.c\n" + "@@ -812,6 +812,8 @@ unsigned int sysctl_sched_numa_scan_period_max\t__read_mostly = 100*16;\t/* ms */\n" + " unsigned int sysctl_sched_numa_scan_size_min\t__read_mostly = 32;\t/* MB */\n" + " unsigned int sysctl_sched_numa_scan_size_max\t__read_mostly = 512;\t/* MB */\n" + " \n" + "+unsigned int 
sysctl_sched_numa_rss_threshold\t__read_mostly = 128;\t/* MB */\n" + "+\n" + " /*\n" + " * Wait for the 2-sample stuff to settle before migrating again\n" + " */\n" + "@@ -2486,17 +2488,57 @@ static void task_tick_numa_placement(struct rq *rq, struct task_struct *curr)\n" + " \ttask_work_add(curr, work, true);\n" + " }\n" + " \n" + "-static void task_tick_numa(struct rq *rq, struct task_struct *curr)\n" + "+/*\n" + "+ * Is this task worth NUMA-scanning and NUMA-balancing?\n" + "+ */\n" + "+static bool task_numa_candidate(struct task_struct *p)\n" + " {\n" + "+\tunsigned long rss_high;\n" + "+\tunsigned long rss_limit;\n" + "+\n" + "+\t/* kthreads don't have any user-space memory to scan: */\n" + "+\tif (!p->mm || !p->numa_faults)\n" + "+\t\treturn false;\n" + "+\n" + " \t/*\n" + "-\t * We don't care about NUMA placement if we don't have memory\n" + "-\t * or are exiting:\n" + "+\t * Exiting tasks won't touch any user-space memory in the future,\n" + "+\t * and this also avoids a race with work_exit():\n" + " \t */\n" + "-\tif (!curr->mm || (curr->flags & PF_EXITING) || !curr->numa_faults)\n" + "-\t\treturn;\n" + "+\tif (p->flags & PF_EXITING)\n" + "+\t\treturn false;\n" + " \n" + " \t/* Don't disturb hard-bound tasks: */\n" + "-\tif (sched_feat(NUMA_EXCLUDE_AFFINE) && (curr->nr_cpus_allowed != num_online_cpus())) {\n" + "+\tif (sched_feat(NUMA_EXCLUDE_AFFINE)) {\n" + "+\t\tif (p->nr_cpus_allowed != num_online_cpus())\n" + "+\t\t\treturn false;\n" + "+\t}\n" + "+\n" + "+\t/*\n" + "+\t * NUMA-balancing, combined with NUMA memory migration,\n" + "+\t * is a long-term process that takes time to establish\n" + "+\t * and converge, on the time scale of of several seconds\n" + "+\t * or more.\n" + "+\t *\n" + "+\t * Small tasks are usually short-lived and don't have much\n" + "+\t * of a NUMA placement cost to begin with, so don't\n" + "+\t * NUMA-balance them:\n" + "+\t */\n" + "+\trss_limit = sysctl_sched_numa_rss_threshold;\n" + "+\trss_limit <<= 20 - PAGE_SHIFT; /* MB 
to pages */\n" + "+\n" + "+\trss_high = get_mm_rss(p->mm);\n" + "+\trss_high = max(p->mm->hiwater_rss, rss_high);\n" + "+\n" + "+\tif (rss_high < rss_limit)\n" + "+\t\treturn false;\n" + "+\n" + "+\treturn true;\n" + "+}\n" + "+\n" + "+static void task_tick_numa(struct rq *rq, struct task_struct *curr)\n" + "+{\n" + "+\t/* Cheap checks first: */\n" + "+\tif (!task_numa_candidate(curr)) {\n" + " \t\tif (curr->numa_shared >= 0)\n" + " \t\t\tcurr->numa_shared = -1;\n" + " \t\treturn;\n" + "diff --git a/kernel/sysctl.c b/kernel/sysctl.c\n" + "index b6ddfae..75ab895 100644\n" + "--- a/kernel/sysctl.c\n" + "+++ b/kernel/sysctl.c\n" + "@@ -388,6 +388,13 @@ static struct ctl_table kern_table[] = {\n" + " \t\t.proc_handler\t= proc_dointvec,\n" + " \t},\n" + " \t{\n" + "+\t\t.procname\t= \"sched_numa_rss_threshold_mb\",\n" + "+\t\t.data\t\t= &sysctl_sched_numa_rss_threshold,\n" + "+\t\t.maxlen\t\t= sizeof(unsigned int),\n" + "+\t\t.mode\t\t= 0644,\n" + "+\t\t.proc_handler\t= proc_dointvec,\n" + "+\t},\n" + "+\t{\n" + " \t\t.procname\t= \"sched_numa_settle_count\",\n" + " \t\t.data\t\t= &sysctl_sched_numa_settle_count,\n" + " \t\t.maxlen\t\t= sizeof(unsigned int)," -0c4025d8b30ec529292d801c62c92c4182190448f3fe9244af2d125b1e3f46c6 +3071c62438470bbf4d8e41eafab3e861ab3d2809f80799090b97d78a686c6f3f
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.