All of lore.kernel.org
 help / color / mirror / Atom feed
diff for duplicates of <20121202194544.GA20549@gmail.com>

diff --git a/a/1.txt b/N1/1.txt
index 35bb9d5..f96b429 100644
--- a/a/1.txt
+++ b/N1/1.txt
@@ -7,3 +7,163 @@ Thanks,
 	Ingo
 
 --------------------------->
+>From 92429d012ddc551626d7e414469e080d4b41a0c9 Mon Sep 17 00:00:00 2001
+From: Ingo Molnar <mingo@kernel.org>
+Date: Sun, 2 Dec 2012 15:29:42 +0100
+Subject: [PATCH] sched: Add RSS filter to NUMA-balancing
+
+NUMA-balancing, combined with NUMA-affine memory migration,
+is a relatively long-term process (compared to the typical
+time scale of scheduling) that takes time to establish and
+converge - on the time scale of several seconds or more.
+
+Small tasks are usually short-lived and don't have much of a NUMA placement
+cost to begin with, so don't NUMA-balance them. A task needs
+to execute long enough and needs to establish a large enough
+user-space memory image to benefit from more intelligent
+NUMA balancing.
+
+We already have a CPU time limit before tasks are affected
+by NUMA balancing - this change adds the memory equivalent:
+by introducing an RSS limit of 128 MBs.
+
+In practice this excludes most short-lived tasks - the limit
+is in fact probably a bit on the conservative side - but with
+intrusive kernel features conservative is good.
+
+The /proc/sys/kernel/sched_numa_rss_threshold_mb value can be
+tuned at runtime - setting it to 0 turns off this filter.
+
+To implement the RSS filter first factor out a clean
+task_numa_candidate() function and comment on the various
+reasons of why we wouldn't want to begin to NUMA-balance
+a particular task (yet). Then add the RSS check.
+
+Note, we are using the p->mm->hiwater_rss value instead of the
+current RSS size. We do this to avoid tasks flipping in and
+out of the limit, if their RSS fluctuates around the limit.
+The RSS high-water value increases monotonically in the
+life-time of a task, so there's a single, precise transition
+to NUMA-balancing as the limit is crossed.
+
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Hugh Dickins <hughd@google.com>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+---
+ include/linux/sched.h |  1 +
+ kernel/sched/fair.c   | 54 +++++++++++++++++++++++++++++++++++++++++++++------
+ kernel/sysctl.c       |  7 +++++++
+ 3 files changed, 56 insertions(+), 6 deletions(-)
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index ce834e7..6a29dfd 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -2059,6 +2059,7 @@ extern unsigned int sysctl_sched_numa_scan_period_min;
+ extern unsigned int sysctl_sched_numa_scan_period_max;
+ extern unsigned int sysctl_sched_numa_scan_size_min;
+ extern unsigned int sysctl_sched_numa_scan_size_max;
++extern unsigned int sysctl_sched_numa_rss_threshold;
+ extern unsigned int sysctl_sched_numa_settle_count;
+ 
+ #ifdef CONFIG_SCHED_DEBUG
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 9667191..21c10f7 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -812,6 +812,8 @@ unsigned int sysctl_sched_numa_scan_period_max	__read_mostly = 100*16;	/* ms */
+ unsigned int sysctl_sched_numa_scan_size_min	__read_mostly =  32;	/* MB */
+ unsigned int sysctl_sched_numa_scan_size_max	__read_mostly = 512;	/* MB */
+ 
++unsigned int sysctl_sched_numa_rss_threshold	__read_mostly = 128;	/* MB */
++
+ /*
+  * Wait for the 2-sample stuff to settle before migrating again
+  */
+@@ -2486,17 +2488,57 @@ static void task_tick_numa_placement(struct rq *rq, struct task_struct *curr)
+ 	task_work_add(curr, work, true);
+ }
+ 
+-static void task_tick_numa(struct rq *rq, struct task_struct *curr)
++/*
++ * Is this task worth NUMA-scanning and NUMA-balancing?
++ */
++static bool task_numa_candidate(struct task_struct *p)
+ {
++	unsigned long rss_high;
++	unsigned long rss_limit;
++
++	/* kthreads don't have any user-space memory to scan: */
++	if (!p->mm || !p->numa_faults)
++		return false;
++
+ 	/*
+-	 * We don't care about NUMA placement if we don't have memory
+-	 * or are exiting:
++	 * Exiting tasks won't touch any user-space memory in the future,
++	 * and this also avoids a race with work_exit():
+ 	 */
+-	if (!curr->mm || (curr->flags & PF_EXITING) || !curr->numa_faults)
+-		return;
++	if (p->flags & PF_EXITING)
++		return false;
+ 
+ 	/* Don't disturb hard-bound tasks: */
+-	if (sched_feat(NUMA_EXCLUDE_AFFINE) && (curr->nr_cpus_allowed != num_online_cpus())) {
++	if (sched_feat(NUMA_EXCLUDE_AFFINE)) {
++		if (p->nr_cpus_allowed != num_online_cpus())
++			return false;
++	}
++
++	/*
++	 * NUMA-balancing, combined with NUMA memory migration,
++	 * is a long-term process that takes time to establish
++	 * and converge, on the time scale of several seconds
++	 * or more.
++	 *
++	 * Small tasks are usually short-lived and don't have much
++	 * of a NUMA placement cost to begin with, so don't
++	 * NUMA-balance them:
++	 */
++	rss_limit = sysctl_sched_numa_rss_threshold;
++	rss_limit <<= 20 - PAGE_SHIFT; /* MB to pages */
++
++	rss_high = get_mm_rss(p->mm);
++	rss_high = max(p->mm->hiwater_rss, rss_high);
++
++	if (rss_high < rss_limit)
++		return false;
++
++	return true;
++}
++
++static void task_tick_numa(struct rq *rq, struct task_struct *curr)
++{
++	/* Cheap checks first: */
++	if (!task_numa_candidate(curr)) {
+ 		if (curr->numa_shared >= 0)
+ 			curr->numa_shared = -1;
+ 		return;
+diff --git a/kernel/sysctl.c b/kernel/sysctl.c
+index b6ddfae..75ab895 100644
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -388,6 +388,13 @@ static struct ctl_table kern_table[] = {
+ 		.proc_handler	= proc_dointvec,
+ 	},
+ 	{
++		.procname	= "sched_numa_rss_threshold_mb",
++		.data		= &sysctl_sched_numa_rss_threshold,
++		.maxlen		= sizeof(unsigned int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++	{
+ 		.procname	= "sched_numa_settle_count",
+ 		.data		= &sysctl_sched_numa_settle_count,
+ 		.maxlen		= sizeof(unsigned int),
diff --git a/a/content_digest b/N1/content_digest
index 56f9aee..fd419a0 100644
--- a/a/content_digest
+++ b/N1/content_digest
@@ -27,6 +27,166 @@
  "\n"
  "\tIngo\n"
  "\n"
- --------------------------->
+ "--------------------------->\n"
+ ">From 92429d012ddc551626d7e414469e080d4b41a0c9 Mon Sep 17 00:00:00 2001\n"
+ "From: Ingo Molnar <mingo@kernel.org>\n"
+ "Date: Sun, 2 Dec 2012 15:29:42 +0100\n"
+ "Subject: [PATCH] sched: Add RSS filter to NUMA-balancing\n"
+ "\n"
+ "NUMA-balancing, combined with NUMA-affine memory migration,\n"
+ "is a relatively long-term process (compared to the typical\n"
+ "time scale of scheduling) that takes time to establish and\n"
+ "converge - on the time scale of of several seconds or more.\n"
+ "\n"
+ "Small, short-lived and don't have much of a NUMA placement\n"
+ "cost to begin with, so don't NUMA-balance them. A task needs\n"
+ "to execute long enough and needs to establish a large enough\n"
+ "user-space memory image to benefit from more intelligent\n"
+ "NUMA balancing.\n"
+ "\n"
+ "We already have a CPU time limit before tasks are affected\n"
+ "by NUMA balancing - this change adds the memory equivalent:\n"
+ "by introducing an RSS limit of 128 MBs.\n"
+ "\n"
+ "In practice this excludes most short-lived tasks - the limit\n"
+ "is in fact probably a bit on the conservative side - but with\n"
+ "intrusive kernel features conservative is good.\n"
+ "\n"
+ "The /proc/sys/kernel/sched_numa_rss_threshold_mb value can be\n"
+ "tuned runtime - setting it to 0 turns off this filter.\n"
+ "\n"
+ "To implement the RSS filter first factor out a clean\n"
+ "task_numa_candidate() function and comment on the various\n"
+ "reasons of why we wouldn't want to begin to NUMA-balance\n"
+ "a particular task (yet). Then add the RSS check.\n"
+ "\n"
+ "Note, we are using the p->hiwater_rss value instead of the\n"
+ "current RSS size. We do this to avoid tasks flipping in and\n"
+ "out of the limit, if their RSS fluctuates around the limit.\n"
+ "The RSS high-water value increases monotonically in the\n"
+ "life-time of a task, so there's a single, precise transition\n"
+ "to NUMA-balancing as the limit is crossed.\n"
+ "\n"
+ "Cc: Linus Torvalds <torvalds@linux-foundation.org>\n"
+ "Cc: Andrew Morton <akpm@linux-foundation.org>\n"
+ "Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>\n"
+ "Cc: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "Cc: Rik van Riel <riel@redhat.com>\n"
+ "Cc: Mel Gorman <mgorman@suse.de>\n"
+ "Cc: Hugh Dickins <hughd@google.com>\n"
+ "Signed-off-by: Ingo Molnar <mingo@kernel.org>\n"
+ "---\n"
+ " include/linux/sched.h |  1 +\n"
+ " kernel/sched/fair.c   | 54 +++++++++++++++++++++++++++++++++++++++++++++------\n"
+ " kernel/sysctl.c       |  7 +++++++\n"
+ " 3 files changed, 56 insertions(+), 6 deletions(-)\n"
+ "\n"
+ "diff --git a/include/linux/sched.h b/include/linux/sched.h\n"
+ "index ce834e7..6a29dfd 100644\n"
+ "--- a/include/linux/sched.h\n"
+ "+++ b/include/linux/sched.h\n"
+ "@@ -2059,6 +2059,7 @@ extern unsigned int sysctl_sched_numa_scan_period_min;\n"
+ " extern unsigned int sysctl_sched_numa_scan_period_max;\n"
+ " extern unsigned int sysctl_sched_numa_scan_size_min;\n"
+ " extern unsigned int sysctl_sched_numa_scan_size_max;\n"
+ "+extern unsigned int sysctl_sched_numa_rss_threshold;\n"
+ " extern unsigned int sysctl_sched_numa_settle_count;\n"
+ " \n"
+ " #ifdef CONFIG_SCHED_DEBUG\n"
+ "diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c\n"
+ "index 9667191..21c10f7 100644\n"
+ "--- a/kernel/sched/fair.c\n"
+ "+++ b/kernel/sched/fair.c\n"
+ "@@ -812,6 +812,8 @@ unsigned int sysctl_sched_numa_scan_period_max\t__read_mostly = 100*16;\t/* ms */\n"
+ " unsigned int sysctl_sched_numa_scan_size_min\t__read_mostly =  32;\t/* MB */\n"
+ " unsigned int sysctl_sched_numa_scan_size_max\t__read_mostly = 512;\t/* MB */\n"
+ " \n"
+ "+unsigned int sysctl_sched_numa_rss_threshold\t__read_mostly = 128;\t/* MB */\n"
+ "+\n"
+ " /*\n"
+ "  * Wait for the 2-sample stuff to settle before migrating again\n"
+ "  */\n"
+ "@@ -2486,17 +2488,57 @@ static void task_tick_numa_placement(struct rq *rq, struct task_struct *curr)\n"
+ " \ttask_work_add(curr, work, true);\n"
+ " }\n"
+ " \n"
+ "-static void task_tick_numa(struct rq *rq, struct task_struct *curr)\n"
+ "+/*\n"
+ "+ * Is this task worth NUMA-scanning and NUMA-balancing?\n"
+ "+ */\n"
+ "+static bool task_numa_candidate(struct task_struct *p)\n"
+ " {\n"
+ "+\tunsigned long rss_high;\n"
+ "+\tunsigned long rss_limit;\n"
+ "+\n"
+ "+\t/* kthreads don't have any user-space memory to scan: */\n"
+ "+\tif (!p->mm || !p->numa_faults)\n"
+ "+\t\treturn false;\n"
+ "+\n"
+ " \t/*\n"
+ "-\t * We don't care about NUMA placement if we don't have memory\n"
+ "-\t * or are exiting:\n"
+ "+\t * Exiting tasks won't touch any user-space memory in the future,\n"
+ "+\t * and this also avoids a race with work_exit():\n"
+ " \t */\n"
+ "-\tif (!curr->mm || (curr->flags & PF_EXITING) || !curr->numa_faults)\n"
+ "-\t\treturn;\n"
+ "+\tif (p->flags & PF_EXITING)\n"
+ "+\t\treturn false;\n"
+ " \n"
+ " \t/* Don't disturb hard-bound tasks: */\n"
+ "-\tif (sched_feat(NUMA_EXCLUDE_AFFINE) && (curr->nr_cpus_allowed != num_online_cpus())) {\n"
+ "+\tif (sched_feat(NUMA_EXCLUDE_AFFINE)) {\n"
+ "+\t\tif (p->nr_cpus_allowed != num_online_cpus())\n"
+ "+\t\t\treturn false;\n"
+ "+\t}\n"
+ "+\n"
+ "+\t/*\n"
+ "+\t * NUMA-balancing, combined with NUMA memory migration,\n"
+ "+\t * is a long-term process that takes time to establish\n"
+ "+\t * and converge, on the time scale of of several seconds\n"
+ "+\t * or more.\n"
+ "+\t *\n"
+ "+\t * Small tasks are usually short-lived and don't have much\n"
+ "+\t * of a NUMA placement cost to begin with, so don't\n"
+ "+\t * NUMA-balance them:\n"
+ "+\t */\n"
+ "+\trss_limit = sysctl_sched_numa_rss_threshold;\n"
+ "+\trss_limit <<= 20 - PAGE_SHIFT; /* MB to pages */\n"
+ "+\n"
+ "+\trss_high = get_mm_rss(p->mm);\n"
+ "+\trss_high = max(p->mm->hiwater_rss, rss_high);\n"
+ "+\n"
+ "+\tif (rss_high < rss_limit)\n"
+ "+\t\treturn false;\n"
+ "+\n"
+ "+\treturn true;\n"
+ "+}\n"
+ "+\n"
+ "+static void task_tick_numa(struct rq *rq, struct task_struct *curr)\n"
+ "+{\n"
+ "+\t/* Cheap checks first: */\n"
+ "+\tif (!task_numa_candidate(curr)) {\n"
+ " \t\tif (curr->numa_shared >= 0)\n"
+ " \t\t\tcurr->numa_shared = -1;\n"
+ " \t\treturn;\n"
+ "diff --git a/kernel/sysctl.c b/kernel/sysctl.c\n"
+ "index b6ddfae..75ab895 100644\n"
+ "--- a/kernel/sysctl.c\n"
+ "+++ b/kernel/sysctl.c\n"
+ "@@ -388,6 +388,13 @@ static struct ctl_table kern_table[] = {\n"
+ " \t\t.proc_handler\t= proc_dointvec,\n"
+ " \t},\n"
+ " \t{\n"
+ "+\t\t.procname\t= \"sched_numa_rss_threshold_mb\",\n"
+ "+\t\t.data\t\t= &sysctl_sched_numa_rss_threshold,\n"
+ "+\t\t.maxlen\t\t= sizeof(unsigned int),\n"
+ "+\t\t.mode\t\t= 0644,\n"
+ "+\t\t.proc_handler\t= proc_dointvec,\n"
+ "+\t},\n"
+ "+\t{\n"
+ " \t\t.procname\t= \"sched_numa_settle_count\",\n"
+ " \t\t.data\t\t= &sysctl_sched_numa_settle_count,\n"
+ " \t\t.maxlen\t\t= sizeof(unsigned int),"
 
-0c4025d8b30ec529292d801c62c92c4182190448f3fe9244af2d125b1e3f46c6
+3071c62438470bbf4d8e41eafab3e861ab3d2809f80799090b97d78a686c6f3f

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.