[PATCH 16/19] sched, numa: NUMA home-node selection code

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: mingo@kernel.org, riel@redhat.com, oleg@redhat.com,
	pjt@google.com, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, tglx@linutronix.de,
	Lee.Schermerhorn@hp.com
Cc: linux-kernel@vger.kernel.org, Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH 16/19] sched, numa: NUMA home-node selection code
Date: Tue, 31 Jul 2012 21:12:20 +0200	[thread overview]
Message-ID: <20120731192809.308468547@chello.nl> (raw)
In-Reply-To: 20120731191204.540691987@chello.nl

[-- Attachment #1: numa-1.patch --]
[-- Type: text/plain, Size: 12429 bytes --]

Now that we have infrastructure in place to migrate pages back to
their home-node, and migrate memory towards the home-node, we need to
set the home-node.

Instead of creating a seconday control loop, fully rely on the
existing load-balancer to do the right thing. The home-node selection
logic will simply pick the node the task has been found to run on
for two consequtive samples (see task_tick_numa).

This means NUMA placement is directly related to regular placement.
The home-node logic in the load-balancer tries to keep a task on the
home-node wheras the fairness and work-conserving constraints will try
and move it away.

The balance between these two 'forces' is what will result in the NUMA
placement.

Cc: Rik van Riel <riel@redhat.com>
Cc: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/init_task.h |    3 
 include/linux/mm_types.h  |    3 
 include/linux/sched.h     |   19 +++--
 kernel/sched/core.c       |   18 ++++-
 kernel/sched/fair.c       |  163 ++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/features.h   |    1 
 kernel/sched/sched.h      |   33 ++++++---
 kernel/sysctl.c           |   13 +++
 8 files changed, 227 insertions(+), 26 deletions(-)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -145,7 +145,8 @@ extern struct task_group root_task_group
 
 #ifdef CONFIG_NUMA
 # define INIT_TASK_NUMA(tsk)						\
-	.node = -1,
+	.node = -1,							\
+	.node_last = -1,
 #else
 # define INIT_TASK_NUMA(tsk)
 #endif
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -388,6 +388,9 @@ struct mm_struct {
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	struct cpumask cpumask_allocation;
 #endif
+#ifdef CONFIG_NUMA
+	unsigned long numa_next_scan;
+#endif
 	struct uprobes_state uprobes_state;
 };
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -62,6 +62,7 @@ struct sched_param {
 #include <linux/errno.h>
 #include <linux/nodemask.h>
 #include <linux/mm_types.h>
+#include <linux/task_work.h>
 
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -1519,8 +1520,14 @@ struct task_struct {
 	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
 	short il_next;
 	short pref_node_fork;
-	int node;
-#endif
+
+	int node;			/* task home node   */
+	int node_last;			/* home node filter */
+#ifdef CONFIG_SMP
+	u64 node_stamp;			/* migration stamp  */
+	unsigned long numa_contrib;
+#endif /* CONFIG_SMP  */
+#endif /* CONFIG_NUMA */
 	struct rcu_head rcu;
 
 	/*
@@ -2029,22 +2036,22 @@ extern unsigned int sysctl_sched_nr_migr
 extern unsigned int sysctl_sched_time_avg;
 extern unsigned int sysctl_timer_migration;
 extern unsigned int sysctl_sched_shares_window;
+extern unsigned int sysctl_sched_numa_task_period;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
 		loff_t *ppos);
-#endif
-#ifdef CONFIG_SCHED_DEBUG
+
 static inline unsigned int get_sysctl_timer_migration(void)
 {
 	return sysctl_timer_migration;
 }
-#else
+#else /* CONFIG_SCHED_DEBUG */
 static inline unsigned int get_sysctl_timer_migration(void)
 {
 	return 1;
 }
-#endif
+#endif /* CONFIG_SCHED_DEBUG */
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1722,6 +1722,17 @@ static void __sched_fork(struct task_str
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA
+	if (p->mm && atomic_read(&p->mm->mm_users) == 1)
+		p->mm->numa_next_scan = jiffies;
+
+	p->node = -1;
+	p->node_last = -1;
+#ifdef CONFIG_SMP
+	p->node_stamp = 0ULL;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_NUMA */
 }
 
 /*
@@ -6558,9 +6569,9 @@ static struct sched_domain_topology_leve
  * Requeues a task ensuring its on the right load-balance list so
  * that it might get migrated to its new home.
  *
- * Note that we cannot actively migrate ourselves since our callers
- * can be from atomic context. We rely on the regular load-balance
- * mechanisms to move us around -- its all preference anyway.
+ * Since home-node is pure preference there's no hard migrate to force
+ * us anywhere, this also allows us to call this from atomic context if
+ * required.
  */
 void sched_setnode(struct task_struct *p, int node)
 {
@@ -6578,6 +6589,7 @@ void sched_setnode(struct task_struct *p
 		p->sched_class->put_prev_task(rq, p);
 
 	p->node = node;
+	p->node_last = node;
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -27,6 +27,7 @@
 #include <linux/profile.h>
 #include <linux/interrupt.h>
 #include <linux/random.h>
+#include <linux/mempolicy.h>
 
 #include <trace/events/sched.h>
 
@@ -774,6 +775,139 @@ update_stats_curr_start(struct cfs_rq *c
 }
 
 /**************************************************
+ * Scheduling class numa methods.
+ *
+ * The purpose of the NUMA bits are to maintain compute (task) and data
+ * (memory) locality. We try and achieve this by making tasks stick to
+ * a particular node (their home node) but if fairness mandates they run
+ * elsewhere for long enough, we let the memory follow them.
+ *
+ * Tasks start out with their home-node unset (-1) this effectively means
+ * they act !NUMA until we've established the task is busy enough to bother
+ * with placement.
+ */
+
+static unsigned long task_h_load(struct task_struct *p);
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NUMA)
+static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
+{
+	p->numa_contrib = task_h_load(p);
+	rq->offnode_weight += p->numa_contrib;
+	rq->offnode_running++;
+}
+static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
+{
+	rq->offnode_weight -= p->numa_contrib;
+	rq->offnode_running--;
+}
+
+/*
+ * numa task sample period in ms
+ */
+unsigned int sysctl_sched_numa_task_period = 2500;
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *t, *p = current;
+	int node = p->node_last;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, rcu));
+
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal migration frequency..
+	 */
+	migrate = p->mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	next_scan = now + 2*msecs_to_jiffies(sysctl_sched_numa_task_period);
+	if (cmpxchg(&p->mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	rcu_read_lock();
+	t = p;
+	do {
+		sched_setnode(t, node);
+	} while ((t = next_thread(p)) != p);
+	rcu_read_unlock();
+
+	lazy_migrate_process(p->mm);
+}
+
+/*
+ * Sample task location from hardirq context (tick), this has minimal bias with
+ * obvious exceptions of frequency interference and tick avoidance techniques.
+ * If this were to become a problem we could move this sampling into the
+ * sleep/wakeup path -- but we'd prefer to avoid that for obvious reasons.
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	u64 period, now;
+	int node;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm)
+		return;
+
+	/*
+	 * Sample our node location every @sysctl_sched_numa_task_period
+	 * runtime ms. We use a two stage selection in order to filter
+	 * unlikely locations.
+	 *
+	 * If P(n) is the probability we're on node 'n', then the probability
+	 * we sample the same node twice is P(n)^2. This quadric squishes small
+	 * values and makes it more likely we end up on nodes where we have
+	 * significant presence.
+	 *
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)sysctl_sched_numa_task_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+		node = numa_node_id();
+
+		if (curr->node_last == node && curr->node != node) {
+			/*
+			 * We can re-use curr->rcu because we checked curr->mm
+			 * != NULL so release_task()->call_rcu() was not called
+			 * yet and exit_task_work() is called before
+			 * exit_notify().
+			 */
+			init_task_work(&curr->rcu, task_numa_work);
+			task_work_add(curr, &curr->rcu, true);
+		}
+		curr->node_last = node;
+	}
+}
+#else
+static void account_offnode_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void account_offnode_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
+#endif /* SMP && NUMA */
+
+/**************************************************
  * Scheduling class queueing methods:
  */
 
@@ -784,9 +918,19 @@ account_entity_enqueue(struct cfs_rq *cf
 	if (!parent_entity(se))
 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
 #ifdef CONFIG_SMP
-	if (entity_is_task(se))
-		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
-#endif
+	if (entity_is_task(se)) {
+		struct rq *rq = rq_of(cfs_rq);
+		struct task_struct *p = task_of(se);
+		struct list_head *tasks = &rq->cfs_tasks;
+
+		if (offnode_task(p)) {
+			account_offnode_enqueue(rq, p);
+			tasks = offnode_tasks(rq);
+		}
+
+		list_add(&se->group_node, tasks);
+	}
+#endif /* CONFIG_SMP */
 	cfs_rq->nr_running++;
 }
 
@@ -796,8 +940,14 @@ account_entity_dequeue(struct cfs_rq *cf
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
+		struct task_struct *p = task_of(se);
+
 		list_del_init(&se->group_node);
+
+		if (offnode_task(p))
+			account_offnode_dequeue(rq_of(cfs_rq), p);
+	}
 	cfs_rq->nr_running--;
 }
 
@@ -3286,8 +3436,6 @@ static int move_one_task(struct lb_env *
 	return 0;
 }
 
-static unsigned long task_h_load(struct task_struct *p);
-
 static const unsigned int sched_nr_migrate_break = 32;
 
 /*
@@ -5173,6 +5321,9 @@ static void task_tick_fair(struct rq *rq
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -71,6 +71,7 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
 
 #ifdef CONFIG_NUMA
+SCHED_FEAT(NUMA,           true)
 SCHED_FEAT(NUMA_HOT,       true)
 SCHED_FEAT(NUMA_BIAS,      true)
 SCHED_FEAT(NUMA_PULL,      true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -471,15 +471,6 @@ struct rq {
 #endif
 };
 
-static inline struct list_head *offnode_tasks(struct rq *rq)
-{
-#ifdef CONFIG_NUMA
-	return &rq->offnode_tasks;
-#else
-	return NULL;
-#endif
-}
-
 static inline int cpu_of(struct rq *rq)
 {
 #ifdef CONFIG_SMP
@@ -497,6 +488,30 @@ DECLARE_PER_CPU(struct rq, runqueues);
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define raw_rq()		(&__raw_get_cpu_var(runqueues))
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NUMA)
+static inline bool offnode_task(struct task_struct *t)
+{
+	return t->node != -1 && t->node != cpu_to_node(task_cpu(t));
+}
+
+static inline struct list_head *offnode_tasks(struct rq *rq)
+{
+	return &rq->offnode_tasks;
+}
+
+void sched_setnode(struct task_struct *p, int node);
+#else /* SMP && NUMA */
+static inline bool offnode_task(struct task_struct *t)
+{
+	return false;
+}
+
+static inline struct list_head *offnode_tasks(struct rq *rq)
+{
+	return NULL;
+}
+#endif /* SMP && NUMA */
+
 #ifdef CONFIG_SMP
 
 #define rcu_dereference_check_sched_domain(p) \
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -291,6 +291,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
@@ -337,7 +338,17 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+#ifdef CONFIG_NUMA
+	{
+		.procname	= "sched_numa_task_period_ms",
+		.data		= &sysctl_sched_numa_task_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,

next prev parent reply	other threads:[~2012-07-31 19:46 UTC|newest]

Thread overview: 54+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-07-31 19:12 [PATCH 00/19] sched-numa rewrite Peter Zijlstra
2012-07-31 19:12 ` [PATCH 01/19] task_work: Remove dependency on sched.h Peter Zijlstra
2012-07-31 20:52   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 02/19] mm/mpol: Remove NUMA_INTERLEAVE_HIT Peter Zijlstra
2012-07-31 20:52   ` Rik van Riel
2012-08-09 21:41   ` Andrea Arcangeli
2012-08-10  0:50     ` Andi Kleen
2012-07-31 19:12 ` [PATCH 03/19] mm/mpol: Make MPOL_LOCAL a real policy Peter Zijlstra
2012-07-31 20:52   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 04/19] mm, thp: Preserve pgprot across huge page split Peter Zijlstra
2012-07-31 20:53   ` Rik van Riel
2012-08-09 21:42   ` Andrea Arcangeli
2012-07-31 19:12 ` [PATCH 05/19] mm, mpol: Create special PROT_NONE infrastructure Peter Zijlstra
2012-07-31 20:55   ` Rik van Riel
2012-08-09 21:43   ` Andrea Arcangeli
2012-07-31 19:12 ` [PATCH 06/19] mm/mpol: Add MPOL_MF_LAZY Peter Zijlstra
2012-07-31 21:04   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 07/19] mm/mpol: Add MPOL_MF_NOOP Peter Zijlstra
2012-07-31 21:06   ` Rik van Riel
2012-08-09 21:44   ` Andrea Arcangeli
2012-10-01  9:36   ` Michael Kerrisk
2012-10-01  9:45     ` Ingo Molnar
2012-07-31 19:12 ` [PATCH 08/19] mm/mpol: Check for misplaced page Peter Zijlstra
2012-07-31 21:13   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 09/19] mm, migrate: Introduce migrate_misplaced_page() Peter Zijlstra
2012-07-31 21:16   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 10/19] mm, mpol: Use special PROT_NONE to migrate pages Peter Zijlstra
2012-07-31 21:24   ` Rik van Riel
2012-08-09 21:44   ` Andrea Arcangeli
2012-07-31 19:12 ` [PATCH 11/19] sched, mm: Introduce tsk_home_node() Peter Zijlstra
2012-07-31 21:30   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 12/19] mm/mpol: Make mempolicy home-node aware Peter Zijlstra
2012-07-31 21:33   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 13/19] sched: Introduce sched_feat_numa() Peter Zijlstra
2012-07-31 21:34   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 14/19] sched: Make find_busiest_queue() a method Peter Zijlstra
2012-07-31 21:34   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 15/19] sched: Implement home-node awareness Peter Zijlstra
2012-07-31 21:52   ` Rik van Riel
2012-08-09 21:51   ` Andrea Arcangeli
2012-07-31 19:12 ` Peter Zijlstra [this message]
2012-07-31 21:52   ` [PATCH 16/19] sched, numa: NUMA home-node selection code Rik van Riel
2012-07-31 19:12 ` [PATCH 17/19] sched, numa: Detect big processes Peter Zijlstra
2012-07-31 21:53   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 18/19] sched, numa: Per task memory placement for " Peter Zijlstra
2012-07-31 21:56   ` Rik van Riel
2012-08-08 21:35   ` Peter Zijlstra
2012-08-09 21:57   ` Andrea Arcangeli
2012-07-31 19:12 ` [PATCH 19/19] mm, numa: retry failed page migrations Peter Zijlstra
2012-08-02 20:40   ` Christoph Lameter
2012-08-08 17:17 ` [PATCH 00/19] sched-numa rewrite Andrea Arcangeli
2012-08-08 18:43   ` Rik van Riel
2012-08-17 18:08     ` Andrea Arcangeli
     [not found] <506310A8.5040402@hp.com>
2012-09-26 14:31 ` [PATCH 16/19] sched, numa: NUMA home-node selection code Don Morris

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120731192809.308468547@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=Lee.Schermerhorn@hp.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=oleg@redhat.com \
    --cc=pjt@google.com \
    --cc=riel@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.