linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Mel Gorman <mgorman@suse.de>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Linux-MM <linux-mm@kvack.org>,
	LKML <linux-kernel@vger.kernel.org>, Mel Gorman <mgorman@suse.de>
Subject: [PATCH 06/13] sched: Reschedule task on preferred NUMA node once selected
Date: Wed,  3 Jul 2013 15:21:33 +0100	[thread overview]
Message-ID: <1372861300-9973-7-git-send-email-mgorman@suse.de> (raw)
In-Reply-To: <1372861300-9973-1-git-send-email-mgorman@suse.de>

A preferred node is selected based on the node the most NUMA hinting
faults were incurred on. There is no guarantee that the task is running
on that node at the time, so this patch reschedules the task to run on
the most idle CPU of the selected node as soon as it is selected. This
avoids waiting for the balancer to make a decision.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
 kernel/sched/core.c  | 18 ++++++++++++++++--
 kernel/sched/fair.c  | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h |  2 +-
 3 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ba9470e..b4722d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5717,11 +5717,25 @@ struct sched_domain_topology_level;
 
 #ifdef CONFIG_NUMA_BALANCING
 
-/* Set a tasks preferred NUMA node */
-void sched_setnuma(struct task_struct *p, int nid)
+/* Set a tasks preferred NUMA node and reschedule to it */
+void sched_setnuma(struct task_struct *p, int nid, int idlest_cpu)
 {
+	int curr_cpu = task_cpu(p);
+	struct migration_arg arg = { p, idlest_cpu };
+
 	p->numa_preferred_nid = nid;
 	p->numa_migrate_seq = 0;
+
+	/* Do not reschedule if already running on the target CPU */
+	if (idlest_cpu == curr_cpu)
+		return;
+
+	/* Ensure the target CPU is eligible */
+	if (!cpumask_test_cpu(idlest_cpu, tsk_cpus_allowed(p)))
+		return;
+
+	/* Move current running task to idlest CPU on preferred node */
+	stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2a0bbc2..b9139be 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -800,6 +800,37 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
 
+static unsigned long weighted_cpuload(const int cpu);
+
+static int
+find_idlest_cpu_node(int this_cpu, int nid)
+{
+	unsigned long load, min_load = ULONG_MAX;
+	int i, idlest_cpu = this_cpu;
+
+	BUG_ON(cpu_to_node(this_cpu) == nid);
+
+	for_each_cpu(i, cpumask_of_node(nid)) {
+		load = weighted_cpuload(i);
+
+		if (load < min_load) {
+			struct task_struct *p;
+
+			/* Do not preempt a task running on its preferred node */
+			struct rq *rq = cpu_rq(i);
+			raw_spin_lock_irq(&rq->lock);
+			p = rq->curr;
+			if (p->numa_preferred_nid != nid) {
+				min_load = load;
+				idlest_cpu = i;
+			}
+			raw_spin_unlock_irq(&rq->lock);
+		}
+	}
+
+	return idlest_cpu;
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = 0;
@@ -829,8 +860,27 @@ static void task_numa_placement(struct task_struct *p)
 		}
 	}
 
-	if (max_faults && max_nid != p->numa_preferred_nid)
-		sched_setnuma(p, max_nid);
+	/*
+	 * Record the preferred node as the node with the most faults,
+	 * requeue the task to be running on the idlest CPU on the
+	 * preferred node and reset the scanning rate to recheck
+	 * the working set placement.
+	 */
+	if (max_faults && max_nid != p->numa_preferred_nid) {
+		int preferred_cpu;
+
+		/*
+		 * If the task is not on the preferred node then find the most
+		 * idle CPU to migrate to.
+		 */
+		preferred_cpu = task_cpu(p);
+		if (cpu_to_node(preferred_cpu) != max_nid) {
+			preferred_cpu = find_idlest_cpu_node(preferred_cpu,
+							     max_nid);
+		}
+
+		sched_setnuma(p, max_nid, preferred_cpu);
+	}
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 65a0cf0..64c37a3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -504,7 +504,7 @@ DECLARE_PER_CPU(struct rq, runqueues);
 #define raw_rq()		(&__raw_get_cpu_var(runqueues))
 
 #ifdef CONFIG_NUMA_BALANCING
-extern void sched_setnuma(struct task_struct *p, int nid);
+extern void sched_setnuma(struct task_struct *p, int nid, int idlest_cpu);
 static inline void task_numa_free(struct task_struct *p)
 {
 	kfree(p->numa_faults);
-- 
1.8.1.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2013-07-03 14:22 UTC|newest]

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-07-03 14:21 [PATCH 0/13] Basic scheduler support for automatic NUMA balancing V2 Mel Gorman
2013-07-03 14:21 ` [PATCH 01/13] mm: numa: Document automatic NUMA balancing sysctls Mel Gorman
2013-07-03 14:21 ` [PATCH 02/13] sched: Track NUMA hinting faults on per-node basis Mel Gorman
2013-07-03 14:21 ` [PATCH 03/13] sched: Select a preferred node with the most numa hinting faults Mel Gorman
2013-07-03 14:21 ` [PATCH 04/13] sched: Update NUMA hinting faults once per scan Mel Gorman
2013-07-03 14:21 ` [PATCH 05/13] sched: Favour moving tasks towards the preferred node Mel Gorman
2013-07-03 14:21 ` Mel Gorman [this message]
2013-07-04 12:26   ` [PATCH 06/13] sched: Reschedule task on preferred NUMA node once selected Srikar Dronamraju
2013-07-04 13:29     ` Mel Gorman
2013-07-03 14:21 ` [PATCH 07/13] sched: Split accounting of NUMA hinting faults that pass two-stage filter Mel Gorman
2013-07-03 21:56   ` Johannes Weiner
2013-07-04  9:23     ` Mel Gorman
2013-07-04 14:24       ` Rik van Riel
2013-07-04 19:36       ` Johannes Weiner
2013-07-05  9:41         ` Mel Gorman
2013-07-05 10:48         ` Peter Zijlstra
2013-07-03 14:21 ` [PATCH 08/13] sched: Increase NUMA PTE scanning when a new preferred node is selected Mel Gorman
2013-07-03 14:21 ` [PATCH 09/13] sched: Favour moving tasks towards nodes that incurred more faults Mel Gorman
2013-07-03 18:27   ` Peter Zijlstra
2013-07-04  9:25     ` Mel Gorman
2013-07-03 14:21 ` [PATCH 10/13] sched: Set the scan rate proportional to the size of the task being scanned Mel Gorman
2013-07-03 14:21 ` [PATCH 11/13] sched: Check current->mm before allocating NUMA faults Mel Gorman
2013-07-03 15:33   ` Mel Gorman
2013-07-04 12:48   ` Srikar Dronamraju
2013-07-05 10:07     ` Mel Gorman
2013-07-03 14:21 ` [PATCH 12/13] mm: numa: Scan pages with elevated page_mapcount Mel Gorman
2013-07-03 18:35   ` Peter Zijlstra
2013-07-04  9:27     ` Mel Gorman
2013-07-03 18:41   ` Peter Zijlstra
2013-07-04  9:32     ` Mel Gorman
2013-07-03 18:42   ` Peter Zijlstra
2013-07-03 14:21 ` [PATCH 13/13] sched: Account for the number of preferred tasks running on a node when selecting a preferred node Mel Gorman
2013-07-03 18:32   ` Peter Zijlstra
2013-07-04  9:37     ` Mel Gorman
2013-07-04 13:07       ` Srikar Dronamraju
2013-07-04 13:54         ` Mel Gorman
2013-07-04 14:06           ` Peter Zijlstra
2013-07-04 14:40             ` Mel Gorman
2013-07-03 16:19 ` [PATCH 0/13] Basic scheduler support for automatic NUMA balancing V2 Mel Gorman
2013-07-03 16:26   ` Mel Gorman
2013-07-04 18:02 ` [PATCH RFC WIP] Process weights based scheduling for better consolidation Srikar Dronamraju
2013-07-05 10:16   ` Peter Zijlstra
2013-07-05 12:49     ` Srikar Dronamraju

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1372861300-9973-7-git-send-email-mgorman@suse.de \
    --to=mgorman@suse.de \
    --cc=a.p.zijlstra@chello.nl \
    --cc=aarcange@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mingo@kernel.org \
    --cc=srikar@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).