linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Rik van Riel <riel@redhat.com>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>,
	Srikar Dronamraju <srikar@linux.vnet.ibm.com>,
	Ingo Molnar <mingo@kernel.org>,
	Andrea Arcangeli <aarcange@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Linux-MM <linux-mm@kvack.org>,
	LKML <linux-kernel@vger.kernel.org>
Subject: [RFC PATCH -v2] numa,sched: use group fault statistics in numa placement
Date: Thu, 1 Aug 2013 18:36:46 -0400	[thread overview]
Message-ID: <20130801183646.37b04171@annuminas.surriel.com> (raw)
In-Reply-To: <20130730113857.GR3008@twins.programming.kicks-ass.net>

On Tue, 30 Jul 2013 13:38:57 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> 
> Subject: sched, numa: Use {cpu, pid} to create task groups for shared faults
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue Jul 30 10:40:20 CEST 2013
> 
> A very simple/straight forward shared fault task grouping
> implementation.

Here is another (untested) version of task placement on top of
your task grouping. Better send it to you now, rather than on
your friday evening :)

The algorithm is loosely based on Andrea's placement algorithm,
but not as strict because I am not entirely confident that the
task grouping code works right yet...

It also has no "fall back to a better node than the current one" 
code yet, for the case where we fail to migrate to the best node,
but that should be a separate patch anyway.


Subject: [PATCH,RFC] numa,sched: use group fault statistics in numa placement

This version uses the fraction of faults on a particular node for
both task and group, to figure out the best node to place a task.

I wish I had benchmark numbers to report, but our timezones just
don't seem to work out that way. Enjoy at your own peril :)

I will be testing these tomorrow.

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 94 +++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e7fcfe..5e175ae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1355,6 +1355,7 @@ struct task_struct {
 	 * The values remain static for the duration of a PTE scan
 	 */
 	unsigned long *numa_faults;
+	unsigned long total_numa_faults;
 
 	/*
 	 * numa_faults_buffer records faults per node during the current
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a06bef..3ef4d45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -844,6 +844,18 @@ static unsigned int task_scan_max(struct task_struct *p)
  */
 unsigned int sysctl_numa_balancing_settle_count __read_mostly = 3;
 
+struct numa_group {
+	atomic_t refcount;
+
+	spinlock_t lock; /* nr_tasks, tasks */
+	int nr_tasks;
+	struct list_head task_list;
+
+	struct rcu_head rcu;
+	atomic_long_t total_faults;
+	atomic_long_t faults[0];
+};
+
 static inline int task_faults_idx(int nid, int priv)
 {
 	return 2 * nid + priv;
@@ -857,6 +869,38 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
 	return p->numa_faults[2*nid] + p->numa_faults[2*nid+1];
 }
 
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+	if (!p->numa_group)
+		return 0;
+
+	return atomic_long_read(&p->numa_group->faults[2*nid]) +
+	       atomic_long_read(&p->numa_group->faults[2*nid+1]);
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node.  The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid)
+{
+	if (!p->numa_faults)
+		return 0;
+
+	return 1000 * task_faults(p, nid) / p->total_numa_faults;
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid)
+{
+	if (!p->numa_group)
+		return 0;
+
+	return 1200 * group_faults(p, nid) /
+			atomic_long_read(&p->numa_group->total_faults);
+}
+
 /*
  * Create/Update p->mempolicy MPOL_INTERLEAVE to match p->numa_faults[].
  */
@@ -979,8 +1023,10 @@ static void task_numa_compare(struct task_numa_env *env, long imp)
 		cur = NULL;
 
 	if (cur) {
-		imp += task_faults(cur, env->src_nid) -
-		       task_faults(cur, env->dst_nid);
+		imp += task_faults(cur, env->src_nid) +
+		       group_faults(cur, env->src_nid) -
+		       task_faults(cur, env->dst_nid) -
+		       group_faults(cur, env->dst_nid);
 	}
 
 	trace_printk("compare[%d] task:%s/%d improvement: %ld\n",
@@ -1067,7 +1113,7 @@ static int task_numa_migrate(struct task_struct *p)
 	}
 	rcu_read_unlock();
 
-	faults = task_faults(p, env.src_nid);
+	faults = task_faults(p, env.src_nid) + group_faults(p, env.src_nid);
 	update_numa_stats(&env.src_stats, env.src_nid);
 
 	for_each_online_node(nid) {
@@ -1076,7 +1122,7 @@ static int task_numa_migrate(struct task_struct *p)
 		if (nid == env.src_nid)
 			continue;
 
-		imp = task_faults(p, nid) - faults;
+		imp = task_faults(p, nid) + group_faults(p, nid) - faults;
 		if (imp < 0)
 			continue;
 
@@ -1122,21 +1168,10 @@ static void numa_migrate_preferred(struct task_struct *p)
 	p->numa_migrate_retry = jiffies + HZ/10;
 }
 
-struct numa_group {
-	atomic_t refcount;
-
-	spinlock_t lock; /* nr_tasks, tasks */
-	int nr_tasks;
-	struct list_head task_list;
-
-	struct rcu_head rcu;
-	atomic_long_t faults[0];
-};
-
 static void task_numa_placement(struct task_struct *p)
 {
-	int seq, nid, max_nid = -1;
-	unsigned long max_faults = 0;
+	int seq, nid, max_nid = -1, max_group_nid = -1;
+	unsigned long max_faults = 0, max_group_faults = 0;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 	if (p->numa_scan_seq == seq)
@@ -1148,7 +1183,7 @@ static void task_numa_placement(struct task_struct *p)
 
 	/* Find the node with the highest number of faults */
 	for (nid = 0; nid < nr_node_ids; nid++) {
-		unsigned long faults = 0;
+		unsigned long faults = 0, group_faults = 0;
 		int priv, i;
 
 		for (priv = 0; priv < 2; priv++) {
@@ -1161,6 +1196,7 @@ static void task_numa_placement(struct task_struct *p)
 			/* Decay existing window, copy faults since last scan */
 			p->numa_faults[i] >>= 1;
 			p->numa_faults[i] += p->numa_faults_buffer[i];
+			p->total_numa_faults += p->numa_faults_buffer[i];
 			p->numa_faults_buffer[i] = 0;
 
 			diff += p->numa_faults[i];
@@ -1169,6 +1205,8 @@ static void task_numa_placement(struct task_struct *p)
 			if (p->numa_group) {
 				/* safe because we can only change our own group */
 				atomic_long_add(diff, &p->numa_group->faults[i]);
+				atomic_long_add(diff, &p->numa_group->total_faults);
+				group_faults += atomic_long_read(&p->numa_group->faults[i]);
 			}
 		}
 
@@ -1176,11 +1214,29 @@ static void task_numa_placement(struct task_struct *p)
 			max_faults = faults;
 			max_nid = nid;
 		}
+
+		if (group_faults > max_group_faults) {
+			max_group_faults = group_faults;
+			max_group_nid = nid;
+		}
 	}
 
 	if (sched_feat(NUMA_INTERLEAVE))
 		task_numa_mempol(p, max_faults);
 
+	/*
+	 * Should we stay on our own, or move in with the group?
+	 * If the task's memory accesses are concentrated on one node, go
+	 * to (more likely, stay on) that node. If the group's accesses
+	 * are more concentrated than the task's accesses, join the group.
+	 *
+	 *  max_group_faults     max_faults
+	 * ------------------ > ------------
+	 * total_group_faults   total_faults
+	 */
+	if (group_weight(p, max_group_nid) > task_weight(p, max_nid))
+		max_nid = max_group_nid;
+
 	/* Preferred node as the node with the most faults */
 	if (max_faults && max_nid != p->numa_preferred_nid) {
 
@@ -1242,6 +1298,7 @@ void task_numa_group(struct task_struct *p, int cpu, int pid)
 		atomic_set(&grp->refcount, 1);
 		spin_lock_init(&grp->lock);
 		INIT_LIST_HEAD(&grp->task_list);
+		atomic_long_set(&grp->total_faults, 0);
 
 		spin_lock(&p->numa_lock);
 		list_add(&p->numa_entry, &grp->task_list);
@@ -1336,6 +1393,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, bool migrated)
 
 		BUG_ON(p->numa_faults_buffer);
 		p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+		p->total_numa_faults = 0;
 	}
 
 	/*

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2013-08-01 22:37 UTC|newest]

Thread overview: 102+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-07-15 15:20 [PATCH 0/18] Basic scheduler support for automatic NUMA balancing V5 Mel Gorman
2013-07-15 15:20 ` [PATCH 01/18] mm: numa: Document automatic NUMA balancing sysctls Mel Gorman
2013-07-15 15:20 ` [PATCH 02/18] sched: Track NUMA hinting faults on per-node basis Mel Gorman
2013-07-17 10:50   ` Peter Zijlstra
2013-07-31  7:54     ` Mel Gorman
2013-07-29 10:10   ` Peter Zijlstra
2013-07-31  7:54     ` Mel Gorman
2013-07-15 15:20 ` [PATCH 03/18] mm: numa: Account for THP numa hinting faults on the correct node Mel Gorman
2013-07-17  0:33   ` Hillf Danton
2013-07-17  1:26     ` Wanpeng Li
2013-07-17  1:26     ` Wanpeng Li
2013-07-15 15:20 ` [PATCH 04/18] mm: numa: Do not migrate or account for hinting faults on the zero page Mel Gorman
2013-07-17 11:00   ` Peter Zijlstra
2013-07-31  8:11     ` Mel Gorman
2013-07-15 15:20 ` [PATCH 05/18] sched: Select a preferred node with the most numa hinting faults Mel Gorman
2013-07-15 15:20 ` [PATCH 06/18] sched: Update NUMA hinting faults once per scan Mel Gorman
2013-07-15 15:20 ` [PATCH 07/18] sched: Favour moving tasks towards the preferred node Mel Gorman
2013-07-25 10:40   ` [PATCH] sched, numa: migrates_degrades_locality() Peter Zijlstra
2013-07-31  8:44     ` Mel Gorman
2013-07-31  8:50       ` Peter Zijlstra
2013-07-15 15:20 ` [PATCH 08/18] sched: Reschedule task on preferred NUMA node once selected Mel Gorman
2013-07-17  1:31   ` Hillf Danton
2013-07-31  9:07     ` Mel Gorman
2013-07-31  9:38       ` Srikar Dronamraju
2013-08-01  4:47   ` Srikar Dronamraju
2013-08-01 15:38     ` Mel Gorman
2013-07-15 15:20 ` [PATCH 09/18] sched: Add infrastructure for split shared/private accounting of NUMA hinting faults Mel Gorman
2013-07-17  2:17   ` Hillf Danton
2013-07-31  9:08     ` Mel Gorman
2013-07-15 15:20 ` [PATCH 10/18] sched: Increase NUMA PTE scanning when a new preferred node is selected Mel Gorman
2013-07-15 15:20 ` [PATCH 11/18] sched: Check current->mm before allocating NUMA faults Mel Gorman
2013-07-15 15:20 ` [PATCH 12/18] sched: Set the scan rate proportional to the size of the task being scanned Mel Gorman
2013-07-15 15:20 ` [PATCH 13/18] mm: numa: Scan pages with elevated page_mapcount Mel Gorman
2013-07-17  5:22   ` Sam Ben
2013-07-31  9:13     ` Mel Gorman
2013-07-15 15:20 ` [PATCH 14/18] sched: Remove check that skips small VMAs Mel Gorman
2013-07-15 15:20 ` [PATCH 15/18] sched: Set preferred NUMA node based on number of private faults Mel Gorman
2013-07-18  1:53   ` [PATCH 15/18] fix compilation with !CONFIG_NUMA_BALANCING Rik van Riel
2013-07-31  9:19     ` Mel Gorman
2013-07-26 11:20   ` [PATCH 15/18] sched: Set preferred NUMA node based on number of private faults Peter Zijlstra
2013-07-31  9:29     ` Mel Gorman
2013-07-31  9:34       ` Peter Zijlstra
2013-07-31 10:10         ` Mel Gorman
2013-07-15 15:20 ` [PATCH 16/18] sched: Avoid overloading CPUs on a preferred NUMA node Mel Gorman
2013-07-15 20:03   ` Peter Zijlstra
2013-07-16  8:23     ` Mel Gorman
2013-07-16 10:35       ` Peter Zijlstra
2013-07-16 15:55   ` Hillf Danton
2013-07-16 16:01     ` Mel Gorman
2013-07-17 10:54   ` Peter Zijlstra
2013-07-31  9:49     ` Mel Gorman
2013-08-01  7:10   ` Srikar Dronamraju
2013-08-01 15:42     ` Mel Gorman
2013-07-15 15:20 ` [PATCH 17/18] sched: Retry migration of tasks to CPU on a preferred node Mel Gorman
2013-07-25 10:33   ` Peter Zijlstra
2013-07-31 10:03     ` Mel Gorman
2013-07-31 10:05       ` Peter Zijlstra
2013-07-31 10:07         ` Mel Gorman
2013-07-25 10:35   ` Peter Zijlstra
2013-08-01  5:13   ` Srikar Dronamraju
2013-08-01 15:46     ` Mel Gorman
2013-07-15 15:20 ` [PATCH 18/18] sched: Swap tasks when reschuling if a CPU on a target node is imbalanced Mel Gorman
2013-07-15 20:11   ` Peter Zijlstra
2013-07-16  9:41     ` Mel Gorman
2013-08-01  4:59   ` Srikar Dronamraju
2013-08-01 15:48     ` Mel Gorman
2013-07-15 20:14 ` [PATCH 0/18] Basic scheduler support for automatic NUMA balancing V5 Peter Zijlstra
2013-07-16 15:10 ` Srikar Dronamraju
2013-07-25 10:36 ` Peter Zijlstra
2013-07-31 10:30   ` Mel Gorman
2013-07-31 10:48     ` Peter Zijlstra
2013-07-31 11:57       ` Mel Gorman
2013-07-31 15:30         ` Peter Zijlstra
2013-07-31 16:11           ` Mel Gorman
2013-07-31 16:39             ` Peter Zijlstra
2013-08-01 15:51               ` Mel Gorman
2013-07-25 10:38 ` [PATCH] mm, numa: Sanitize task_numa_fault() callsites Peter Zijlstra
2013-07-31 11:25   ` Mel Gorman
2013-07-25 10:41 ` [PATCH] sched, numa: Improve scanner Peter Zijlstra
2013-07-25 10:46 ` [PATCH] mm, sched, numa: Create a per-task MPOL_INTERLEAVE policy Peter Zijlstra
2013-07-26  9:55   ` Peter Zijlstra
2013-08-26 16:10     ` Peter Zijlstra
2013-08-26 16:14       ` Peter Zijlstra
2013-07-30 11:24 ` [PATCH] mm, numa: Change page last {nid,pid} into {cpu,pid} Peter Zijlstra
2013-08-01 22:33   ` Rik van Riel
2013-07-30 11:38 ` [PATCH] sched, numa: Use {cpu, pid} to create task groups for shared faults Peter Zijlstra
2013-07-31 15:07   ` Peter Zijlstra
2013-07-31 15:38     ` Peter Zijlstra
2013-07-31 15:45     ` Don Morris
2013-07-31 16:05       ` Peter Zijlstra
2013-08-02 16:47       ` [PATCH -v3] " Peter Zijlstra
2013-08-02 16:50         ` [PATCH] mm, numa: Do not group on RO pages Peter Zijlstra
2013-08-02 19:56           ` Peter Zijlstra
2013-08-05 19:36           ` [PATCH] numa,sched: use group fault statistics in numa placement Rik van Riel
2013-08-09 13:55             ` Don Morris
2013-08-28 16:41         ` [PATCH -v3] sched, numa: Use {cpu, pid} to create task groups for shared faults Peter Zijlstra
2013-08-28 17:10           ` Rik van Riel
2013-08-01  6:23   ` [PATCH,RFC] numa,sched: use group fault statistics in numa placement Rik van Riel
2013-08-01 10:37     ` Peter Zijlstra
2013-08-01 16:35       ` Rik van Riel
2013-08-01 22:36   ` Rik van Riel [this message]
2013-07-30 13:58 ` [PATCH 0/18] Basic scheduler support for automatic NUMA balancing V5 Andrew Theurer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130801183646.37b04171@annuminas.surriel.com \
    --to=riel@redhat.com \
    --cc=aarcange@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=srikar@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).