public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] sched:Prefer numa hotness over cache hotness
@ 2015-06-10  7:03 Srikar Dronamraju
  2015-06-10  7:03 ` Performance numbers with prefer " Srikar Dronamraju
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Srikar Dronamraju @ 2015-06-10  7:03 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: linux-kernel, srikar, Rik van Riel, Mel Gorman

The current load balancer may not try to prevent a task from moving out
of a preferred node to a less preferred node. The reasons for this are:

- Since sched features NUMA and NUMA_RESIST_LOWER are disabled by
  default, migrate_degrades_locality() always returns false.

- Even if NUMA_RESIST_LOWER were to be enabled, if it's cache hot,
  migrate_degrades_locality() never gets called.

The above behaviour can mean that tasks can move out of their preferred
node but they may eventually be brought back to their preferred node by
the numa balancer (due to higher numa faults).

To avoid the above, this commit merges migrate_degrades_locality() and
migrate_improves_locality(). It also replaces 3 sched features NUMA,
NUMA_FAVOUR_HIGHER and NUMA_RESIST_LOWER by a single sched feature NUMA.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 kernel/sched/fair.c     | 96 ++++++++++++++++++-------------------------------
 kernel/sched/features.h | 18 +++-------
 2 files changed, 40 insertions(+), 74 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa41..ddde6fd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5467,90 +5467,64 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
 }
 
 #ifdef CONFIG_NUMA_BALANCING
-/* Returns true if the destination node has incurred more faults */
-static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
-{
-	struct numa_group *numa_group = rcu_dereference(p->numa_group);
-	int src_nid, dst_nid;
-
-	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
-	    !(env->sd->flags & SD_NUMA)) {
-		return false;
-	}
-
-	src_nid = cpu_to_node(env->src_cpu);
-	dst_nid = cpu_to_node(env->dst_cpu);
-
-	if (src_nid == dst_nid)
-		return false;
-
-	if (numa_group) {
-		/* Task is already in the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return false;
-
-		/* Task is moving into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return true;
-
-		return group_faults(p, dst_nid) > group_faults(p, src_nid);
-	}
-
-	/* Encourage migration to the preferred node. */
-	if (dst_nid == p->numa_preferred_nid)
-		return true;
-
-	return task_faults(p, dst_nid) > task_faults(p, src_nid);
-}
 
+/*
+ * Returns 1, if task migration degrades locality
+ * Returns 0, if task migration improves locality i.e migration preferred.
+ * Returns -1, if task migration is not affected by locality.
+ */
 
-static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
 	int src_nid, dst_nid;
 
-	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
-		return false;
-
 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
-		return false;
+		return -1;
+
+	if (!sched_feat(NUMA))
+		return -1;
 
 	src_nid = cpu_to_node(env->src_cpu);
 	dst_nid = cpu_to_node(env->dst_cpu);
 
 	if (src_nid == dst_nid)
-		return false;
+		return -1;
 
 	if (numa_group) {
 		/* Task is moving within/into the group's interleave set. */
-		if (node_isset(dst_nid, numa_group->active_nodes))
-			return false;
+		if (node_isset(dst_nid, numa_group->active_nodes) &&
+				!node_isset(src_nid, numa_group->active_nodes))
+			return 0;
 
 		/* Task is moving out of the group's interleave set. */
-		if (node_isset(src_nid, numa_group->active_nodes))
-			return true;
+		if (!node_isset(dst_nid, numa_group->active_nodes) &&
+				node_isset(src_nid, numa_group->active_nodes))
+			return 1;
 
 		return group_faults(p, dst_nid) < group_faults(p, src_nid);
 	}
 
 	/* Migrating away from the preferred node is always bad. */
-	if (src_nid == p->numa_preferred_nid)
-		return true;
+	if (src_nid == p->numa_preferred_nid) {
+		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
+			return 1;
+		else
+			return -1;
+	}
+
+	/* Encourage migration to the preferred node. */
+	if (dst_nid == p->numa_preferred_nid)
+		return 0;
 
 	return task_faults(p, dst_nid) < task_faults(p, src_nid);
 }
 
 #else
-static inline bool migrate_improves_locality(struct task_struct *p,
+static inline int migrate_degrades_locality(struct task_struct *p,
 					     struct lb_env *env)
 {
-	return false;
-}
-
-static inline bool migrate_degrades_locality(struct task_struct *p,
-					     struct lb_env *env)
-{
-	return false;
+	return -1;
 }
 #endif
 
@@ -5560,7 +5534,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
 static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
-	int tsk_cache_hot = 0;
+	int tsk_cache_hot;
 
 	lockdep_assert_held(&env->src_rq->lock);
 
@@ -5618,13 +5592,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	 * 2) task is cache cold, or
 	 * 3) too many balance attempts have failed.
 	 */
-	tsk_cache_hot = task_hot(p, env);
-	if (!tsk_cache_hot)
-		tsk_cache_hot = migrate_degrades_locality(p, env);
+	tsk_cache_hot = migrate_degrades_locality(p, env);
+	if (tsk_cache_hot == -1)
+		tsk_cache_hot = task_hot(p, env);
 
-	if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+	if (tsk_cache_hot <= 0 ||
 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-		if (tsk_cache_hot) {
+		if (tsk_cache_hot == 1) {
 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
 			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 91e33cd..83a50e7 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -79,20 +79,12 @@ SCHED_FEAT(LB_MIN, false)
  * numa_balancing=
  */
 #ifdef CONFIG_NUMA_BALANCING
-SCHED_FEAT(NUMA,	false)
 
 /*
- * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
- * higher number of hinting faults are recorded during active load
- * balancing.
+ * NUMA will favor moving tasks towards nodes where a higher number of
+ * hinting faults are recorded during active load balancing. It will
+ * resist moving tasks towards nodes where a lower number of hinting
+ * faults have been recorded.
  */
-SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
-
-/*
- * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
- * lower number of hinting faults have been recorded. As this has
- * the potential to prevent a task ever migrating to a new node
- * due to CPU overload it is disabled by default.
- */
-SCHED_FEAT(NUMA_RESIST_LOWER, false)
+SCHED_FEAT(NUMA,	true)
 #endif
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Performance numbers with prefer numa hotness over cache hotness
  2015-06-10  7:03 [PATCH] sched:Prefer numa hotness over cache hotness Srikar Dronamraju
@ 2015-06-10  7:03 ` Srikar Dronamraju
  2015-06-10 14:12 ` [PATCH] sched:Prefer " Rik van Riel
  2015-06-11 14:01 ` Peter Zijlstra
  2 siblings, 0 replies; 4+ messages in thread
From: Srikar Dronamraju @ 2015-06-10  7:03 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: linux-kernel, srikar, Rik van Riel, Mel Gorman

Ran 5 runs of autonuma-benchmark
(https://github.com/pholasek/autonuma-benchmark)

KernelVersion: 4.1.0-rc6
	Testcase:         Min         Max         Avg      StdDev
  elapsed_numa01:      533.73      789.12      696.77       86.86
  elapsed_numa02:       22.59       28.15       25.87        1.99
	Testcase:         Min         Max         Avg      StdDev
   system_numa01:      342.78     1655.01     1128.26      479.49
   system_numa02:       36.12       99.35       62.93       20.95
	Testcase:         Min         Max         Avg      StdDev
     user_numa01:    43724.45    62948.12    55832.97     6486.37
     user_numa02:     1693.81     1945.30     1817.80      101.85
	Testcase:         Min         Max         Avg      StdDev
    total_numa01:    44067.20    64603.10    56961.22     6941.77
    total_numa02:     1729.93     2044.65     1880.73      118.16

KernelVersion: 4.1.0-rc6+patch
	Testcase:         Min         Max         Avg      StdDev     %Change
  elapsed_numa01:      484.08      726.45      648.46       85.14       6.65%
  elapsed_numa02:       20.38       29.56       24.99        2.93       2.98%
	Testcase:         Min         Max         Avg      StdDev     %Change
   system_numa01:      325.60     1111.46      837.76      290.98      26.14%
   system_numa02:       47.65       73.59       60.59       10.97       3.18%
	Testcase:         Min         Max         Avg      StdDev     %Change
     user_numa01:    39256.04    53387.31    48009.88     4717.79      14.65%
     user_numa02:     1498.92     2089.51     1762.66      193.30       2.64%
	Testcase:         Min         Max         Avg      StdDev     %Change
    total_numa01:    39581.60    54483.60    48847.66     4987.34      14.89%
    total_numa02:     1546.57     2161.73     1823.26      202.28       2.66%

Performance counter stats for 'system wide': (for 1st run).
numa01
(Before patch)
          5,81,911      cs                                         [100.00%]
          1,07,380      migrations                                 [100.00%]
          5,15,465      faults
    2,99,19,83,196      cache-misses

     724.330728328 seconds time elapsed

(After patch)
          3,04,022      cs                                         [100.00%]
            47,539      migrations                                 [100.00%]
          3,14,508      faults
    1,91,03,93,197      cache-misses

     484.088557526 seconds time elapsed

numa02
(Before patch)
            26,078      cs                                         [100.00%]
             4,932      migrations                                 [100.00%]
            28,025      faults
      11,75,46,393      cache-misses

      28.156065587 seconds time elapsed

(After patch)
            18,972      cs                                         [100.00%]
             3,650      migrations                                 [100.00%]
            24,620      faults
       9,19,64,503      cache-misses

      20.382626292 seconds time elapsed

# numactl -H
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
node 0 size: 32425 MB
node 0 free: 25493 MB
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
node 1 size: 31711 MB
node 1 free: 30933 MB
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 2 size: 30431 MB
node 2 free: 29577 MB
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
node 3 size: 32219 MB
node 3 free: 31455 MB
node distances:
node   0   1   2   3
  0:  10  20  40  40
  1:  20  10  40  40
  2:  40  40  10  20
  3:  40  40  20  10

Srikar Dronamraju (1):
  sched:Prefer numa hotness over cache hotness

 kernel/sched/fair.c     | 96 ++++++++++++++++++-------------------------------
 kernel/sched/features.h | 18 +++-------
 2 files changed, 40 insertions(+), 74 deletions(-)

--
1.8.3.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] sched:Prefer numa hotness over cache hotness
  2015-06-10  7:03 [PATCH] sched:Prefer numa hotness over cache hotness Srikar Dronamraju
  2015-06-10  7:03 ` Performance numbers with prefer " Srikar Dronamraju
@ 2015-06-10 14:12 ` Rik van Riel
  2015-06-11 14:01 ` Peter Zijlstra
  2 siblings, 0 replies; 4+ messages in thread
From: Rik van Riel @ 2015-06-10 14:12 UTC (permalink / raw)
  To: Srikar Dronamraju, Ingo Molnar, Peter Zijlstra; +Cc: linux-kernel, Mel Gorman

On 06/10/2015 03:03 AM, Srikar Dronamraju wrote:
> The current load balancer may not try to prevent a task from moving out
> of a preferred node to a less preferred node. The reason for this being:
> 
> - Since sched features NUMA and NUMA_RESIST_LOWER are disabled by
>   default, migrate_degrades_locality() always returns false.
> 
> - Even if NUMA_RESIST_LOWER were to be enabled, if its cache hot,
>   migrate_degrades_locality() never gets called.
> 
> The above behaviour can mean that tasks can move out of their preferred
> node but they may be eventually be brought back to their preferred node
> by numa balancer (due to higher numa faults).
> 
> To avoid the above, this commit merges migrate_degrades_locality() and
> migrate_improves_locality(). It also replaces 3 sched features NUMA,
> NUMA_FAVOUR_HIGHER and NUMA_RESIST_LOWER by a single sched feature NUMA.

I had been looking at merging the two at well, but ran
into a dead end because I failed to reorder NUMA and
cache_hot testing.  Nice work.

> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

Acked-by: Rik van Riel <riel@redhat.com>

-- 
All rights reversed

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] sched:Prefer numa hotness over cache hotness
  2015-06-10  7:03 [PATCH] sched:Prefer numa hotness over cache hotness Srikar Dronamraju
  2015-06-10  7:03 ` Performance numbers with prefer " Srikar Dronamraju
  2015-06-10 14:12 ` [PATCH] sched:Prefer " Rik van Riel
@ 2015-06-11 14:01 ` Peter Zijlstra
  2 siblings, 0 replies; 4+ messages in thread
From: Peter Zijlstra @ 2015-06-11 14:01 UTC (permalink / raw)
  To: Srikar Dronamraju; +Cc: Ingo Molnar, linux-kernel, Rik van Riel, Mel Gorman

On Wed, Jun 10, 2015 at 12:33:35PM +0530, Srikar Dronamraju wrote:
> The current load balancer may not try to prevent a task from moving out
> of a preferred node to a less preferred node. The reason for this being:
> 
> - Since sched features NUMA and NUMA_RESIST_LOWER are disabled by
>   default, migrate_degrades_locality() always returns false.
> 
> - Even if NUMA_RESIST_LOWER were to be enabled, if its cache hot,
>   migrate_degrades_locality() never gets called.
> 
> The above behaviour can mean that tasks can move out of their preferred
> node but they may be eventually be brought back to their preferred node
> by numa balancer (due to higher numa faults).
> 
> To avoid the above, this commit merges migrate_degrades_locality() and
> migrate_improves_locality(). It also replaces 3 sched features NUMA,
> NUMA_FAVOUR_HIGHER and NUMA_RESIST_LOWER by a single sched feature NUMA.
> 

Does not apply.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2015-06-11 14:01 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-06-10  7:03 [PATCH] sched:Prefer numa hotness over cache hotness Srikar Dronamraju
2015-06-10  7:03 ` Performance numbers with prefer " Srikar Dronamraju
2015-06-10 14:12 ` [PATCH] sched:Prefer " Rik van Riel
2015-06-11 14:01 ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox