From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <linux-kernel-owner@vger.kernel.org>
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1758493AbdLRJoO (ORCPT <rfc822;w@1wt.eu>);
        Mon, 18 Dec 2017 04:44:14 -0500
Received: from outbound-smtp10.blacknight.com ([46.22.139.15]:50142 "EHLO
        outbound-smtp10.blacknight.com" rhost-flags-OK-OK-OK-OK)
        by vger.kernel.org with ESMTP id S1758275AbdLRJn3 (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Mon, 18 Dec 2017 04:43:29 -0500
From: Mel Gorman <mgorman@techsingularity.net>
To: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>,
        Matt Fleming <matt@codeblueprint.co.uk>,
        Mel Gorman <mgorman@techsingularity.net>,
        LKML <linux-kernel@vger.kernel.org>
Subject: [PATCH 2/4] sched: Allow a wakee to run on the prev_cpu if it is idle and cache-affine with the waker
Date: Mon, 18 Dec 2017 09:43:25 +0000
Message-Id: <20171218094327.19562-3-mgorman@techsingularity.net>
X-Mailer: git-send-email 2.15.0
In-Reply-To: <20171218094327.19562-1-mgorman@techsingularity.net>
References: <20171218094327.19562-1-mgorman@techsingularity.net>
Sender: linux-kernel-owner@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org

With the commit "sched: Only migrate tasks due to interrupts if prev
and target CPUs share cache", we no longer migrate a task from interrupt
context if the waking CPU does not share cache with prev_cpu. However, for
a normal wakeup from a cache-affine process, we can miss the fact that
prev_cpu is idle and an appropriate sibling, leading to unnecessary
searches and migrations.

This patch reworks wake_affine to return a suitable CPU to wake on which
may be the current or prev CPU. If wake_affine_idle returns prev due to it
being idle then select_idle_sibling will immediately return the prev_cpu
without searching.

Results are slightly mixed on dbench using ext4, with gains when the
machine is lightly loaded and a small regression, borderline in the noise,
when more than a node's worth of CPUs is used.

                          4.15.0-rc3             4.15.0-rc3
                               noirq               wakeprev
Hmean     1        865.01 (   0.00%)      834.19 (  -3.56%)
Hmean     2       1274.44 (   0.00%)     1353.09 (   6.17%)
Hmean     4       1628.08 (   0.00%)     1714.82 (   5.33%)
Hmean     8       1831.80 (   0.00%)     1855.84 (   1.31%)
Hmean     16      2091.44 (   0.00%)     1975.40 (  -5.55%)
Hmean     32      2430.29 (   0.00%)     2298.58 (  -5.42%)
Hmean     64      2568.54 (   0.00%)     2536.56 (  -1.25%)
Hmean     128     2499.28 (   0.00%)     2543.81 (   1.78%)
Stddev    1          5.35 (   0.00%)       19.39 (-262.63%)
Stddev    2         11.09 (   0.00%)        4.88 (  55.97%)
Stddev    4          6.80 (   0.00%)        9.24 ( -35.93%)
Stddev    8          9.41 (   0.00%)       28.39 (-201.82%)
Stddev    16        20.01 (   0.00%)       44.92 (-124.56%)
Stddev    32        44.74 (   0.00%)       50.14 ( -12.07%)
Stddev    64        93.18 (   0.00%)       84.97 (   8.81%)
Stddev    128      177.85 (   0.00%)      178.00 (  -0.09%)

However, system CPU usage is noticeably reduced:

          4.15.0-rc3  4.15.0-rc3
               noirq    wakeprev
User         1058.32     1077.42
System       5729.22     5287.61
Elapsed      1550.69     1553.09

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 kernel/sched/fair.c | 70 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a1f7d32ecf6..392e08b364bd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5689,17 +5689,21 @@ static int wake_wide(struct task_struct *p)
  * soonest. For the purpose of speed we only consider the waking and previous
  * CPU.
  *
- * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
- *			will be) idle.
+ * wake_affine_idle() - only considers 'now', it checks if a CPU that is
+ *			cache-affine with the waker is idle
+ *
+ * wake_affine_sync() - only considers 'now', it checks if the waking CPU
+ *			will be idle. Migrations to a different NUMA node
+ *			are allowed on the basis that sync wakeups imply
+ *			shared data between waker and wakee.
  *
  * wake_affine_weight() - considers the weight to reflect the average
  *			  scheduling latency of the CPUs. This seems to work
  *			  for the overloaded case.
  */
 
-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
-		 int this_cpu, int prev_cpu, int sync)
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 {
 	/*
 	 * If this_cpu is idle, it implies the wakeup is from interrupt
@@ -5710,13 +5714,36 @@ wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
 	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
 		return true;
 
+	/*
+	 * Prefer migration if it's an interrupt on the assumption that the
+	 * data is cache hot to the CPU receiving the interrupt.
+	 */
+	if (idle_cpu(this_cpu))
+		return this_cpu;
+
+	/*
+	 * For normal wakeups, we use the prev_cpu if it's cache affine but
+	 * for remote wakeups, rely on wake_affine_weight to determine
+	 * if it's best to pull the waker to the wakee. For sync wakeups,
+	 * rely on wake_affine_sync to determine if the task should wakeup
+	 * on the current CPU.
+	 */
+	if (this_cpu != prev_cpu && !sync && idle_cpu(prev_cpu))
+		return prev_cpu;
+
+	return nr_cpumask_bits;
+}
+
+static int
+wake_affine_sync(int this_cpu, int sync)
+{
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
-		return true;
+		return this_cpu;
 
-	return false;
+	return nr_cpumask_bits;
 }
 
-static bool
+static int
 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		   int this_cpu, int prev_cpu, int sync)
 {
@@ -5730,7 +5757,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		unsigned long current_load = task_h_load(current);
 
 		if (current_load > this_eff_load)
-			return true;
+			return this_cpu;
 
 		this_eff_load -= current_load;
 	}
@@ -5747,28 +5774,34 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
 	prev_eff_load *= capacity_of(this_cpu);
 
-	return this_eff_load <= prev_eff_load;
+	if (this_eff_load <= prev_eff_load)
+		return this_cpu;
+	return nr_cpumask_bits;
 }
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine = false;
+	int new_cpu = nr_cpumask_bits;
+
+	if (sched_feat(WA_IDLE))
+		new_cpu = wake_affine_idle(this_cpu, prev_cpu, sync);
 
-	if (sched_feat(WA_IDLE) && !affine)
-		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_IDLE) && new_cpu == nr_cpumask_bits)
+		new_cpu = wake_affine_sync(this_cpu, sync);
 
-	if (sched_feat(WA_WEIGHT) && !affine)
-		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && new_cpu == nr_cpumask_bits)
+		new_cpu = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
-	if (affine) {
+	if (new_cpu != nr_cpumask_bits) {
 		schedstat_inc(sd->ttwu_move_affine);
 		schedstat_inc(p->se.statistics.nr_wakeups_affine);
+		return new_cpu;
 	}
 
-	return affine;
+	return prev_cpu;
 }
 
 static inline int task_util(struct task_struct *p);
@@ -6361,8 +6394,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		if (cpu == prev_cpu)
 			goto pick_cpu;
 
-		if (wake_affine(affine_sd, p, prev_cpu, sync))
-			new_cpu = cpu;
+		new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
 	}
 
 	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
-- 
2.15.0