From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1758493AbdLRJoO (ORCPT ); Mon, 18 Dec 2017 04:44:14 -0500 Received: from outbound-smtp10.blacknight.com ([46.22.139.15]:50142 "EHLO outbound-smtp10.blacknight.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1758275AbdLRJn3 (ORCPT ); Mon, 18 Dec 2017 04:43:29 -0500 From: Mel Gorman To: Peter Zijlstra Cc: Ingo Molnar , Matt Fleming , Mel Gorman , LKML Subject: [PATCH 2/4] sched: Allow a wakee to run on the prev_cpu if it is idle and cache-affine with the waker Date: Mon, 18 Dec 2017 09:43:25 +0000 Message-Id: <20171218094327.19562-3-mgorman@techsingularity.net> X-Mailer: git-send-email 2.15.0 In-Reply-To: <20171218094327.19562-1-mgorman@techsingularity.net> References: <20171218094327.19562-1-mgorman@techsingularity.net> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org With the commit "sched: Only migrate tasks due to interrupts if prev and target CPUs share cache", we no longer migrate a task from interrupt context if the waking CPU does not share cache with prev_cpu. However, for a normal wakeup from a cache-affine process, we can miss the fact that prev_cpu is idle and an appropriate sibling, leading to unnecessary searches and migrations. This patch reworks wake_affine to return a suitable CPU to wake on, which may be the current or prev CPU. If wake_affine_idle returns prev_cpu due to it being idle then select_idle_sibling will immediately return the prev_cpu without searching. The results are slightly mixed on dbench using ext4, with gains when the machine is lightly loaded and a small regression bordering on the noise when more than a node's worth of CPUs is used.
4.15.0-rc3 4.15.0-rc3 noirq wakeprev Hmean 1 865.01 ( 0.00%) 834.19 ( -3.56%) Hmean 2 1274.44 ( 0.00%) 1353.09 ( 6.17%) Hmean 4 1628.08 ( 0.00%) 1714.82 ( 5.33%) Hmean 8 1831.80 ( 0.00%) 1855.84 ( 1.31%) Hmean 16 2091.44 ( 0.00%) 1975.40 ( -5.55%) Hmean 32 2430.29 ( 0.00%) 2298.58 ( -5.42%) Hmean 64 2568.54 ( 0.00%) 2536.56 ( -1.25%) Hmean 128 2499.28 ( 0.00%) 2543.81 ( 1.78%) Stddev 1 5.35 ( 0.00%) 19.39 (-262.63%) Stddev 2 11.09 ( 0.00%) 4.88 ( 55.97%) Stddev 4 6.80 ( 0.00%) 9.24 ( -35.93%) Stddev 8 9.41 ( 0.00%) 28.39 (-201.82%) Stddev 16 20.01 ( 0.00%) 44.92 (-124.56%) Stddev 32 44.74 ( 0.00%) 50.14 ( -12.07%) Stddev 64 93.18 ( 0.00%) 84.97 ( 8.81%) Stddev 128 177.85 ( 0.00%) 178.00 ( -0.09%) However, system CPU usage is noticeably reduced 4.15.0-rc3 4.15.0-rc3 noirq wakeprev User 1058.32 1077.42 System 5729.22 5287.61 Elapsed 1550.69 1553.09 Signed-off-by: Mel Gorman --- kernel/sched/fair.c | 70 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4a1f7d32ecf6..392e08b364bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5689,17 +5689,21 @@ static int wake_wide(struct task_struct *p) * soonest. For the purpose of speed we only consider the waking and previous * CPU. * - * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or - * will be) idle. + * wake_affine_idle() - only considers 'now', it checks if a CPU that is + * cache-affine with the waker is idle + * + * wake_affine_sync() - only considers 'now', it checks if the waking CPU + * will be idle. Migrations to a different NUMA node + * are allowed on the basis that sync wakeups imply + * shared data between waker and wakee. * * wake_affine_weight() - considers the weight to reflect the average * scheduling latency of the CPUs. This seems to work * for the overloaded case.
*/ -static bool -wake_affine_idle(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int prev_cpu, int sync) +static int +wake_affine_idle(int this_cpu, int prev_cpu, int sync) { /* * If this_cpu is idle, it implies the wakeup is from interrupt @@ -5710,13 +5714,36 @@ wake_affine_idle(struct sched_domain *sd, struct task_struct *p, if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return true; + /* + * Prefer migration if it's an interrupt on the assumption that the + * data is cache hot to the CPU receiving the interrupt. + */ + if (idle_cpu(this_cpu)) + return this_cpu; + + /* + * For normal wakeups, we use the prev_cpu if it's cache affine but + * for remote wakeups, rely on wake_affine_weight to determine + * if it's best to pull the waker to the wakee. For sync wakeups, + * rely on wake_affine_sync to determine if the task should wake up + * on the current CPU. + */ + if (this_cpu != prev_cpu && !sync && idle_cpu(prev_cpu)) + return prev_cpu; + + return nr_cpumask_bits; +} + +static int +wake_affine_sync(int this_cpu, int sync) +{ if (sync && cpu_rq(this_cpu)->nr_running == 1) - return true; + return this_cpu; - return false; + return nr_cpumask_bits; } -static bool +static int wake_affine_weight(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5730,7 +5757,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, unsigned long current_load = task_h_load(current); if (current_load > this_eff_load) - return true; + return this_cpu; this_eff_load -= current_load; } @@ -5747,28 +5774,34 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); - return this_eff_load <= prev_eff_load; + if (this_eff_load <= prev_eff_load) + return this_cpu; + return nr_cpumask_bits; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu
= smp_processor_id(); - bool affine = false; + int new_cpu = nr_cpumask_bits; + + if (sched_feat(WA_IDLE)) + new_cpu = wake_affine_idle(this_cpu, prev_cpu, sync); - if (sched_feat(WA_IDLE) && !affine) - affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_IDLE) && new_cpu == nr_cpumask_bits) + new_cpu = wake_affine_sync(this_cpu, sync); - if (sched_feat(WA_WEIGHT) && !affine) - affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_WEIGHT) && new_cpu == nr_cpumask_bits) + new_cpu = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); - if (affine) { + if (new_cpu != nr_cpumask_bits) { schedstat_inc(sd->ttwu_move_affine); schedstat_inc(p->se.statistics.nr_wakeups_affine); + return new_cpu; } - return affine; + return prev_cpu; } static inline int task_util(struct task_struct *p); @@ -6361,8 +6394,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (cpu == prev_cpu) goto pick_cpu; - if (wake_affine(affine_sd, p, prev_cpu, sync)) - new_cpu = cpu; + new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); } if (sd && !(sd_flag & SD_BALANCE_FORK)) { -- 2.15.0