From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S933585AbdLRJoM (ORCPT ); Mon, 18 Dec 2017 04:44:12 -0500 Received: from outbound-smtp25.blacknight.com ([81.17.249.193]:56847 "EHLO outbound-smtp25.blacknight.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1758305AbdLRJn3 (ORCPT ); Mon, 18 Dec 2017 04:43:29 -0500 From: Mel Gorman To: Peter Zijlstra Cc: Ingo Molnar , Matt Fleming , Mel Gorman , LKML Subject: [PATCH 4/4] sched: Allow tasks to stack with a workqueue on the same CPU Date: Mon, 18 Dec 2017 09:43:27 +0000 Message-Id: <20171218094327.19562-5-mgorman@techsingularity.net> X-Mailer: git-send-email 2.15.0 In-Reply-To: <20171218094327.19562-1-mgorman@techsingularity.net> References: <20171218094327.19562-1-mgorman@techsingularity.net> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org If tasks wake a kworker to do some work and is woken on completion and it was a per-cpu kworker that was used then a situation can arise where the current CPU is always active when the kworker is waking and select_idle_sibling moves the task. This leads to a situation where a task moves around the socket each time a kworker is used even through the relationship is effectively sync. This patch special cases a kworker running on the same CPU. It has a noticable impact on migrations and performance of dbench running with the XFS filesystem but has no impact on ext4 as ext4 interacts with a kthread, not a kworker. 4.15.0-rc3 4.15.0-rc3 wakeprev stackwq Hmean 1 392.92 ( 0.00%) 1024.22 ( 160.67%) Hmean 2 787.09 ( 0.00%) 1808.38 ( 129.75%) Hmean 4 1559.71 ( 0.00%) 2525.42 ( 61.92%) Hmean 8 2576.05 ( 0.00%) 2881.12 ( 11.84%) Hmean 16 2949.28 ( 0.00%) 3137.65 ( 6.39%) Hmean 32 3041.89 ( 0.00%) 3147.92 ( 3.49%) Hmean 64 1655.42 ( 0.00%) 1756.21 ( 6.09%) Hmean 128 1133.19 ( 0.00%) 1165.39 ( 2.84%) Stddev 1 2.59 ( 0.00%) 11.21 (-332.82%) Stddev 2 8.96 ( 0.00%) 13.57 ( -51.44%) Stddev 4 20.15 ( 0.00%) 8.51 ( 57.75%) Stddev 8 17.15 ( 0.00%) 14.45 ( 15.75%) Stddev 16 30.29 ( 0.00%) 31.30 ( -3.33%) Stddev 32 64.45 ( 0.00%) 57.22 ( 11.21%) Stddev 64 55.89 ( 0.00%) 62.84 ( -12.43%) Stddev 128 55.89 ( 0.00%) 62.75 ( -12.27%) There is also a large drop in system CPU usage; 4.15.0-rc3 4.15.0-rc3 wakeprev stackwq User 1561.85 1166.59 System 6961.89 4965.09 Elapsed 1472.05 1471.84 Signed-off-by: Mel Gorman --- kernel/sched/fair.c | 29 +++++++++++++++++++++++++++-- kernel/sched/features.h | 8 ++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95b1145bc38d..cff55481bd19 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5684,6 +5684,19 @@ static int wake_wide(struct task_struct *p) return 1; } +/* + * Returns true if a wakeup is either from or to a workqueue and the tasks + * appear to be synchronised with each other. + */ +static bool +is_wakeup_workqueue_sync(struct task_struct *p, int this_cpu, int prev_cpu) +{ + return sched_feat(WA_STACK_WQ) && + this_cpu == prev_cpu && + ((p->flags & PF_WQ_WORKER) || (current->flags & PF_WQ_WORKER)) && + this_rq()->nr_running <= 1; +} + /* * The purpose of wake_affine() is to quickly determine on which CPU we can run * soonest. For the purpose of speed we only consider the waking and previous @@ -5735,7 +5748,7 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) } static int -wake_affine_sync(int this_cpu, int sync) +wake_affine_sync(struct task_struct *p, int this_cpu, int prev_cpu, int sync) { /* * Consider stacking tasks if it's a sync wakeup and there is only @@ -5745,6 +5758,14 @@ wake_affine_sync(int this_cpu, int sync) if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; + /* + * If the waker or wakee is a workqueue and it appears to be similar + * to a sync wakeup then assume the waker will sleep shortly and allow + * the tasks to stack on the same CPU. + */ + if (is_wakeup_workqueue_sync(p, this_cpu, prev_cpu)) + return this_cpu; + return nr_cpumask_bits; } @@ -5794,7 +5815,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, new_cpu = wake_affine_idle(this_cpu, prev_cpu, sync); if (sched_feat(WA_IDLE) && new_cpu == nr_cpumask_bits) - new_cpu = wake_affine_sync(this_cpu, sync); + new_cpu = wake_affine_sync(p, this_cpu, prev_cpu, sync); if (sched_feat(WA_WEIGHT) && new_cpu == nr_cpumask_bits) new_cpu = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); @@ -6240,6 +6261,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (idle_cpu(target)) return target; + /* Allow a wakeup to stack if it looks like a synchronous workqueue */ + if (is_wakeup_workqueue_sync(p, smp_processor_id(), target)) + return target; + /* * If the previous cpu is cache affine and idle, don't be stupid. */ diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 9552fd5854bf..c96ad246584a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -85,3 +85,11 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true) SCHED_FEAT(WA_IDLE, true) SCHED_FEAT(WA_WEIGHT, true) SCHED_FEAT(WA_BIAS, true) + +/* + * If true then a process may stack with a workqueue on the same CPU during + * wakeup instead of finding an idle sibling. This should only happen in the + * case where there appears to be a strong relationship beween the wq and the + * task e.g. IO operations dispatched to a workqueue on XFS. + */ +SCHED_FEAT(WA_STACK_WQ, true) -- 2.15.0