From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751953AbdKZU70 convert rfc822-to-8bit (ORCPT ); Sun, 26 Nov 2017 15:59:26 -0500 Received: from mout.gmx.net ([212.227.17.20]:64232 "EHLO mout.gmx.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751662AbdKZU7Y (ORCPT ); Sun, 26 Nov 2017 15:59:24 -0500 Message-ID: <1511729914.22158.29.camel@gmx.de> Subject: Re: [PATCH RFC 1/2] sched: Minimize the idle cpu selection race window. From: Mike Galbraith To: Uladzislau Rezki Cc: Atish Patra , Peter Zijlstra , Joel Fernandes , LKML , Brendan Jackman , Josef Bacik , Ingo Molnar Date: Sun, 26 Nov 2017 21:58:34 +0100 In-Reply-To: <1511549190.8029.233.camel@gmx.de> References: <1509427662-25114-1-git-send-email-atish.patra@oracle.com> <1509427662-25114-2-git-send-email-atish.patra@oracle.com> <20171031082009.rxxa57goto6q5xld@hirez.programming.kicks-ass.net> <49e98b00-80c7-b3a4-30fd-bccb382d002b@oracle.com> <20171123105247.wcl2fiypge2pvile@pc636> <1511442781.6505.26.camel@gmx.de> <20171124102636.zqqjqa3sru7ebh4k@pc636> <1511549190.8029.233.camel@gmx.de> Content-Type: text/plain; charset="ISO-8859-15" X-Mailer: Evolution 3.20.5 Mime-Version: 1.0 Content-Transfer-Encoding: 8BIT X-Provags-ID: V03:K0:bhW7xRSIKlG7MIB+a+JwabN1Q2qWaW/SqtA7+DMqbZm6LsAhyFX RYqa8t5KO7C6MjTlwyNuzuhVpThUhL+P2AqOLhY4cnUSMfh0YuDjnEzyrjEva/0oWBlibL7 ovHtRZIuxyeRJUoJmxWKSIKMAdvSJebNlY4I0Qcjit/tbQF15hPw6EAsy9o33FjTSqn/9Ai RWZ9a3H9oBTYeR32GdNSA== X-UI-Out-Filterresults: notjunk:1;V01:K0:0h1D4sIujsQ=:8JB6a6e8v+J1U88jVaM71e gO8Sz4UnrgMNEMX44SLlatdIfrfp+rRushEr9ugMdFycMtDVsvvMwnR+AdCXsQiFXp/PFEaxq tQz6yGlTBFlS9eQNbPQG9f0T3Ie06gQLWdqCLh0fHJitfh42ICZPJdBkMSWdHCKAQeUJu2ebP xfet+6h4FfZVfvR5fYSNR66eHO0L2Q6oQQ+/Jr6cwQtyR9MUzoKX0T+bxsHidqvSNk5tMRAKa zGSrPRcgoiqJnYPR++CQx7gAuVRaLvlTg4F/GDC/9vFBX/xqe1OxMQwTCVKNCk6zK4wt+ads/ dS0VFpSEVRwKx4nw+5jpgB8HGP+BCg01oqGbXrzBRb2F0i+LR11l8XGpQnA3WRexUUBce7/Jr 
SaBatojf1yTyWEsamCxOU48SkCK3BQNt2lzf5Wvwo10tBvwdEeBdvcQtJZvTnontEdhi+U65U fw2h0tsDmsXxHUtELbzX3k1wS8GpyxU5U9KK3iPeCC0T0K9elo+CGnLX1SfYA0s0dkt1PAzjR 4Wl9DNPHQaQEF4DdgciTwpbK9KPjpJ5yDDlfU+KodHKTXh9qXaq/qZmY2xT+RSYoijhbd0lwA X8C4URj8eiSVn4Z3ZFTbYZFh/KJot9kz6XqiI/rkyeKp+fIX6Ln6G0bgpwaf3UMHcUiqkxZCK QXhN76kA51kPk96EcdH4F1II6WoNwbpjN28shodLx+QBW2rmVObh34uarhkOuXik1v0r+qJjY iXiv5e4iAsn1vAgOJ3MokdCiWlXP3aP8Ypb4APye1n0Y9hTBuZQ1riNTpwsDwXwqSQX6DUvmI 0Jf08fci10ZQDdDjt8dUZgbTdNMULltChI3Du++LRvKNEtreBA= Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Fri, 2017-11-24 at 19:46 +0100, Mike Galbraith wrote: > > My view is you're barking up the wrong tree: you're making the idle > data SIS is using more accurate, but I question the benefit.  That it > makes an imperfect placement decision occasionally due to raciness is > nearly meaningless compared to the cost of frequent bounce. While playing with SIS (yet again), I put together the hack below, which I think illustrates why the occasional races are nearly meaningless. Box = i4790 desktop. 
master masterx TCP_STREAM-1 Avg: 70495 71295 TCP_STREAM-2 Avg: 54388 66202 TCP_STREAM-4 Avg: 19316 21413 TCP_STREAM-8 Avg: 9678 8894 TCP_STREAM-16 Avg: 4286 4360 TCP_MAERTS-1 Avg: 69238 71799 TCP_MAERTS-2 Avg: 50729 65612 TCP_MAERTS-4 Avg: 19095 21984 TCP_MAERTS-8 Avg: 9405 8888 TCP_MAERTS-16 Avg: 4891 4371 TCP_RR-1 Avg: 198617 203291 TCP_RR-2 Avg: 152862 191761 TCP_RR-4 Avg: 112241 117888 TCP_RR-8 Avg: 104453 113260 TCP_RR-16 Avg: 50897 55280 UDP_RR-1 Avg: 250738 264214 UDP_RR-2 Avg: 196250 253352 UDP_RR-4 Avg: 152862 158819 UDP_RR-8 Avg: 143781 154071 UDP_RR-16 Avg: 68605 76492 tbench 1 2 4 8 16 master 772 1207 1829 3516 3440 masterx 811 1466 1959 3737 3670 hackbench -l 10000 5.917 5.990 5.957 avg 5.954 NO_SIS_DEBOUNCE 5.886 5.808 5.826 avg 5.840 SIS_DEBOUNCE echo 0 > tracing_on echo 1 > events/sched/sched_migrate_task/enable start endless tbench 2 for i in `seq 3` do echo > trace echo 1 > tracing_on sleep 10 echo 0 > tracing_on cat trace|grep tbench|wc -l done kde desktop idling NO_SIS_DEBOUNCE 261 208 199 SIS_DEBOUNCE 8 6 0 add firefox playing youtube documentary NO_SIS_DEBOUNCE 10906 10094 10774 SIS_DEBOUNCE 34 34 34 tbench 2 throughput as firefox runs NO_SIS_DEBOUNCE 1129.63 MB/sec SIS_DEBOUNCE 1462.53 MB/sec Advisory: welding goggles. 
--- include/linux/sched.h | 3 ++- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++++++++++++-- kernel/sched/features.h | 1 + 3 files changed, 42 insertions(+), 3 deletions(-) --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -541,7 +541,6 @@ struct task_struct { unsigned int ptrace; #ifdef CONFIG_SMP - struct llist_node wake_entry; int on_cpu; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ @@ -549,8 +548,10 @@ struct task_struct { #endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; + unsigned long wakee_placed; struct task_struct *last_wakee; + struct llist_node wake_entry; int wake_cpu; #endif int on_rq; --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6174,6 +6174,9 @@ static int select_idle_sibling(struct ta if ((unsigned)i < nr_cpumask_bits) return i; + if (sched_feat(SIS_DEBOUNCE)) + p->wakee_placed = jiffies; + return target; } @@ -6258,6 +6261,22 @@ static int wake_cap(struct task_struct * return min_cap * 1024 < task_util(p) * capacity_margin; } +static bool task_placed(struct task_struct *p) +{ + return p->wakee_placed == jiffies; +} + +static bool task_llc_affine_and_cold(struct task_struct *p, int cpu, int prev) +{ + int cold = sysctl_sched_migration_cost; + + if (!cpus_share_cache(cpu, prev)) + return false; + if (cold > 0 && rq_clock_task(cpu_rq(prev)) - p->se.exec_start > cold) + return true; + return false; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. 
In practice, this is SD_BALANCE_WAKE, @@ -6276,16 +6295,26 @@ select_task_rq_fair(struct task_struct * struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; - int want_affine = 0; + int want_affine = 0, want_debounce = 0; int sync = wake_flags & WF_SYNC; + rcu_read_lock(); if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); + want_debounce = sched_feat(SIS_DEBOUNCE); + if (task_placed(p)) + goto out_unlock; + /* Balance cold tasks to reduce hot task bounce tendency. */ + if (want_debounce && task_llc_affine_and_cold(p, cpu, prev_cpu)) { + sd_flag |= SD_SHARE_PKG_RESOURCES; + sd = highest_flag_domain(prev_cpu, SD_SHARE_PKG_RESOURCES); + p->wakee_placed = jiffies; + goto pick_cpu_cold; + } want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && cpumask_test_cpu(cpu, &p->cpus_allowed); } - rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) break; @@ -6315,6 +6344,7 @@ select_task_rq_fair(struct task_struct * new_cpu = cpu; } +pick_cpu_cold: if (sd && !(sd_flag & SD_BALANCE_FORK)) { /* * We're going to need the task's util for capacity_spare_wake @@ -6329,9 +6359,13 @@ select_task_rq_fair(struct task_struct * if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + if (want_debounce && new_cpu == prev_cpu) + p->wakee_placed = jiffies; + } else { new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } +out_unlock: rcu_read_unlock(); return new_cpu; @@ -6952,6 +6986,9 @@ static int task_hot(struct task_struct * if (sysctl_sched_migration_cost == 0) return 0; + if (task_placed(p)) + return 1; + delta = rq_clock_task(env->src_rq) - p->se.exec_start; return delta < (s64)sysctl_sched_migration_cost; --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) +SCHED_FEAT(SIS_DEBOUNCE, true) /* * Issue a WARN when we do multiple update_rq_clock() calls