All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mike Galbraith <efault@gmx.de>
To: Uladzislau Rezki <urezki@gmail.com>
Cc: Atish Patra <atish.patra@oracle.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Joel Fernandes <joelaf@google.com>,
	LKML <linux-kernel@vger.kernel.org>,
	Brendan Jackman <brendan.jackman@arm.com>,
	Josef Bacik <jbacik@fb.com>, Ingo Molnar <mingo@redhat.com>
Subject: Re: [PATCH RFC 1/2] sched: Minimize the idle cpu selection race window.
Date: Sun, 26 Nov 2017 21:58:34 +0100	[thread overview]
Message-ID: <1511729914.22158.29.camel@gmx.de> (raw)
In-Reply-To: <1511549190.8029.233.camel@gmx.de>

On Fri, 2017-11-24 at 19:46 +0100, Mike Galbraith wrote:
> 
> My view is you're barking up the wrong tree: you're making the idle
> data SIS is using more accurate, but I question the benefit.  That it
> makes an imperfect placement decision occasionally due to raciness is
> nearly meaningless compared to the cost of frequent bounce.

Playing with SIS (yet again), below is a hack that I think illustrates
why I think the occasional races are nearly meaningless.

Box = i4790 desktop.

                       master   masterx
TCP_STREAM-1      Avg:  70495     71295
TCP_STREAM-2      Avg:  54388     66202
TCP_STREAM-4      Avg:  19316     21413
TCP_STREAM-8      Avg:   9678      8894
TCP_STREAM-16     Avg:   4286      4360
TCP_MAERTS-1      Avg:  69238     71799
TCP_MAERTS-2      Avg:  50729     65612
TCP_MAERTS-4      Avg:  19095     21984
TCP_MAERTS-8      Avg:   9405      8888
TCP_MAERTS-16     Avg:   4891      4371
TCP_RR-1          Avg: 198617    203291
TCP_RR-2          Avg: 152862    191761
TCP_RR-4          Avg: 112241    117888
TCP_RR-8          Avg: 104453    113260
TCP_RR-16         Avg:  50897     55280
UDP_RR-1          Avg: 250738    264214
UDP_RR-2          Avg: 196250    253352
UDP_RR-4          Avg: 152862    158819
UDP_RR-8          Avg: 143781    154071
UDP_RR-16         Avg:  68605     76492


tbench     1       2       4       8      16
master   772    1207    1829    3516    3440
masterx  811    1466    1959    3737    3670

hackbench -l 10000
5.917   5.990   5.957  avg 5.954   NO_SIS_DEBOUNCE
5.886   5.808   5.826  avg 5.840   SIS_DEBOUNCE


echo 0 > tracing_on
echo 1 > events/sched/sched_migrate_task/enable
start endless tbench 2

for i in `seq 3`
do
	echo > trace
	echo 1 > tracing_on
	sleep 10
	echo 0 > tracing_on
	cat trace|grep tbench|wc -l
done

kde desktop idling

NO_SIS_DEBOUNCE    261     208     199
SIS_DEBOUNCE         8       6       0

add firefox playing youtube documentary

NO_SIS_DEBOUNCE  10906   10094   10774
SIS_DEBOUNCE        34      34      34

tbench 2 throughput as firefox runs

NO_SIS_DEBOUNCE  1129.63 MB/sec
SIS_DEBOUNCE     1462.53 MB/sec

Advisory: welding goggles.

---
 include/linux/sched.h   |    3 ++-
 kernel/sched/fair.c     |   41 +++++++++++++++++++++++++++++++++++++++--
 kernel/sched/features.h |    1 +
 3 files changed, 42 insertions(+), 3 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -541,7 +541,6 @@ struct task_struct {
 	unsigned int			ptrace;
 
 #ifdef CONFIG_SMP
-	struct llist_node		wake_entry;
 	int				on_cpu;
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/* Current CPU: */
@@ -549,8 +548,10 @@ struct task_struct {
 #endif
 	unsigned int			wakee_flips;
 	unsigned long			wakee_flip_decay_ts;
+	unsigned long			wakee_placed;
 	struct task_struct		*last_wakee;
 
+	struct llist_node		wake_entry;
 	int				wake_cpu;
 #endif
 	int				on_rq;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6174,6 +6174,9 @@ static int select_idle_sibling(struct ta
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	if (sched_feat(SIS_DEBOUNCE))
+		p->wakee_placed = jiffies;
+
 	return target;
 }
 
@@ -6258,6 +6261,22 @@ static int wake_cap(struct task_struct *
 	return min_cap * 1024 < task_util(p) * capacity_margin;
 }
 
+static bool task_placed(struct task_struct *p)
+{
+	return p->wakee_placed == jiffies;
+}
+
+static bool task_llc_affine_and_cold(struct task_struct *p, int cpu, int prev)
+{
+	int cold = sysctl_sched_migration_cost;
+
+	if (!cpus_share_cache(cpu, prev))
+		return false;
+	if (cold > 0 && rq_clock_task(cpu_rq(prev)) - p->se.exec_start > cold)
+		return true;
+	return false;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6276,16 +6295,26 @@ select_task_rq_fair(struct task_struct *
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
-	int want_affine = 0;
+	int want_affine = 0, want_debounce = 0;
 	int sync = wake_flags & WF_SYNC;
 
+	rcu_read_lock();
 	if (sd_flag & SD_BALANCE_WAKE) {
 		record_wakee(p);
+		want_debounce = sched_feat(SIS_DEBOUNCE);
+		if (task_placed(p))
+			goto out_unlock;
+		/* Balance cold tasks to reduce hot task bounce tendency. */
+		if (want_debounce && task_llc_affine_and_cold(p, cpu, prev_cpu)) {
+			sd_flag |= SD_SHARE_PKG_RESOURCES;
+			sd = highest_flag_domain(prev_cpu, SD_SHARE_PKG_RESOURCES);
+			p->wakee_placed = jiffies;
+			goto pick_cpu_cold;
+		}
 		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
 			      && cpumask_test_cpu(cpu, &p->cpus_allowed);
 	}
 
-	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		if (!(tmp->flags & SD_LOAD_BALANCE))
 			break;
@@ -6315,6 +6344,7 @@ select_task_rq_fair(struct task_struct *
 			new_cpu = cpu;
 	}
 
+pick_cpu_cold:
 	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
 		/*
 		 * We're going to need the task's util for capacity_spare_wake
@@ -6329,9 +6359,13 @@ select_task_rq_fair(struct task_struct *
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
+		if (want_debounce && new_cpu == prev_cpu)
+			p->wakee_placed = jiffies;
+
 	} else {
 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 	}
+out_unlock:
 	rcu_read_unlock();
 
 	return new_cpu;
@@ -6952,6 +6986,9 @@ static int task_hot(struct task_struct *
 	if (sysctl_sched_migration_cost == 0)
 		return 0;
 
+	if (task_placed(p))
+		return 1;
+
 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
 
 	return delta < (s64)sysctl_sched_migration_cost;
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -57,6 +57,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_DEBOUNCE, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls

  reply	other threads:[~2017-11-26 20:59 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-10-31  5:27 [PATCH RFC 0/2] Fix race window during idle cpu selection Atish Patra
2017-10-31  5:27 ` [PATCH RFC 1/2] sched: Minimize the idle cpu selection race window Atish Patra
2017-10-31  8:20   ` Peter Zijlstra
2017-10-31  8:48     ` Mike Galbraith
2017-11-01  6:08       ` Atish Patra
2017-11-01  6:54         ` Mike Galbraith
2017-11-01  7:18           ` Mike Galbraith
2017-11-01 16:36             ` Atish Patra
2017-11-01 20:20               ` Mike Galbraith
2017-11-05  0:58     ` Joel Fernandes
2017-11-22  5:23       ` Atish Patra
2017-11-23 10:52         ` Uladzislau Rezki
2017-11-23 13:13           ` Mike Galbraith
2017-11-23 16:00             ` Josef Bacik
2017-11-23 17:40               ` Mike Galbraith
2017-11-23 21:11               ` Atish Patra
2017-11-24 10:26             ` Uladzislau Rezki
2017-11-24 18:46               ` Mike Galbraith
2017-11-26 20:58                 ` Mike Galbraith [this message]
2017-11-28  9:34                 ` Uladzislau Rezki
2017-11-28 10:49                   ` Mike Galbraith
2017-11-29 10:41                     ` Uladzislau Rezki
2017-11-29 18:15                       ` Mike Galbraith
2017-11-30 12:30                         ` Uladzislau Rezki
2017-10-31  5:27 ` [PATCH DEBUG 2/2] sched: Add a stat for " Atish Patra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1511729914.22158.29.camel@gmx.de \
    --to=efault@gmx.de \
    --cc=atish.patra@oracle.com \
    --cc=brendan.jackman@arm.com \
    --cc=jbacik@fb.com \
    --cc=joelaf@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=urezki@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.