public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 1/3] sched: Rename cpus_share_cache to cpus_share_llc
@ 2023-08-18 15:30 Mathieu Desnoyers
  2023-08-18 15:30 ` [RFC PATCH 2/3] sched: Introduce cpus_share_l2c Mathieu Desnoyers
  2023-08-18 15:30 ` [RFC PATCH 3/3] sched: ttwu_queue_cond: skip queued wakeups across different l2 caches Mathieu Desnoyers
  0 siblings, 2 replies; 8+ messages in thread
From: Mathieu Desnoyers @ 2023-08-18 15:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Mathieu Desnoyers, Ingo Molnar, Valentin Schneider,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Vincent Guittot, Juri Lelli,
	Swapnil Sapkal, Aaron Lu, Julien Desfossez, x86

In preparation for introducing cpus_share_l2c, rename cpus_share_cache
to cpus_share_llc, to make it clear that it specifically groups CPUs by
LLC.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Swapnil Sapkal <Swapnil.Sapkal@amd.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Julien Desfossez <jdesfossez@digitalocean.com>
Cc: x86@kernel.org
---
 block/blk-mq.c                 | 2 +-
 include/linux/sched/topology.h | 4 ++--
 kernel/sched/core.c            | 4 ++--
 kernel/sched/fair.c            | 8 ++++----
 kernel/sched/topology.c        | 2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b9f454613989..ed1457ca2c6d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1136,7 +1136,7 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
 	/* same CPU or cache domain?  Complete locally */
 	if (cpu == rq->mq_ctx->cpu ||
 	    (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
-	     cpus_share_cache(cpu, rq->mq_ctx->cpu)))
+	     cpus_share_llc(cpu, rq->mq_ctx->cpu)))
 		return false;
 
 	/* don't try to IPI to an offline CPU */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 816df6cc444e..7f9331f71260 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -178,7 +178,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
-bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_llc(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -227,7 +227,7 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 {
 }
 
-static inline bool cpus_share_cache(int this_cpu, int that_cpu)
+static inline bool cpus_share_llc(int this_cpu, int that_cpu)
 {
 	return true;
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..d096ce815099 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3904,7 +3904,7 @@ void wake_up_if_idle(int cpu)
 	rcu_read_unlock();
 }
 
-bool cpus_share_cache(int this_cpu, int that_cpu)
+bool cpus_share_llc(int this_cpu, int that_cpu)
 {
 	if (this_cpu == that_cpu)
 		return true;
@@ -3929,7 +3929,7 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 	 * If the CPU does not share cache, then queue the task on the
 	 * remote rqs wakelist to avoid accessing remote data.
 	 */
-	if (!cpus_share_cache(smp_processor_id(), cpu))
+	if (!cpus_share_llc(smp_processor_id(), cpu))
 		return true;
 
 	if (cpu == smp_processor_id())
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4da5f3541762..680bbe0c7d7a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6626,7 +6626,7 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+	if (available_idle_cpu(this_cpu) && cpus_share_llc(this_cpu, prev_cpu))
 		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
@@ -7146,7 +7146,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) &&
+	if (prev != target && cpus_share_llc(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
 	    asym_fits_cpu(task_util, util_min, util_max, prev))
 		return prev;
@@ -7172,7 +7172,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	p->recent_used_cpu = prev;
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
-	    cpus_share_cache(recent_used_cpu, target) &&
+	    cpus_share_llc(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
@@ -7206,7 +7206,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (sched_smt_active()) {
 		has_idle_core = test_idle_cores(target);
 
-		if (!has_idle_core && cpus_share_cache(prev, target)) {
+		if (!has_idle_core && cpus_share_llc(prev, target)) {
 			i = select_idle_smt(p, prev);
 			if ((unsigned int)i < nr_cpumask_bits)
 				return i;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6682535e37c8..1ae2a0a1115a 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -661,7 +661,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
  *
  * Also keep a unique ID per domain (we use the first CPU number in
  * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_cache().
+ * two CPUs are in the same cache domain, see cpus_share_llc().
  */
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [RFC PATCH 2/3] sched: Introduce cpus_share_l2c
  2023-08-18 15:30 [RFC PATCH 1/3] sched: Rename cpus_share_cache to cpus_share_llc Mathieu Desnoyers
@ 2023-08-18 15:30 ` Mathieu Desnoyers
  2023-08-18 15:40   ` Mathieu Desnoyers
  2023-08-19 12:31   ` [RFC PATCH v2 " Mathieu Desnoyers
  2023-08-18 15:30 ` [RFC PATCH 3/3] sched: ttwu_queue_cond: skip queued wakeups across different l2 caches Mathieu Desnoyers
  1 sibling, 2 replies; 8+ messages in thread
From: Mathieu Desnoyers @ 2023-08-18 15:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Mathieu Desnoyers, Ingo Molnar, Valentin Schneider,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Vincent Guittot, Juri Lelli,
	Swapnil Sapkal, Aaron Lu, Julien Desfossez, x86

Introduce cpus_share_l2c to allow querying whether two logical CPUs
share a common L2 cache.

Considering a system like the AMD EPYC 9654 96-Core Processor, the L1
cache has a latency of 4-5 cycles, the L2 cache has a latency of at
least 14ns, whereas the L3 cache has a latency of 50ns [1]. Compared to
this, I measured RAM accesses to have a latency of around 120ns on my
system [2]. So L3 really is only 2.4x faster than RAM accesses.
Therefore, with this relatively slow access speed compared to L2, the
scheduler will benefit from only considering CPUs sharing an L2 cache
for the purpose of using remote runqueue locking rather than queued
wakeups.

Link: https://en.wikichip.org/wiki/amd/microarchitectures/zen_4 [1]
Link: https://github.com/ChipsandCheese/MemoryLatencyTest [2]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Swapnil Sapkal <Swapnil.Sapkal@amd.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Julien Desfossez <jdesfossez@digitalocean.com>
Cc: x86@kernel.org
---
 include/linux/sched/topology.h |  6 ++++++
 kernel/sched/core.c            |  8 ++++++++
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        | 20 +++++++++++++++++---
 4 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7f9331f71260..c5fdee188bea 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -178,6 +178,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
+bool cpus_share_l2c(int this_cpu, int that_cpu);
 bool cpus_share_llc(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
@@ -227,6 +228,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 {
 }
 
+static inline bool cpus_share_l2c(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 static inline bool cpus_share_llc(int this_cpu, int that_cpu)
 {
 	return true;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d096ce815099..11e60a69ae31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3904,6 +3904,14 @@ void wake_up_if_idle(int cpu)
 	rcu_read_unlock();
 }
 
+bool cpus_share_l2c(int this_cpu, int that_cpu)
+{
+	if (this_cpu == that_cpu)
+		return true;
+
+	return per_cpu(sd_l2c_id, this_cpu) == per_cpu(sd_l2c_id, that_cpu);
+}
+
 bool cpus_share_llc(int this_cpu, int that_cpu)
 {
 	if (this_cpu == that_cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 81ac605b9cd5..d93543db214c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1828,6 +1828,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 	return sd;
 }
 
+DECLARE_PER_CPU(int, sd_l2c_size);
+DECLARE_PER_CPU(int, sd_l2c_id);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1ae2a0a1115a..41a41f730011 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -661,8 +661,11 @@ static void destroy_sched_domains(struct sched_domain *sd)
  *
  * Also keep a unique ID per domain (we use the first CPU number in
  * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_llc().
+ * two CPUs are in the same cache domain, see cpus_share_l2c() and
+ * cpus_share_llc().
  */
+DEFINE_PER_CPU(int, sd_l2c_size);
+DEFINE_PER_CPU(int, sd_l2c_id);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
@@ -674,10 +677,10 @@ DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
 {
+	const struct cpumask *cluster_mask;
 	struct sched_domain_shared *sds = NULL;
 	struct sched_domain *sd;
-	int id = cpu;
-	int size = 1;
+	int id = cpu, size = 1, l2c_id, l2c_size;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
@@ -686,6 +689,17 @@ static void update_top_cache_domain(int cpu)
 		sds = sd->shared;
 	}
 
+	cluster_mask = topology_cluster_cpumask(cpu);
+	l2c_size = cpumask_weight(cluster_mask);
+	if (l2c_size == 1) {
+		/* Fallback on using LLC. */
+		l2c_size = size;
+		l2c_id = id;
+	}
+	l2c_id = cpumask_first(cluster_mask);
+	per_cpu(sd_l2c_id, cpu) = l2c_id;
+	per_cpu(sd_l2c_size, cpu) = l2c_size;
+
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [RFC PATCH 3/3] sched: ttwu_queue_cond: skip queued wakeups across different l2 caches
  2023-08-18 15:30 [RFC PATCH 1/3] sched: Rename cpus_share_cache to cpus_share_llc Mathieu Desnoyers
  2023-08-18 15:30 ` [RFC PATCH 2/3] sched: Introduce cpus_share_l2c Mathieu Desnoyers
@ 2023-08-18 15:30 ` Mathieu Desnoyers
  2023-08-18 19:27   ` Julien Desfossez
  1 sibling, 1 reply; 8+ messages in thread
From: Mathieu Desnoyers @ 2023-08-18 15:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Mathieu Desnoyers, Ingo Molnar, Valentin Schneider,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Vincent Guittot, Juri Lelli,
	Swapnil Sapkal, Aaron Lu, Julien Desfossez, x86

Considering a system like the AMD EPYC 9654 96-Core Processor, the L1
cache has a latency of 4-5 cycles, the L2 cache has a latency of at
least 14ns, whereas the L3 cache has a latency of 50ns [1]. Compared to
this, I measured RAM accesses to have a latency of around 120ns on my
system [2]. So L3 really is only 2.4x faster than RAM accesses.
Therefore, with this relatively slow access speed compared to L2, the
scheduler will benefit from only considering CPUs sharing an L2 cache
for the purpose of using remote runqueue locking rather than queued
wakeups.

Skipping queued wakeups for all logical CPUs sharing an LLC means that
on a 192-core AMD EPYC 9654 96-Core Processor (over 2 sockets), groups
of 8 cores (16 hardware threads) end up grabbing runqueue locks of other
runqueues within the same group for each wakeup, causing contention on
the runqueue locks.

Improve this by only considering logical CPUs sharing an L2 cache as
candidates for skipping use of the queued wakeups.

This results in the following benchmark improvements:

    hackbench -g 32 -f 20 --threads --pipe -l 480000 -s 100

from 49s to 34s. (30% speedup)

And similarly with perf bench:

    perf bench sched messaging -g 32 -p -t -l 100000

from 10.9s to 7.4s (32% speedup)

I have noticed that in order to observe the speedup, the workload needs
to keep the CPUs sufficiently busy to cause runqueue lock contention,
but not so busy that they don't go idle. This can be explained by the
fact that idle CPUs are a preferred target for task wakeup runqueue
selection, and therefore having idle CPUs causes more migrations, which
triggers more remote wakeups. For both the hackbench and the perf bench
sched messaging benchmarks, the scale of the workload can be tweaked by
changing the number of groups.

This was developed as part of the investigation into a weird regression
reported by AMD where adding a raw spinlock in the scheduler context
switch accelerated hackbench. It turned out that changing this raw
spinlock for a loop of 10000x cpu_relax within do_idle() had similar
benefits.

This patch achieves a similar effect without busy waiting nor changing
anything about runqueue selection on wakeup. It considers that only
hardware threads sharing an L2 cache should skip the queued
try-to-wakeup and directly grab the target runqueue lock, rather than
allowing all hardware threads sharing an LLC to do so.

I would be interested to hear feedback about performance impact of this
patch (improvement or regression) on other workloads and hardware,
especially for Intel CPUs. One thing that we might want to empirically
figure out from the topology is whether there is a maximum number of
hardware threads within an LLC below which it would make sense to use
the LLC rather than L2 as group within which queued wakeups can be
skipped.

Link: https://en.wikichip.org/wiki/amd/microarchitectures/zen_4 [1]
Link: https://github.com/ChipsandCheese/MemoryLatencyTest [2]
Link: https://lore.kernel.org/r/09e0f469-a3f7-62ef-75a1-e64cec2dcfc5@amd.com
Link: https://lore.kernel.org/lkml/20230725193048.124796-1-mathieu.desnoyers@efficios.com/
Link: https://lore.kernel.org/lkml/20230810140635.75296-1-mathieu.desnoyers@efficios.com/
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Swapnil Sapkal <Swapnil.Sapkal@amd.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Julien Desfossez <jdesfossez@digitalocean.com>
Cc: x86@kernel.org
---
 kernel/sched/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11e60a69ae31..317f4cec4653 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3934,10 +3934,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 		return false;
 
 	/*
-	 * If the CPU does not share cache, then queue the task on the
+	 * If the CPU does not share L2 cache, then queue the task on the
 	 * remote rqs wakelist to avoid accessing remote data.
 	 */
-	if (!cpus_share_llc(smp_processor_id(), cpu))
+	if (!cpus_share_l2c(smp_processor_id(), cpu))
 		return true;
 
 	if (cpu == smp_processor_id())
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH 2/3] sched: Introduce cpus_share_l2c
  2023-08-18 15:30 ` [RFC PATCH 2/3] sched: Introduce cpus_share_l2c Mathieu Desnoyers
@ 2023-08-18 15:40   ` Mathieu Desnoyers
  2023-08-19 12:31   ` [RFC PATCH v2 " Mathieu Desnoyers
  1 sibling, 0 replies; 8+ messages in thread
From: Mathieu Desnoyers @ 2023-08-18 15:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Ingo Molnar, Valentin Schneider, Steven Rostedt,
	Ben Segall, Mel Gorman, Daniel Bristot de Oliveira,
	Vincent Guittot, Juri Lelli, Swapnil Sapkal, Aaron Lu,
	Julien Desfossez, x86

On 8/18/23 11:30, Mathieu Desnoyers wrote:
[...]
>   
> +	cluster_mask = topology_cluster_cpumask(cpu);
> +	l2c_size = cpumask_weight(cluster_mask);
> +	if (l2c_size == 1) {
> +		/* Fallback on using LLC. */
> +		l2c_size = size;
> +		l2c_id = id;
> +	}

For the fallback case, this should be:

         cluster_mask = topology_cluster_cpumask(cpu);
         l2c_size = cpumask_weight(cluster_mask);
         if (l2c_size == 1) {
                 /* Fallback on using LLC. */
                 l2c_size = size;
                 l2c_id = id;
         } else {
                 l2c_id = cpumask_first(cluster_mask);
         }
         per_cpu(sd_l2c_id, cpu) = l2c_id;
         per_cpu(sd_l2c_size, cpu) = l2c_size;

Thanks,

Mathieu


> +	l2c_id = cpumask_first(cluster_mask);
> +	per_cpu(sd_l2c_id, cpu) = l2c_id;
> +	per_cpu(sd_l2c_size, cpu) = l2c_size;
> +
>   	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>   	per_cpu(sd_llc_size, cpu) = size;
>   	per_cpu(sd_llc_id, cpu) = id;

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH 3/3] sched: ttwu_queue_cond: skip queued wakeups across different l2 caches
  2023-08-18 15:30 ` [RFC PATCH 3/3] sched: ttwu_queue_cond: skip queued wakeups across different l2 caches Mathieu Desnoyers
@ 2023-08-18 19:27   ` Julien Desfossez
  0 siblings, 0 replies; 8+ messages in thread
From: Julien Desfossez @ 2023-08-18 19:27 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Valentin Schneider,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Vincent Guittot, Juri Lelli,
	Swapnil Sapkal, Aaron Lu, x86

On 18-Aug-2023 11:30:27 AM, Mathieu Desnoyers wrote:
> Considering a system like the AMD EPYC 9654 96-Core Processor, the L1
> cache has a latency of 4-5 cycles, the L2 cache has a latency of at
> least 14ns, whereas the L3 cache has a latency of 50ns [1]. Compared to
> this, I measured RAM accesses to have a latency of around 120ns on my
> system [2]. So L3 really is only 2.4x faster than RAM accesses.
> Therefore, with this relatively slow access speed compared to L2, the
> scheduler will benefit from only considering CPUs sharing an L2 cache
> for the purpose of using remote runqueue locking rather than queued
> wakeups.
> 
> Skipping queued wakeups for all logical CPUs sharing an LLC means that
> on a 192-core AMD EPYC 9654 96-Core Processor (over 2 sockets), groups
> of 8 cores (16 hardware threads) end up grabbing runqueue locks of other
> runqueues within the same group for each wakeup, causing contention on
> the runqueue locks.
> 
> Improve this by only considering logical CPUs sharing an L2 cache as
> candidates for skipping use of the queued wakeups.
> 
> This results in the following benchmark improvements:
> 
>     hackbench -g 32 -f 20 --threads --pipe -l 480000 -s 100
> 
> from 49s to 34s. (30% speedup)
> 
> And similarly with perf bench:
> 
>     perf bench sched messaging -g 32 -p -t -l 100000
> 
> from 10.9s to 7.4s (32% speedup)
> 
> I have noticed that in order to observe the speedup, the workload needs
> to keep the CPUs sufficiently busy to cause runqueue lock contention,
> but not so busy that they don't go idle. This can be explained by the
> fact that idle CPUs are a preferred target for task wakeup runqueue
> selection, and therefore having idle CPUs causes more migrations, which
> triggers more remote wakeups. For both the hackbench and the perf bench
> sched messaging benchmarks, the scale of the workload can be tweaked by
> changing the number of groups.
> 
> This was developed as part of the investigation into a weird regression
> reported by AMD where adding a raw spinlock in the scheduler context
> switch accelerated hackbench. It turned out that changing this raw
> spinlock for a loop of 10000x cpu_relax within do_idle() had similar
> benefits.
> 
> This patch achieves a similar effect without busy waiting nor changing
> anything about runqueue selection on wakeup. It considers that only
> hardware threads sharing an L2 cache should skip the queued
> try-to-wakeup and directly grab the target runqueue lock, rather than
> allowing all hardware threads sharing an LLC to do so.
> 
> I would be interested to hear feedback about performance impact of this
> patch (improvement or regression) on other workloads and hardware,
> especially for Intel CPUs. One thing that we might want to empirically
> figure out from the topology is whether there is a maximum number of
> hardware threads within an LLC below which it would make sense to use
> the LLC rather than L2 as group within which queued wakeups can be
> skipped.

I just tested this patchset with the above perf bench sched on Intel Xeon
Platinum 8358 CPU with 2 sockets, 32 cores (64 hardware threads) per socket.

Here are the results by group size (the -g parameter):
group  A_idle   B_idle   A_avg    A_stdev  B_avg    B_stdev  diff_percent
2      51.9429  50.8     3.45583  0.333    3.5144   0.379    -1.69
4      29.0722  29.475   4.0825   0.067    3.9498   0.049    3.250
6      17.9737  18.7158  4.991    0.026    4.5654   0.051    8.527
8      12.8167  13.1667  5.9366   0.046    5.1026   0.049    14.04
10     9.608    7.08095  6.6506   0.038    5.6814   0.064    14.57
12     6.91034  3.1      7.2526   0.048    6.7408   0.170    7.056
14     5.48333  2.76429  7.9436   0.085    8.1394   0.124    -2.46
16     4.475    3.21562  9.184    0.277    9.544    0.302    -3.91
18     4.60811  2.65588  10.45    0.279    10.46    0.243    -.095
20     4.35349  3.40857  11.7954  0.304    10.9482  0.268    7.182
22     4.63778  3.25714  13.0338  0.294    11.7888  0.302    9.552
24     4.49216  3.5      14.2888  0.251    13.1898  0.344    7.691
26     4.67288  3.42308  15.7286  0.231    14.3678  0.495    8.651
28     4.9375   3.68929  17.4738  0.714    15.3134  0.541    12.36
30     4.65538  4.47286  18.678   0.486    18.2136  1.413    2.486
32     4.28904  4.25479  19.6096  0.168    18.4308  0.778    6.011
34     4.49342  4.09146  20.9478  0.184    21.0388  1.349    -.434
36     4.47439  4.04945  22.3406  0.322    23.606   1.060    -5.66

The avg and stdev are based on 5 runs of perf bench sched.

"A" is the baseline captured on 6.1.42, "B" is 6.1.42 + this patchset.
The _idle columns are a rough sampling of the %idle CPU time while perf
is running.

This is really encouraging and it seems to match an issue I have
observed in production for a while where the scheduling latencies seem
too high proportionally to the %idle on large servers. Those servers are
hypervisors basically running a lot of random workloads in VMs. I am
working on a synthetic test case that could help confirm this is the
same issue or not.

Thanks,

Julien

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [RFC PATCH v2 2/3] sched: Introduce cpus_share_l2c
  2023-08-18 15:30 ` [RFC PATCH 2/3] sched: Introduce cpus_share_l2c Mathieu Desnoyers
  2023-08-18 15:40   ` Mathieu Desnoyers
@ 2023-08-19 12:31   ` Mathieu Desnoyers
  2023-08-22  7:58     ` Aaron Lu
  1 sibling, 1 reply; 8+ messages in thread
From: Mathieu Desnoyers @ 2023-08-19 12:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Mathieu Desnoyers, Ingo Molnar, Valentin Schneider,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Vincent Guittot, Juri Lelli,
	Swapnil Sapkal, Aaron Lu, Julien Desfossez, x86

Introduce cpus_share_l2c to allow querying whether two logical CPUs
share a common L2 cache.

Considering a system like the AMD EPYC 9654 96-Core Processor, the L1
cache has a latency of 4-5 cycles, the L2 cache has a latency of at
least 14ns, whereas the L3 cache has a latency of 50ns [1]. Compared to
this, I measured RAM accesses to have a latency of around 120ns on my
system [2]. So L3 really is only 2.4x faster than RAM accesses.
Therefore, with this relatively slow access speed compared to L2, the
scheduler will benefit from only considering CPUs sharing an L2 cache
for the purpose of using remote runqueue locking rather than queued
wakeups.

Link: https://en.wikichip.org/wiki/amd/microarchitectures/zen_4 [1]
Link: https://github.com/ChipsandCheese/MemoryLatencyTest [2]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Swapnil Sapkal <Swapnil.Sapkal@amd.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Julien Desfossez <jdesfossez@digitalocean.com>
Cc: x86@kernel.org
---
Changes since v1:
- Fix l2c id for configurations where L2 have a single logical CPU:
  use TOPOLOGY_CLUSTER_SYSFS to find out whether topology cluster is
  implemented or if LLC should be used as fallback.
---
 include/linux/sched/topology.h |  6 ++++++
 kernel/sched/core.c            |  8 ++++++++
 kernel/sched/sched.h           |  2 ++
 kernel/sched/topology.c        | 32 +++++++++++++++++++++++++++++---
 4 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7f9331f71260..c5fdee188bea 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -178,6 +178,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
+bool cpus_share_l2c(int this_cpu, int that_cpu);
 bool cpus_share_llc(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
@@ -227,6 +228,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 {
 }
 
+static inline bool cpus_share_l2c(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 static inline bool cpus_share_llc(int this_cpu, int that_cpu)
 {
 	return true;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d096ce815099..11e60a69ae31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3904,6 +3904,14 @@ void wake_up_if_idle(int cpu)
 	rcu_read_unlock();
 }
 
+bool cpus_share_l2c(int this_cpu, int that_cpu)
+{
+	if (this_cpu == that_cpu)
+		return true;
+
+	return per_cpu(sd_l2c_id, this_cpu) == per_cpu(sd_l2c_id, that_cpu);
+}
+
 bool cpus_share_llc(int this_cpu, int that_cpu)
 {
 	if (this_cpu == that_cpu)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 81ac605b9cd5..d93543db214c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1828,6 +1828,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 	return sd;
 }
 
+DECLARE_PER_CPU(int, sd_l2c_size);
+DECLARE_PER_CPU(int, sd_l2c_id);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1ae2a0a1115a..2a672cbcd67e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -661,8 +661,11 @@ static void destroy_sched_domains(struct sched_domain *sd)
  *
  * Also keep a unique ID per domain (we use the first CPU number in
  * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_llc().
+ * two CPUs are in the same cache domain, see cpus_share_l2c() and
+ * cpus_share_llc().
  */
+DEFINE_PER_CPU(int, sd_l2c_size);
+DEFINE_PER_CPU(int, sd_l2c_id);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
@@ -672,12 +675,27 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
+#ifdef TOPOLOGY_CLUSTER_SYSFS
+static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
+{
+	const struct cpumask *cluster_mask = topology_cluster_cpumask(cpu);
+
+	*l2c_size = cpumask_weight(cluster_mask);
+	*l2c_id = cpumask_first(cluster_mask);
+	return 0;
+}
+#else
+static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
+{
+	return -1;
+}
+#endif
+
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain_shared *sds = NULL;
 	struct sched_domain *sd;
-	int id = cpu;
-	int size = 1;
+	int id = cpu, size = 1, l2c_id, l2c_size;
 
 	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
 	if (sd) {
@@ -686,6 +704,14 @@ static void update_top_cache_domain(int cpu)
 		sds = sd->shared;
 	}
 
+	if (cpu_get_l2c_info(cpu, &l2c_id, &l2c_size)) {
+		/* Fallback on using LLC. */
+		l2c_size = size;
+		l2c_id = id;
+	}
+	per_cpu(sd_l2c_size, cpu) = l2c_size;
+	per_cpu(sd_l2c_id, cpu) = l2c_id;
+
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH v2 2/3] sched: Introduce cpus_share_l2c
  2023-08-19 12:31   ` [RFC PATCH v2 " Mathieu Desnoyers
@ 2023-08-22  7:58     ` Aaron Lu
  2023-08-22 11:12       ` Mathieu Desnoyers
  0 siblings, 1 reply; 8+ messages in thread
From: Aaron Lu @ 2023-08-22  7:58 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Valentin Schneider,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Vincent Guittot, Juri Lelli,
	Swapnil Sapkal, Julien Desfossez, x86

On Sat, Aug 19, 2023 at 08:31:55AM -0400, Mathieu Desnoyers wrote:
> +#ifdef TOPOLOGY_CLUSTER_SYSFS
> +static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
> +{
> +	const struct cpumask *cluster_mask = topology_cluster_cpumask(cpu);
> +
> +	*l2c_size = cpumask_weight(cluster_mask);
> +	*l2c_id = cpumask_first(cluster_mask);
> +	return 0;
> +}
> +#else
> +static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
> +{
> +	return -1;
> +}
> +#endif
> +
>  static void update_top_cache_domain(int cpu)
>  {
>  	struct sched_domain_shared *sds = NULL;
>  	struct sched_domain *sd;
> -	int id = cpu;
> -	int size = 1;
> +	int id = cpu, size = 1, l2c_id, l2c_size;
>  
>  	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>  	if (sd) {
> @@ -686,6 +704,14 @@ static void update_top_cache_domain(int cpu)
>  		sds = sd->shared;
>  	}
>  
> +	if (cpu_get_l2c_info(cpu, &l2c_id, &l2c_size)) {

Should be:  cpu_get_l2c_info(cpu, &l2c_size, &l2c_id)

> +		/* Fallback on using LLC. */
> +		l2c_size = size;
> +		l2c_id = id;
> +	}
> +	per_cpu(sd_l2c_size, cpu) = l2c_size;
> +	per_cpu(sd_l2c_id, cpu) = l2c_id;
> +
>  	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>  	per_cpu(sd_llc_size, cpu) = size;
>  	per_cpu(sd_llc_id, cpu) = id;
> -- 
> 2.39.2
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [RFC PATCH v2 2/3] sched: Introduce cpus_share_l2c
  2023-08-22  7:58     ` Aaron Lu
@ 2023-08-22 11:12       ` Mathieu Desnoyers
  0 siblings, 0 replies; 8+ messages in thread
From: Mathieu Desnoyers @ 2023-08-22 11:12 UTC (permalink / raw)
  To: Aaron Lu
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Valentin Schneider,
	Steven Rostedt, Ben Segall, Mel Gorman,
	Daniel Bristot de Oliveira, Vincent Guittot, Juri Lelli,
	Swapnil Sapkal, Julien Desfossez, x86

On 8/22/23 03:58, Aaron Lu wrote:
> On Sat, Aug 19, 2023 at 08:31:55AM -0400, Mathieu Desnoyers wrote:
>> +#ifdef TOPOLOGY_CLUSTER_SYSFS
>> +static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
>> +{
>> +	const struct cpumask *cluster_mask = topology_cluster_cpumask(cpu);
>> +
>> +	*l2c_size = cpumask_weight(cluster_mask);
>> +	*l2c_id = cpumask_first(cluster_mask);
>> +	return 0;
>> +}
>> +#else
>> +static int cpu_get_l2c_info(int cpu, int *l2c_size, int *l2c_id)
>> +{
>> +	return -1;
>> +}
>> +#endif
>> +
>>   static void update_top_cache_domain(int cpu)
>>   {
>>   	struct sched_domain_shared *sds = NULL;
>>   	struct sched_domain *sd;
>> -	int id = cpu;
>> -	int size = 1;
>> +	int id = cpu, size = 1, l2c_id, l2c_size;
>>   
>>   	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
>>   	if (sd) {
>> @@ -686,6 +704,14 @@ static void update_top_cache_domain(int cpu)
>>   		sds = sd->shared;
>>   	}
>>   
>> +	if (cpu_get_l2c_info(cpu, &l2c_id, &l2c_size)) {
> 
> Should be:  cpu_get_l2c_info(cpu, &l2c_size, &l2c_id)

Good point! Thanks for spotting it. I will fix it in a v3.

Mathieu

> 
>> +		/* Fallback on using LLC. */
>> +		l2c_size = size;
>> +		l2c_id = id;
>> +	}
>> +	per_cpu(sd_l2c_size, cpu) = l2c_size;
>> +	per_cpu(sd_l2c_id, cpu) = l2c_id;
>> +
>>   	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
>>   	per_cpu(sd_llc_size, cpu) = size;
>>   	per_cpu(sd_llc_id, cpu) = id;
>> -- 
>> 2.39.2
>>

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2023-08-22 11:12 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-08-18 15:30 [RFC PATCH 1/3] sched: Rename cpus_share_cache to cpus_share_llc Mathieu Desnoyers
2023-08-18 15:30 ` [RFC PATCH 2/3] sched: Introduce cpus_share_l2c Mathieu Desnoyers
2023-08-18 15:40   ` Mathieu Desnoyers
2023-08-19 12:31   ` [RFC PATCH v2 " Mathieu Desnoyers
2023-08-22  7:58     ` Aaron Lu
2023-08-22 11:12       ` Mathieu Desnoyers
2023-08-18 15:30 ` [RFC PATCH 3/3] sched: ttwu_queue_cond: skip queued wakeups across different l2 caches Mathieu Desnoyers
2023-08-18 19:27   ` Julien Desfossez

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox