From: Pan Deng <pan.deng@intel.com>
To: peterz@infradead.org, mingo@kernel.org
Cc: linux-kernel@vger.kernel.org, tianyou.li@intel.com,
	tim.c.chen@linux.intel.com, yu.c.chen@intel.com,
	pan.deng@intel.com
Subject: [PATCH 3/4] sched/rt: Split root_domain->rto_count to per-NUMA-node counters
Date: Mon,  7 Jul 2025 10:35:27 +0800
Message-ID: <2c1e1dbacaddd881f3cca340ece1f9268029b620.1751852370.git.pan.deng@intel.com>
In-Reply-To: <cover.1751852370.git.pan.deng@intel.com>

When running a multi-instance FFmpeg workload on an HCC (high core
count) system, significant contention is observed on the root_domain
`rto_count` and `overloaded` fields.

The SUT is a 2-socket machine with 240 physical cores and 480 logical
CPUs. 60 FFmpeg instances are launched, each pinned to 4 physical cores
(8 logical CPUs) for transcoding tasks. Sub-threads run at RT priority
99 with FIFO scheduling, and FPS is used as the score.
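
For reference, one way to launch such an instance is with taskset(1)
and chrt(1); this is a sketch only, and the core lists and FFmpeg
arguments below are placeholders rather than the exact ones used here:

  # pin to 4 physical cores plus their SMT siblings (hypothetical IDs),
  # and run the pipeline as SCHED_FIFO at priority 99
  taskset -c 0-3,240-243 chrt -f 99 ffmpeg -i in.mp4 out.mp4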

The perf c2c tool reveals, for root_domain cache line 1:
- `rto_count` (offset 0x4) is frequently loaded and stored
- `overloaded` (offset 0x28) is heavily loaded
- cycles per load range from ~2.8K to ~44K

A separate patch rearranges root_domain to place `overloaded` on a
different cache line, but that alone is insufficient to resolve the
contention on `rto_count`. As a complement, this patch splits
`rto_count` into per-NUMA-node counters to reduce the contention.

With this change:
- FPS improves by ~4%
- Kernel cycles% drops from ~20% to ~18.6%
- The cache line no longer appears in the perf c2c report

Appendix:
1. Perf c2c report of root_domain cache line 1:
-------  -------  ------  ------  ------  ------  ------------------------
 Rmt      Lcl     Store   Data    Load    Total    Symbol
Hitm%    Hitm%   L1 Hit%  offset  cycles  records
-------  -------  ------  ------  ------  ------  ------------------------
 231       43       48    0xff14d42c400e3800
-------  -------  ------  ------  ------  ------  ------------------------
22.51%   18.60%    0.00%  0x4     5041    247   pull_rt_task
 5.63%    2.33%   45.83%  0x4     6995    315   dequeue_pushable_task
 3.90%    4.65%   54.17%  0x4     6587    370   enqueue_pushable_task
 0.43%    0.00%    0.00%  0x4     17111   4     enqueue_pushable_task
 0.43%    0.00%    0.00%  0x4     44062   4     dequeue_pushable_task
32.03%   27.91%    0.00%  0x28    6393    285   enqueue_task_rt
16.45%   27.91%    0.00%  0x28    5534    139   sched_balance_newidle
14.72%   18.60%    0.00%  0x28    5287    110   dequeue_task_rt
 3.46%    0.00%    0.00%  0x28    2820    25    enqueue_task_fair
 0.43%    0.00%    0.00%  0x28    220     3     enqueue_task_stop
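
2. Collecting the report:
A report like the one above can be obtained with the perf c2c tool
(a minimal sketch; the 30-second sampling window is arbitrary):

  # sample cache-to-cache (HITM) traffic system-wide for 30 seconds
  perf c2c record -a -- sleep 30
  # then inspect contended cache lines and their field offsets
  perf c2c report --stdio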

Signed-off-by: Pan Deng <pan.deng@intel.com>
Reviewed-by: Tianyou Li <tianyou.li@intel.com>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
---
 kernel/sched/rt.c       | 65 +++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h    |  9 +++++-
 kernel/sched/topology.c |  7 +++++
 3 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e40422c37033..cc820dbde6d6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -337,9 +337,58 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 	return rq->online && rq->rt.highest_prio.curr > prev->prio;
 }
 
+int rto_counts_init(atomic_tp **rto_counts)
+{
+	int i;
+	atomic_tp *counts = kzalloc(nr_node_ids * sizeof(atomic_tp), GFP_KERNEL);
+
+	if (!counts)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		counts[i] = kzalloc_node(sizeof(atomic_t), GFP_KERNEL, i);
+
+		if (!counts[i])
+			goto cleanup;
+	}
+
+	*rto_counts = counts;
+	return 0;
+
+cleanup:
+	while (i--)
+		kfree(counts[i]);
+
+	kfree(counts);
+	return -ENOMEM;
+}
+
+void rto_counts_cleanup(atomic_tp *rto_counts)
+{
+	for (int i = 0; i < nr_node_ids; i++)
+		kfree(rto_counts[i]);
+
+	kfree(rto_counts);
+}
+
 static inline int rt_overloaded(struct rq *rq)
 {
-	return atomic_read(&rq->rd->rto_count);
+	int count = 0;
+	int cur_node, nid;
+
+	cur_node = numa_node_id();
+
+	for (int i = 0; i < nr_node_ids; i++) {
+		nid = (cur_node + i) % nr_node_ids;
+		count += atomic_read(rq->rd->rto_counts[nid]);
+
+	/* The caller only distinguishes 0, 1, or more, */
+	/* so return as soon as the count exceeds 1 */
+		if (count > 1)
+			return count;
+	}
+
+	return count;
 }
 
 static inline void rt_set_overload(struct rq *rq)
@@ -358,7 +407,7 @@ static inline void rt_set_overload(struct rq *rq)
 	 * Matched by the barrier in pull_rt_task().
 	 */
 	smp_wmb();
-	atomic_inc(&rq->rd->rto_count);
+	atomic_inc(rq->rd->rto_counts[cpu_to_node(rq->cpu)]);
 }
 
 static inline void rt_clear_overload(struct rq *rq)
@@ -367,7 +416,7 @@ static inline void rt_clear_overload(struct rq *rq)
 		return;
 
 	/* the order here really doesn't matter */
-	atomic_dec(&rq->rd->rto_count);
+	atomic_dec(rq->rd->rto_counts[cpu_to_node(rq->cpu)]);
 	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
 
@@ -443,6 +492,16 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 static inline void rt_queue_push_tasks(struct rq *rq)
 {
 }
+
+int rto_counts_init(atomic_tp **rto_counts)
+{
+	return 0;
+}
+
+void rto_counts_cleanup(atomic_tp *rto_counts)
+{
+}
+
 #endif /* CONFIG_SMP */
 
 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dd3c79470bfc..f80968724dd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -953,6 +953,8 @@ struct perf_domain {
 	struct rcu_head rcu;
 };
 
+typedef atomic_t *atomic_tp;
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -963,12 +965,15 @@ struct perf_domain {
  */
 struct root_domain {
 	atomic_t		refcount;
-	atomic_t		rto_count;
 	struct rcu_head		rcu;
 	cpumask_var_t		span;
 	cpumask_var_t		online;
 
 	atomic_t		dlo_count;
+
+	/* rto_count per node */
+	atomic_tp		*rto_counts;
+
 	struct dl_bw		dl_bw;
 	struct cpudl		cpudl;
 
@@ -1030,6 +1035,8 @@ extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 extern void sched_get_rd(struct root_domain *rd);
 extern void sched_put_rd(struct root_domain *rd);
+extern int rto_counts_init(atomic_tp **rto_counts);
+extern void rto_counts_cleanup(atomic_tp *rto_counts);
 
 static inline int get_rd_overloaded(struct root_domain *rd)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b958fe48e020..166dc8177a44 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -457,6 +457,7 @@ static void free_rootdomain(struct rcu_head *rcu)
 {
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 
+	rto_counts_cleanup(rd->rto_counts);
 	cpupri_cleanup(&rd->cpupri);
 	cpudl_cleanup(&rd->cpudl);
 	free_cpumask_var(rd->dlo_mask);
@@ -549,8 +550,14 @@ static int init_rootdomain(struct root_domain *rd)
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
+
+	if (rto_counts_init(&rd->rto_counts) != 0)
+		goto free_cpupri;
+
 	return 0;
 
+free_cpupri:
+	cpupri_cleanup(&rd->cpupri);
 free_cpudl:
 	cpudl_cleanup(&rd->cpudl);
 free_rto_mask:
-- 
2.43.5


Thread overview: 16+ messages
2025-07-07  2:35 [PATCH 0/4] sched/rt: mitigate root_domain cache line contention Pan Deng
2025-07-07  2:35 ` [PATCH 1/4] sched/rt: Optimize cpupri_vec layout to mitigate " Pan Deng
2025-09-01  5:10   ` Chen, Yu C
2025-09-01 13:24     ` Deng, Pan
2025-07-07  2:35 ` [PATCH 2/4] sched/rt: Restructure root_domain to reduce cacheline contention Pan Deng
2025-07-07  2:35 ` Pan Deng [this message]
2025-07-07  6:53   ` [PATCH 3/4] sched/rt: Split root_domain->rto_count to per-NUMA-node counters kernel test robot
2025-07-07 11:36     ` Deng, Pan
2025-07-07  6:53   ` kernel test robot
2025-07-08  5:33   ` kernel test robot
2025-07-08 14:02     ` Deng, Pan
2025-07-09  8:56       ` Li, Philip
2025-07-07  2:35 ` [PATCH 4/4] sched/rt: Split cpupri_vec->cpumask to per NUMA node to reduce contention Pan Deng
2025-07-21 11:23   ` Chen, Yu C
2025-07-22 14:46     ` Deng, Pan
2025-08-06 14:00       ` Deng, Pan
