From: Juri Lelli <juri.lelli@redhat.com>
To: linux-kernel@vger.kernel.org, cgroups@vger.kernel.org
Cc: "Ingo Molnar" <mingo@redhat.com>,
"Peter Zijlstra" <peterz@infradead.org>,
"Juri Lelli" <juri.lelli@redhat.com>,
"Vincent Guittot" <vincent.guittot@linaro.org>,
"Dietmar Eggemann" <dietmar.eggemann@arm.com>,
"Steven Rostedt" <rostedt@goodmis.org>,
"Ben Segall" <bsegall@google.com>, "Mel Gorman" <mgorman@suse.de>,
"Valentin Schneider" <vschneid@redhat.com>,
"Waiman Long" <longman@redhat.com>, "Tejun Heo" <tj@kernel.org>,
"Johannes Weiner" <hannes@cmpxchg.org>,
"Michal Koutný" <mkoutny@suse.com>,
"Qais Yousef" <qyousef@layalina.io>,
"Sebastian Andrzej Siewior" <bigeasy@linutronix.de>,
"Swapnil Sapkal" <swapnil.sapkal@amd.com>,
"Shrikanth Hegde" <sshegde@linux.ibm.com>,
"Phil Auld" <pauld@redhat.com>,
luca.abeni@santannapisa.it, tommaso.cucinotta@santannapisa.it,
"Jon Hunter" <jonathanh@nvidia.com>
Subject: [PATCH v2 4/8] sched/deadline: Rebuild root domain accounting after every update
Date: Thu, 6 Mar 2025 14:10:12 +0000 [thread overview]
Message-ID: <20250306141016.268313-5-juri.lelli@redhat.com> (raw)
In-Reply-To: <20250306141016.268313-1-juri.lelli@redhat.com>
Rebuilding of root domain accounting information (total_bw) is
currently broken in some cases, e.g. suspend/resume on aarch64. The problem
is that the way we keep track of domain changes and try to add bandwidth
back is convoluted and fragile.
Fix it by simplifying things: make sure bandwidth accounting is cleared
and completely restored after root domain changes (once root domains
are stable again).
Reported-by: Jon Hunter <jonathanh@nvidia.com>
Fixes: 53916d5fd3c0 ("sched/deadline: Check bandwidth overflow earlier for hotplug")
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
---
include/linux/sched/deadline.h | 4 ++++
include/linux/sched/topology.h | 2 ++
kernel/cgroup/cpuset.c | 16 +++++++++-------
kernel/sched/deadline.c | 16 ++++++++++------
kernel/sched/topology.c | 1 +
5 files changed, 26 insertions(+), 13 deletions(-)
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 6ec578600b24..a780068aa1a5 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -34,6 +34,10 @@ static inline bool dl_time_before(u64 a, u64 b)
struct root_domain;
extern void dl_add_task_root_domain(struct task_struct *p);
extern void dl_clear_root_domain(struct root_domain *rd);
+extern void dl_clear_root_domain_cpu(int cpu);
+
+extern u64 dl_cookie;
+extern bool dl_bw_visited(int cpu, u64 gen);
#endif /* CONFIG_SMP */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7f3dbafe1817..1622232bd08b 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -166,6 +166,8 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
return to_cpumask(sd->span);
}
+extern void dl_rebuild_rd_accounting(void);
+
extern void partition_sched_domains_locked(int ndoms_new,
cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f87526edb2a4..f66b2aefdc04 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -954,10 +954,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs)
css_task_iter_end(&it);
}
-static void dl_rebuild_rd_accounting(void)
+void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
+ int cpu;
+ u64 cookie = ++dl_cookie;
lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
@@ -965,11 +967,12 @@ static void dl_rebuild_rd_accounting(void)
rcu_read_lock();
- /*
- * Clear default root domain DL accounting, it will be computed again
- * if a task belongs to it.
- */
- dl_clear_root_domain(&def_root_domain);
+ for_each_possible_cpu(cpu) {
+ if (dl_bw_visited(cpu, cookie))
+ continue;
+
+ dl_clear_root_domain_cpu(cpu);
+ }
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
@@ -996,7 +999,6 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
{
sched_domains_mutex_lock();
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- dl_rebuild_rd_accounting();
sched_domains_mutex_unlock();
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 339434271cba..17b040c92885 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -166,7 +166,7 @@ static inline unsigned long dl_bw_capacity(int i)
}
}
-static inline bool dl_bw_visited(int cpu, u64 cookie)
+bool dl_bw_visited(int cpu, u64 cookie)
{
struct root_domain *rd = cpu_rq(cpu)->rd;
@@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i)
return SCHED_CAPACITY_SCALE;
}
-static inline bool dl_bw_visited(int cpu, u64 cookie)
+bool dl_bw_visited(int cpu, u64 cookie)
{
return false;
}
@@ -2981,18 +2981,22 @@ void dl_clear_root_domain(struct root_domain *rd)
rd->dl_bw.total_bw = 0;
/*
- * dl_server bandwidth is only restored when CPUs are attached to root
- * domains (after domains are created or CPUs moved back to the
- * default root doamin).
+ * dl_servers are not tasks. Since dl_add_task_root_domain ignores
+ * them, we need to account for them here explicitly.
*/
for_each_cpu(i, rd->span) {
struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
if (dl_server(dl_se) && cpu_active(i))
- rd->dl_bw.total_bw += dl_se->dl_bw;
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
}
}
+void dl_clear_root_domain_cpu(int cpu)
+{
+ dl_clear_root_domain(cpu_rq(cpu)->rd);
+}
+
#endif /* CONFIG_SMP */
static void switched_from_dl(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 44093339761c..363ad268a25b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2791,6 +2791,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
ndoms_cur = ndoms_new;
update_sched_domain_debugfs();
+ dl_rebuild_rd_accounting();
}
/*
--
2.48.1
next prev parent reply other threads:[~2025-03-06 14:11 UTC|newest]
Thread overview: 31+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-03-06 14:10 [PATCH v2 0/8] Fix SCHED_DEADLINE bandwidth accounting during suspend Juri Lelli
2025-03-06 14:10 ` [PATCH v2 1/8] sched/deadline: Ignore special tasks when rebuilding domains Juri Lelli
2025-03-06 14:10 ` [PATCH v2 2/8] sched/topology: Wrappers for sched_domains_mutex Juri Lelli
2025-03-07 6:34 ` Shrikanth Hegde
2025-03-07 8:53 ` Juri Lelli
2025-03-07 9:02 ` Shrikanth Hegde
2025-03-07 15:11 ` Waiman Long
2025-03-07 15:15 ` Juri Lelli
2025-03-07 15:19 ` Waiman Long
2025-03-07 15:59 ` Juri Lelli
2025-03-07 16:34 ` Waiman Long
2025-03-06 14:10 ` [PATCH v2 3/8] sched/deadline: Generalize unique visiting of root domains Juri Lelli
2025-03-07 5:36 ` Shrikanth Hegde
2025-03-07 8:55 ` Juri Lelli
2025-03-06 14:10 ` Juri Lelli [this message]
2025-03-07 6:33 ` [PATCH v2 4/8] sched/deadline: Rebuild root domain accounting after every update Shrikanth Hegde
2025-03-07 9:33 ` Juri Lelli
2025-03-07 7:32 ` Shrikanth Hegde
2025-03-07 8:59 ` Juri Lelli
2025-03-06 14:10 ` [PATCH v2 5/8] sched/topology: Remove redundant dl_clear_root_domain call Juri Lelli
2025-03-06 14:10 ` [PATCH v2 6/8] cgroup/cpuset: Remove partition_and_rebuild_sched_domains Juri Lelli
2025-03-07 7:40 ` Shrikanth Hegde
2025-03-07 15:14 ` Waiman Long
2025-03-07 15:16 ` Waiman Long
2025-03-06 14:10 ` [PATCH v2 7/8] sched/topology: Stop exposing partition_sched_domains_locked Juri Lelli
2025-03-06 14:10 ` [PATCH v2 8/8] include/{topology,cpuset}: Move dl_rebuild_rd_accounting to cpuset.h Juri Lelli
2025-03-07 15:17 ` Waiman Long
2025-03-07 11:40 ` [PATCH v2 0/8] Fix SCHED_DEADLINE bandwidth accounting during suspend Jon Hunter
2025-03-07 15:16 ` Juri Lelli
2025-03-07 19:00 ` Waiman Long
2025-03-10 8:55 ` Juri Lelli
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250306141016.268313-5-juri.lelli@redhat.com \
--to=juri.lelli@redhat.com \
--cc=bigeasy@linutronix.de \
--cc=bsegall@google.com \
--cc=cgroups@vger.kernel.org \
--cc=dietmar.eggemann@arm.com \
--cc=hannes@cmpxchg.org \
--cc=jonathanh@nvidia.com \
--cc=linux-kernel@vger.kernel.org \
--cc=longman@redhat.com \
--cc=luca.abeni@santannapisa.it \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=mkoutny@suse.com \
--cc=pauld@redhat.com \
--cc=peterz@infradead.org \
--cc=qyousef@layalina.io \
--cc=rostedt@goodmis.org \
--cc=sshegde@linux.ibm.com \
--cc=swapnil.sapkal@amd.com \
--cc=tj@kernel.org \
--cc=tommaso.cucinotta@santannapisa.it \
--cc=vincent.guittot@linaro.org \
--cc=vschneid@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox