From: Qais Yousef <qyousef@layalina.io>
To: Ingo Molnar <mingo@kernel.org>,
Peter Zijlstra <peterz@infradead.org>,
Vincent Guittot <vincent.guittot@linaro.org>,
"Rafael J. Wysocki" <rafael@kernel.org>,
Viresh Kumar <viresh.kumar@linaro.org>
Cc: Juri Lelli <juri.lelli@redhat.com>,
Steven Rostedt <rostedt@goodmis.org>,
John Stultz <jstultz@google.com>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Tim Chen <tim.c.chen@linux.intel.com>,
"Chen, Yu C" <yu.c.chen@intel.com>,
Thomas Gleixner <tglx@kernel.org>,
linux-kernel@vger.kernel.org, linux-pm@vger.kernel.org,
Qais Yousef <qyousef@layalina.io>
Subject: [PATCH v2 05/13] sched: cpufreq: Remove magic 1.25 headroom from sugov_apply_dvfs_headroom()
Date: Mon, 4 May 2026 02:59:55 +0100 [thread overview]
Message-ID: <20260504020003.71306-6-qyousef@layalina.io> (raw)
In-Reply-To: <20260504020003.71306-1-qyousef@layalina.io>
Replace 1.25 headroom in sugov_apply_dvfs_headroom() with better dynamic
logic.
Instead of the magical 1.25 headroom, use the new approximate_util_avg()
to provide headroom based on the dvfs_update_delay, which is the period
at which the cpufreq governor will send DVFS updates to the hardware, or
min(curr.se.slice, TICK_USEC) which is the max delay for the util signal to
change and prompt a cpufreq update; whichever is higher.
Add a new percpu dvfs_update_delay that can be cheaply accessed whenever
sugov_apply_dvfs_headroom() is called. We expect cpufreq governors that
rely on util to drive their DVFS logic/algorithm to populate these percpu
variables. schedutil is the only such governor at the moment.
The behavior of schedutil will change. Some systems will experience
faster dvfs rampup (because of higher TICK or rate_limit_us), others
will experience slower rampup.
The impact on performance should not be visible if not for the black
hole effect of utilization invariance, a problem that will be addressed
in later patches.
CONST_DVFS_HEADROOM sched_feat allows reverting back to the old behavior
for easy backward compatibility.
Signed-off-by: Qais Yousef <qyousef@layalina.io>
---
kernel/sched/core.c | 1 +
kernel/sched/cpufreq_schedutil.c | 39 +++++++++++++++++++++++++++-----
kernel/sched/features.h | 6 +++++
kernel/sched/sched.h | 9 ++++++++
4 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 47ec8ea7c52e..3fbf560203f3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -124,6 +124,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU_READ_MOSTLY(u64, dvfs_update_delay);
DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
#ifdef CONFIG_SCHED_PROXY_EXEC
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f6de241fc62c..b529f5b96f6e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -215,13 +215,31 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* to run at adequate performance point.
*
* This function provides enough headroom to provide adequate performance
- * assuming the CPU continues to be busy.
+ * assuming the CPU continues to be busy. This headroom is based on the
+ * dvfs_update_delay of the cpufreq governor or min(curr.se.slice, TICK_USEC),
+ * whichever is higher.
*
- * At the moment it is a constant multiplication with 1.25.
+ * XXX: Should we provide headroom when the util is decaying?
*/
-static inline unsigned long sugov_apply_dvfs_headroom(unsigned long util)
+static inline unsigned long sugov_apply_dvfs_headroom(unsigned long util, int cpu)
{
- return util + (util >> 2);
+ struct rq *rq = cpu_rq(cpu);
+ u64 delay;
+
+ if (sched_feat(CONST_DVFS_HEADROOM))
+ return util + (util >> 2);
+
+ /*
+ * What is the possible worst case scenario for updating util_avg, ctx
+ * switch or TICK?
+ */
+ if (rq->cfs.h_nr_queued > 1)
+ delay = min(rq->curr->se.slice/1000, TICK_USEC);
+ else
+ delay = TICK_USEC;
+ delay = max(delay, per_cpu(dvfs_update_delay, cpu));
+
+ return approximate_util_avg(util, delay);
}
unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
@@ -229,7 +247,7 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
unsigned long max)
{
/* Add dvfs headroom to actual utilization */
- actual = sugov_apply_dvfs_headroom(actual);
+ actual = sugov_apply_dvfs_headroom(actual, cpu);
/* Actually we don't need to target the max performance */
if (actual < max)
max = actual;
@@ -615,15 +633,21 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy;
unsigned int rate_limit_us;
+ int cpu;
if (kstrtouint(buf, 10, &rate_limit_us))
return -EINVAL;
tunables->rate_limit_us = rate_limit_us;
- list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+
sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ for_each_cpu(cpu, sg_policy->policy->cpus)
+ per_cpu(dvfs_update_delay, cpu) = rate_limit_us;
+ }
+
return count;
}
@@ -886,6 +910,9 @@ static int sugov_start(struct cpufreq_policy *policy)
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
+
+ per_cpu(dvfs_update_delay, cpu) = sg_policy->tunables->rate_limit_us;
+
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
}
return 0;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a25f97201ab9..6f7e5bba854f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -129,3 +129,9 @@ SCHED_FEAT(LATENCY_WARN, false)
*/
SCHED_FEAT(NI_RANDOM, true)
SCHED_FEAT(NI_RATE, true)
+
+/*
+ * For backward compatibility. Use the constant 1.25 dvfs headroom in
+ * schedutil instead of the dynamic one.
+ */
+SCHED_FEAT(CONST_DVFS_HEADROOM, false)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24008f1ec812..16ebd8eb48d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3531,6 +3531,15 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
unsigned long approximate_util_avg(unsigned long util, u64 delta);
u64 approximate_runtime(unsigned long util);
+/*
+ * Any governor that relies on util signal to drive DVFS, must populate these
+ * percpu dvfs_update_delay variables.
+ *
+ * It should describe the rate/delay at which the governor sends DVFS freq
+ * update to the hardware in us.
+ */
+DECLARE_PER_CPU_READ_MOSTLY(u64, dvfs_update_delay);
+
/*
* Verify the fitness of task @p to run on @cpu taking into account the
* CPU original capacity and the runtime/deadline ratio of the task.
--
2.34.1
next prev parent reply other threads:[~2026-05-04 2:00 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-04 1:59 [PATCH v2 00/13] sched/fair/schedutil: Better manage system response time Qais Yousef
2026-05-04 1:59 ` [PATCH v2 01/13] sched: cpufreq: Rename map_util_perf to sugov_apply_dvfs_headroom Qais Yousef
2026-05-04 1:59 ` [PATCH v2 02/13] sched/pelt: Add a new function to approximate the future util_avg value Qais Yousef
2026-05-04 1:59 ` [PATCH v2 03/13] sched/pelt: Add a new function to approximate runtime to reach given util Qais Yousef
2026-05-04 1:59 ` [PATCH v2 04/13] sched/fair: Remove magic hardcoded margin in fits_capacity() Qais Yousef
2026-05-04 1:59 ` Qais Yousef [this message]
2026-05-04 1:59 ` [PATCH v2 06/13] sched/fair: Extend util_est to improve rampup time Qais Yousef
2026-05-04 1:59 ` [PATCH v2 07/13] sched/fair: util_est: Take into account periodic tasks Qais Yousef
2026-05-04 1:59 ` [PATCH v2 RFC 08/13] sched/qos: Add a new sched-qos interface Qais Yousef
2026-05-06 20:38 ` Tim Chen
2026-05-07 9:55 ` Qais Yousef
2026-05-07 14:20 ` Chen, Yu C
2026-05-09 9:39 ` Qais Yousef
2026-05-11 10:57 ` Peter Zijlstra
2026-05-04 1:59 ` [PATCH v2 09/13] sched/qos: Add rampup multiplier QoS Qais Yousef
2026-05-11 11:03 ` Peter Zijlstra
2026-05-04 2:00 ` [PATCH v2 10/13] sched/fair: Disable util_est when rampup_multiplier is 0 Qais Yousef
2026-05-04 2:00 ` [PATCH v2 11/13] sched/fair: Don't mess with util_avg post init Qais Yousef
2026-05-04 2:00 ` [PATCH v2 12/13] sched/fair: Call update_util_est() after dequeue_entities() Qais Yousef
2026-05-04 2:00 ` [PATCH v2 RFC 13/13] sched/pelt: Always allow load updates Qais Yousef
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260504020003.71306-6-qyousef@layalina.io \
--to=qyousef@layalina.io \
--cc=dietmar.eggemann@arm.com \
--cc=jstultz@google.com \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pm@vger.kernel.org \
--cc=mingo@kernel.org \
--cc=peterz@infradead.org \
--cc=rafael@kernel.org \
--cc=rostedt@goodmis.org \
--cc=tglx@kernel.org \
--cc=tim.c.chen@linux.intel.com \
--cc=vincent.guittot@linaro.org \
--cc=viresh.kumar@linaro.org \
--cc=yu.c.chen@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox