From: Fernand Sieber <sieberf@amazon.com>
To: Ingo Molnar <mingo@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Juri Lelli <juri.lelli@redhat.com>,
Vincent Guittot <vincent.guittot@linaro.org>
Cc: Tejun Heo <tj@kernel.org>, David Vernet <void@manifault.com>,
Andrea Righi <arighi@nvidia.com>,
Changwoo Min <changwoo@igalia.com>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
<linux-kernel@vger.kernel.org>, <nh-open-source@amazon.com>,
Fahad Mubeen <fmubeen@amazon.de>,
"Hendrik Borghorst" <hborghor@amazon.de>,
David Woodhouse <dwmw@amazon.co.uk>,
Fernand Sieber <sieberf@amazon.com>
Subject: [PATCH 1/2] sched/fair: expose cpu.max.runtime to set bandwidth runtime directly
Date: Mon, 25 May 2026 21:36:21 +0200 [thread overview]
Message-ID: <20260525193622.70282-2-sieberf@amazon.com> (raw)
In-Reply-To: <20260525193622.70282-1-sieberf@amazon.com>
Add a cpu.max.runtime cgroup v2 interface that allows userspace to set
the CFS bandwidth controller's runtime directly. This enables CPU credit
injection: an orchestrator writes a runtime budget which the cgroup
consumes naturally through the existing bandwidth enforcement mechanism.
The write sets cfs_b->runtime directly. Each period, the task consumes
runtime and the refill restores only quota (capped at quota + burst), so
the injected credits drain until runtime falls below the cap, after which
the cgroup returns to its steady-state quota allocation.
Writes are rejected if the value exceeds quota + burst (the per-period
runtime cap) or exceeds the maximum bandwidth limit.
Also relax the burst validation: remove the burst <= quota constraint,
requiring only that burst + quota does not overflow. This allows
configuring burst > quota so that the runtime cap (quota + burst) can
reach up to one full period, enabling 100% utilization while credits last.
The interface uses microseconds, consistent with cpu.max quota/period.
Signed-off-by: Fernand Sieber <sieberf@amazon.com>
---
kernel/sched/core.c | 44 +++++++++++++++-
tools/testing/selftests/cgroup/test_cpu.c | 62 +++++++++++++++++++++++
2 files changed, 104 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b8871449d..d92e5840b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10085,8 +10085,7 @@ static int tg_set_bandwidth(struct task_group *tg,
if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
return -EINVAL;
- if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
- burst_us + quota_us > max_bw_runtime_us))
+ if (quota_us != RUNTIME_INF && (burst_us + quota_us > max_bw_runtime_us))
return -EINVAL;
#ifdef CONFIG_CFS_BANDWIDTH
@@ -10147,6 +10146,41 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
tg_bandwidth(tg, &period_us, &quota_us, NULL);
return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
}
+
+/*
+ * cpu.max.runtime write handler: overwrite cfs_b->runtime with @runtime_us
+ * (given in microseconds, stored in nanoseconds).
+ *
+ * Returns -EINVAL when @runtime_us exceeds max_bw_runtime_us, or when the
+ * group's quota is not RUNTIME_INF and the value exceeds quota + burst.
+ * Returns 0 once the runtime has been stored.
+ */
+static int cpu_runtime_write_u64(struct cgroup_subsys_state *css,
+				 struct cftype *cftype, u64 runtime_us)
+{
+	struct cfs_bandwidth *cfs_b = &css_tg(css)->cfs_bandwidth;
+	u64 runtime_ns;
+	int ret = 0;
+
+	if (runtime_us > max_bw_runtime_us)
+		return -EINVAL;
+
+	/* runtime_us is already u64, so the product is computed in 64 bits. */
+	runtime_ns = runtime_us * NSEC_PER_USEC;
+
+	raw_spin_lock_irq(&cfs_b->lock);
+	/* The cap check and the store happen under the same lock hold. */
+	if (cfs_b->quota == RUNTIME_INF ||
+	    runtime_ns <= cfs_b->quota + cfs_b->burst)
+		cfs_b->runtime = runtime_ns;
+	else
+		ret = -EINVAL;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	return ret;
+}
+
+/*
+ * cpu.max.runtime read handler: sample cfs_b->runtime under cfs_b->lock
+ * and report it in microseconds. The conversion is an integer division,
+ * so any sub-microsecond remainder is dropped.
+ */
+static u64 cpu_runtime_read_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype)
+{
+	struct cfs_bandwidth *cfs_b = &css_tg(css)->cfs_bandwidth;
+	u64 snapshot_ns;
+
+	raw_spin_lock_irq(&cfs_b->lock);
+	snapshot_ns = cfs_b->runtime;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	return snapshot_ns / NSEC_PER_USEC;
+}
#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -10498,6 +10532,12 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_burst_read_u64,
.write_u64 = cpu_burst_write_u64,
},
+ {
+ .name = "max.runtime",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_runtime_read_u64,
+ .write_u64 = cpu_runtime_write_u64,
+ },
#endif /* CONFIG_CFS_BANDWIDTH */
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
index c83f05438..df151702b 100644
--- a/tools/testing/selftests/cgroup/test_cpu.c
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -776,6 +776,67 @@ static int test_cpucg_max_nested(const char *root)
return ret;
}
+/*
+ * Exercise cpu.max.runtime: configure a small quota with a large burst,
+ * start a wall-clock CPU hog in the cgroup, write a runtime budget to
+ * cpu.max.runtime shortly after, and compare the cgroup's usage_usec with
+ * the expected value using values_close_report(..., 10).
+ *
+ * Returns KSFT_PASS when the usage is close to the expectation and
+ * KSFT_FAIL on any setup error or mismatch.
+ */
+static int test_cpucg_max_runtime(const char *root)
+{
+	int ret = KSFT_FAIL;
+	long quota_usec = 1000;		/* 1ms (minimum) */
+	long period_usec = 100000;	/* 100ms */
+	long burst_usec = 5000000;	/* 5s, so cap = 5001ms */
+	long runtime_usec = 2500000;	/* 2500ms = half of 5s run */
+	long duration_sec = 5;
+	long expected_usec = duration_sec * USEC_PER_SEC / 2; /* 50% */
+	long usage_usec;
+	char *cpucg;
+	char buf[64];
+	int pid;
+	/* Declared with the other locals rather than mid-function. */
+	struct cpu_hog_func_param param = {
+		.nprocs = 1,
+		.ts = { .tv_sec = duration_sec, .tv_nsec = 0 },
+		.clock_type = CPU_HOG_CLOCK_WALL,
+	};
+
+	cpucg = cg_name(root, "cpucg_runtime_test");
+	if (!cpucg)
+		goto cleanup;
+
+	if (cg_create(cpucg))
+		goto cleanup;
+
+	snprintf(buf, sizeof(buf), "%ld %ld", quota_usec, period_usec);
+	if (cg_write(cpucg, "cpu.max", buf))
+		goto cleanup;
+	if (cg_write_numeric(cpucg, "cpu.max.burst", burst_usec))
+		goto cleanup;
+
+	/* Start burner, let it settle, then inject credits */
+	pid = cg_run_nowait(cpucg, hog_cpus_timed, (void *)&param);
+	if (pid < 0)
+		goto cleanup;
+
+	usleep(100000);
+	if (cg_write_numeric(cpucg, "cpu.max.runtime", runtime_usec)) {
+		/* Do not leave the hog running if the injection failed. */
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		goto cleanup;
+	}
+
+	waitpid(pid, NULL, 0);
+
+	usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+	if (usage_usec <= 0)
+		goto cleanup;
+
+	if (!values_close_report(usage_usec, expected_usec, 10))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(cpucg);
+	free(cpucg);
+	return ret;
+}
+
#define T(x) { x, #x }
struct cpucg_test {
int (*fn)(const char *root);
@@ -790,6 +851,7 @@ struct cpucg_test {
T(test_cpucg_nested_weight_underprovisioned),
T(test_cpucg_max),
T(test_cpucg_max_nested),
+ T(test_cpucg_max_runtime),
};
#undef T
--
2.47.3
Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07
next prev parent reply other threads:[~2026-05-25 19:37 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-25 19:36 [PATCH 0/2] sched/fair: expose cpu.max.runtime for credit injection Fernand Sieber
2026-05-25 19:36 ` Fernand Sieber [this message]
2026-05-26 20:52 ` [PATCH 1/2] sched/fair: expose cpu.max.runtime to set bandwidth runtime directly Benjamin Segall
2026-05-28 7:25 ` Fernand Sieber
2026-05-27 19:04 ` Tejun Heo
2026-05-28 6:54 ` Fernand Sieber
2026-05-28 14:37 ` Tejun Heo
2026-05-25 19:36 ` [PATCH 2/2] sched/ext: add cgroup_set_runtime ops callback Fernand Sieber
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260525193622.70282-2-sieberf@amazon.com \
--to=sieberf@amazon.com \
--cc=arighi@nvidia.com \
--cc=bsegall@google.com \
--cc=changwoo@igalia.com \
--cc=dietmar.eggemann@arm.com \
--cc=dwmw@amazon.co.uk \
--cc=fmubeen@amazon.de \
--cc=hborghor@amazon.de \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=nh-open-source@amazon.com \
--cc=peterz@infradead.org \
--cc=tj@kernel.org \
--cc=vincent.guittot@linaro.org \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox