From: Fernand Sieber <sieberf@amazon.com>
To: Ingo Molnar <mingo@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
Juri Lelli <juri.lelli@redhat.com>,
Vincent Guittot <vincent.guittot@linaro.org>
Cc: Tejun Heo <tj@kernel.org>, David Vernet <void@manifault.com>,
Andrea Righi <arighi@nvidia.com>,
Changwoo Min <changwoo@igalia.com>,
Dietmar Eggemann <dietmar.eggemann@arm.com>,
Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
<linux-kernel@vger.kernel.org>, <nh-open-source@amazon.com>,
Fahad Mubeen <fmubeen@amazon.de>,
"Hendrik Borghorst" <hborghor@amazon.de>,
David Woodhouse <dwmw@amazon.co.uk>,
Fernand Sieber <sieberf@amazon.com>
Subject: [PATCH 1/2] sched/fair: expose cpu.max.runtime to set bandwidth runtime directly
Date: Mon, 25 May 2026 21:36:21 +0200 [thread overview]
Message-ID: <20260525193622.70282-2-sieberf@amazon.com> (raw)
In-Reply-To: <20260525193622.70282-1-sieberf@amazon.com>
Add a cpu.max.runtime cgroup v2 interface that allows userspace to set
the CFS bandwidth controller's runtime directly. This enables CPU credit
injection: an orchestrator writes a runtime budget which the cgroup
consumes naturally through the existing bandwidth enforcement mechanism.
The write sets cfs_b->runtime directly. Each period, the task consumes
runtime and the refill restores only quota (capped at quota + burst), so
the injected credits drain until runtime falls below the cap, after which
the cgroup returns to its steady-state quota allocation.
Writes are rejected if the value exceeds quota + burst (the per-period
runtime cap) or exceeds the maximum bandwidth limit.
Also relax the burst validation: remove the burst <= quota constraint,
requiring only that burst + quota does not overflow. This allows
configuring burst > quota so that the runtime cap (quota + burst) can
reach up to one full period, enabling 100% utilization while credits last.
The interface uses microseconds, consistent with cpu.max quota/period.
Signed-off-by: Fernand Sieber <sieberf@amazon.com>
---
kernel/sched/core.c | 44 +++++++++++++++-
tools/testing/selftests/cgroup/test_cpu.c | 62 +++++++++++++++++++++++
2 files changed, 104 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b8871449d..d92e5840b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10085,8 +10085,7 @@ static int tg_set_bandwidth(struct task_group *tg,
if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
return -EINVAL;
- if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
- burst_us + quota_us > max_bw_runtime_us))
+ if (quota_us != RUNTIME_INF && (burst_us + quota_us > max_bw_runtime_us))
return -EINVAL;
#ifdef CONFIG_CFS_BANDWIDTH
@@ -10147,6 +10146,41 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
tg_bandwidth(tg, &period_us, &quota_us, NULL);
return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
}
+
+static int cpu_runtime_write_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype, u64 runtime_us)
+{
+ struct task_group *tg = css_tg(css);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+ if (runtime_us > max_bw_runtime_us)
+ return -EINVAL;
+
+ raw_spin_lock_irq(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF &&
+ (u64)runtime_us * NSEC_PER_USEC > cfs_b->quota + cfs_b->burst) {
+ raw_spin_unlock_irq(&cfs_b->lock);
+ return -EINVAL;
+ }
+ cfs_b->runtime = (u64)runtime_us * NSEC_PER_USEC;
+ raw_spin_unlock_irq(&cfs_b->lock);
+
+ return 0;
+}
+
+static u64 cpu_runtime_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cftype)
+{
+ struct task_group *tg = css_tg(css);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+ u64 runtime_ns;
+
+ raw_spin_lock_irq(&cfs_b->lock);
+ runtime_ns = cfs_b->runtime;
+ raw_spin_unlock_irq(&cfs_b->lock);
+
+ return runtime_ns / NSEC_PER_USEC;
+}
#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -10498,6 +10532,12 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_burst_read_u64,
.write_u64 = cpu_burst_write_u64,
},
+ {
+ .name = "max.runtime",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = cpu_runtime_read_u64,
+ .write_u64 = cpu_runtime_write_u64,
+ },
#endif /* CONFIG_CFS_BANDWIDTH */
#ifdef CONFIG_UCLAMP_TASK_GROUP
{
diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
index c83f05438..df151702b 100644
--- a/tools/testing/selftests/cgroup/test_cpu.c
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -776,6 +776,67 @@ static int test_cpucg_max_nested(const char *root)
return ret;
}
+static int test_cpucg_max_runtime(const char *root)
+{
+ int ret = KSFT_FAIL;
+ long quota_usec = 1000; /* 1ms (minimum) */
+ long period_usec = 100000; /* 100ms */
+ long burst_usec = 5000000; /* 5s, so cap = 5001ms */
+ long runtime_usec = 2500000; /* 2500ms = half of 5s run */
+ long duration_sec = 5;
+ long expected_usec = duration_sec * USEC_PER_SEC / 2; /* 50% */
+ long usage_usec;
+ char *cpucg;
+ char buf[64];
+ int pid;
+
+ cpucg = cg_name(root, "cpucg_runtime_test");
+ if (!cpucg)
+ goto cleanup;
+
+ if (cg_create(cpucg))
+ goto cleanup;
+
+ snprintf(buf, sizeof(buf), "%ld %ld", quota_usec, period_usec);
+ if (cg_write(cpucg, "cpu.max", buf))
+ goto cleanup;
+ if (cg_write_numeric(cpucg, "cpu.max.burst", burst_usec))
+ goto cleanup;
+
+ /* Start burner, let it settle, then inject credits */
+ struct cpu_hog_func_param param = {
+ .nprocs = 1,
+ .ts = { .tv_sec = duration_sec, .tv_nsec = 0 },
+ .clock_type = CPU_HOG_CLOCK_WALL,
+ };
+ pid = cg_run_nowait(cpucg, hog_cpus_timed, (void *)&param);
+ if (pid < 0)
+ goto cleanup;
+
+ usleep(100000);
+ if (cg_write_numeric(cpucg, "cpu.max.runtime", runtime_usec)) {
+ kill(pid, SIGKILL);
+ waitpid(pid, NULL, 0);
+ goto cleanup;
+ }
+
+ waitpid(pid, NULL, 0);
+
+ usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+ if (usage_usec <= 0)
+ goto cleanup;
+
+ if (!values_close_report(usage_usec, expected_usec, 10))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(cpucg);
+ free(cpucg);
+ return ret;
+}
+
#define T(x) { x, #x }
struct cpucg_test {
int (*fn)(const char *root);
@@ -790,6 +851,7 @@ struct cpucg_test {
T(test_cpucg_nested_weight_underprovisioned),
T(test_cpucg_max),
T(test_cpucg_max_nested),
+ T(test_cpucg_max_runtime),
};
#undef T
--
2.47.3
Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07
next prev parent reply other threads:[~2026-05-25 19:37 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-25 19:36 [PATCH 0/2] sched/fair: expose cpu.max.runtime for credit injection Fernand Sieber
2026-05-25 19:36 ` Fernand Sieber [this message]
2026-05-26 20:52 ` [PATCH 1/2] sched/fair: expose cpu.max.runtime to set bandwidth runtime directly Benjamin Segall
2026-05-28 7:25 ` Fernand Sieber
2026-05-27 19:04 ` Tejun Heo
2026-05-28 6:54 ` Fernand Sieber
2026-05-28 14:37 ` Tejun Heo
2026-05-25 19:36 ` [PATCH 2/2] sched/ext: add cgroup_set_runtime ops callback Fernand Sieber
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260525193622.70282-2-sieberf@amazon.com \
--to=sieberf@amazon.com \
--cc=arighi@nvidia.com \
--cc=bsegall@google.com \
--cc=changwoo@igalia.com \
--cc=dietmar.eggemann@arm.com \
--cc=dwmw@amazon.co.uk \
--cc=fmubeen@amazon.de \
--cc=hborghor@amazon.de \
--cc=juri.lelli@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mgorman@suse.de \
--cc=mingo@redhat.com \
--cc=nh-open-source@amazon.com \
--cc=peterz@infradead.org \
--cc=tj@kernel.org \
--cc=vincent.guittot@linaro.org \
--cc=void@manifault.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.