The Linux Kernel Mailing List
 help / color / mirror / Atom feed
From: Fernand Sieber <sieberf@amazon.com>
To: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	Juri Lelli <juri.lelli@redhat.com>,
	Vincent Guittot <vincent.guittot@linaro.org>
Cc: Tejun Heo <tj@kernel.org>, David Vernet <void@manifault.com>,
	Andrea Righi <arighi@nvidia.com>,
	Changwoo Min <changwoo@igalia.com>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Ben Segall <bsegall@google.com>, Mel Gorman <mgorman@suse.de>,
	<linux-kernel@vger.kernel.org>, <nh-open-source@amazon.com>,
	Fahad Mubeen <fmubeen@amazon.de>,
	"Hendrik Borghorst" <hborghor@amazon.de>,
	David Woodhouse <dwmw@amazon.co.uk>,
	Fernand Sieber <sieberf@amazon.com>
Subject: [PATCH 1/2] sched/fair: expose cpu.max.runtime to set bandwidth runtime directly
Date: Mon, 25 May 2026 21:36:21 +0200	[thread overview]
Message-ID: <20260525193622.70282-2-sieberf@amazon.com> (raw)
In-Reply-To: <20260525193622.70282-1-sieberf@amazon.com>

Add a cpu.max.runtime cgroup v2 interface that allows userspace to set
the CFS bandwidth controller's runtime directly. This enables CPU credit
injection: an orchestrator writes a runtime budget which the cgroup
consumes naturally through the existing bandwidth enforcement mechanism.

The write sets cfs_b->runtime directly. Each period, the task consumes
runtime and the refill restores only quota (capped at quota + burst), so
the injected credits drain until runtime falls below the cap, after which
the cgroup returns to its steady-state quota allocation.

Writes are rejected if the value exceeds quota + burst (the per-period
runtime cap) or exceeds the maximum bandwidth limit.

Also relax the burst validation: remove the burst <= quota constraint,
requiring only that burst + quota does not exceed the maximum bandwidth
limit. This allows configuring burst > quota so that the runtime cap
(quota + burst) can reach up to one full period, enabling 100%
utilization while credits last.

The interface uses microseconds, consistent with cpu.max quota/period.

Signed-off-by: Fernand Sieber <sieberf@amazon.com>
---
 kernel/sched/core.c                       | 44 +++++++++++++++-
 tools/testing/selftests/cgroup/test_cpu.c | 62 +++++++++++++++++++++++
 2 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b8871449d..d92e5840b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10085,8 +10085,7 @@ static int tg_set_bandwidth(struct task_group *tg,
 	if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
 		return -EINVAL;
 
-	if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
-					burst_us + quota_us > max_bw_runtime_us))
+	if (quota_us != RUNTIME_INF && (burst_us + quota_us > max_bw_runtime_us))
 		return -EINVAL;
 
 #ifdef CONFIG_CFS_BANDWIDTH
@@ -10147,6 +10146,41 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
 	tg_bandwidth(tg, &period_us, &quota_us, NULL);
 	return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
 }
+
+static int cpu_runtime_write_u64(struct cgroup_subsys_state *css,
+				 struct cftype *cftype, u64 runtime_us)
+{
+	struct task_group *tg = css_tg(css);
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+
+	if (runtime_us > max_bw_runtime_us)
+		return -EINVAL;
+
+	raw_spin_lock_irq(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF &&
+	    (u64)runtime_us * NSEC_PER_USEC > cfs_b->quota + cfs_b->burst) {
+		raw_spin_unlock_irq(&cfs_b->lock);
+		return -EINVAL;
+	}
+	cfs_b->runtime = (u64)runtime_us * NSEC_PER_USEC;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	return 0;
+}
+
+static u64 cpu_runtime_read_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype)
+{
+	struct task_group *tg = css_tg(css);
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+	u64 runtime_ns;
+
+	raw_spin_lock_irq(&cfs_b->lock);
+	runtime_ns = cfs_b->runtime;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	return runtime_ns / NSEC_PER_USEC;
+}
 #endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -10498,6 +10532,12 @@ static struct cftype cpu_files[] = {
 		.read_u64 = cpu_burst_read_u64,
 		.write_u64 = cpu_burst_write_u64,
 	},
+	{
+		.name = "max.runtime",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_runtime_read_u64,
+		.write_u64 = cpu_runtime_write_u64,
+	},
 #endif /* CONFIG_CFS_BANDWIDTH */
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 	{
diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c
index c83f05438..df151702b 100644
--- a/tools/testing/selftests/cgroup/test_cpu.c
+++ b/tools/testing/selftests/cgroup/test_cpu.c
@@ -776,6 +776,67 @@ static int test_cpucg_max_nested(const char *root)
 	return ret;
 }
 
+static int test_cpucg_max_runtime(const char *root)
+{
+	int ret = KSFT_FAIL;
+	long quota_usec = 1000;		/* 1ms (minimum) */
+	long period_usec = 100000;	/* 100ms */
+	long burst_usec = 5000000;	/* 5s, so cap = 5001ms */
+	long runtime_usec = 2500000;	/* 2500ms = half of 5s run */
+	long duration_sec = 5;
+	long expected_usec = duration_sec * USEC_PER_SEC / 2; /* 50% */
+	long usage_usec;
+	char *cpucg;
+	char buf[64];
+	int pid;
+
+	cpucg = cg_name(root, "cpucg_runtime_test");
+	if (!cpucg)
+		goto cleanup;
+
+	if (cg_create(cpucg))
+		goto cleanup;
+
+	snprintf(buf, sizeof(buf), "%ld %ld", quota_usec, period_usec);
+	if (cg_write(cpucg, "cpu.max", buf))
+		goto cleanup;
+	if (cg_write_numeric(cpucg, "cpu.max.burst", burst_usec))
+		goto cleanup;
+
+	/* Start burner, let it settle, then inject credits */
+	struct cpu_hog_func_param param = {
+		.nprocs = 1,
+		.ts = { .tv_sec = duration_sec, .tv_nsec = 0 },
+		.clock_type = CPU_HOG_CLOCK_WALL,
+	};
+	pid = cg_run_nowait(cpucg, hog_cpus_timed, (void *)&param);
+	if (pid < 0)
+		goto cleanup;
+
+	usleep(100000);
+	if (cg_write_numeric(cpucg, "cpu.max.runtime", runtime_usec)) {
+		kill(pid, SIGKILL);
+		waitpid(pid, NULL, 0);
+		goto cleanup;
+	}
+
+	waitpid(pid, NULL, 0);
+
+	usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
+	if (usage_usec <= 0)
+		goto cleanup;
+
+	if (!values_close_report(usage_usec, expected_usec, 10))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_destroy(cpucg);
+	free(cpucg);
+	return ret;
+}
+
 #define T(x) { x, #x }
 struct cpucg_test {
 	int (*fn)(const char *root);
@@ -790,6 +851,7 @@ struct cpucg_test {
 	T(test_cpucg_nested_weight_underprovisioned),
 	T(test_cpucg_max),
 	T(test_cpucg_max_nested),
+	T(test_cpucg_max_runtime),
 };
 #undef T
 
-- 
2.47.3




Amazon Development Centre (South Africa) (Proprietary) Limited
29 Gogosoa Street, Observatory, Cape Town, Western Cape, 7925, South Africa
Registration Number: 2004 / 034463 / 07


  reply	other threads:[~2026-05-25 19:37 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-25 19:36 [PATCH 0/2] sched/fair: expose cpu.max.runtime for credit injection Fernand Sieber
2026-05-25 19:36 ` Fernand Sieber [this message]
2026-05-26 20:52   ` [PATCH 1/2] sched/fair: expose cpu.max.runtime to set bandwidth runtime directly Benjamin Segall
2026-05-28  7:25     ` Fernand Sieber
2026-05-27 19:04   ` Tejun Heo
2026-05-28  6:54     ` Fernand Sieber
2026-05-28 14:37       ` Tejun Heo
2026-05-25 19:36 ` [PATCH 2/2] sched/ext: add cgroup_set_runtime ops callback Fernand Sieber

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260525193622.70282-2-sieberf@amazon.com \
    --to=sieberf@amazon.com \
    --cc=arighi@nvidia.com \
    --cc=bsegall@google.com \
    --cc=changwoo@igalia.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=dwmw@amazon.co.uk \
    --cc=fmubeen@amazon.de \
    --cc=hborghor@amazon.de \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=nh-open-source@amazon.com \
    --cc=peterz@infradead.org \
    --cc=tj@kernel.org \
    --cc=vincent.guittot@linaro.org \
    --cc=void@manifault.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox