From: Aaron Tomlin <atomlin@atomlin.com>
To: longman@redhat.com, tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com
Cc: chenridong@huaweicloud.com, neelx@suse.com,
cgroups@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH] cpuset: Fix multi-source deadline task accounting and bandwidth bypass
Date: Mon, 11 May 2026 21:03:41 -0400 [thread overview]
Message-ID: <20260512010341.101419-1-atomlin@atomlin.com> (raw)
During a batch migration where threads in a taskset originate from
multiple source cpusets (e.g., via cgroup.procs), cpuset_can_attach()
and cpuset_attach() currently evaluate the source cpuset exactly once
by caching the first task's oldcs.
This creates two distinct critical flaws for SCHED_DEADLINE tasks:
1. oldcs->nr_deadline_tasks is decremented only for the first task's
   source cpuset, yet by the full nr_migrate_dl_tasks count. Tasks
   that originated from other cpusets leak their counts there
   permanently, while the first cpuset's count permanently underflows.
2. cpumask_intersects() is evaluated only against the first task's
   source cpuset. Tasks originating from other, entirely disjoint
   root domains can therefore silently bypass the dl_bw_alloc()
   admission control.
This patch refactors the deadline accounting to evaluate task_cs(task)
on a per-task basis during the cgroup_taskset_for_each() loops. To
achieve accurate accounting before the core cgroup migration actually
executes, the permanent nr_deadline_tasks increments/decrements are
shifted into cpuset_can_attach(). If the migration aborts, the counts
are gracefully reverted via an internal rollback loop or the
cpuset_cancel_attach() callback.
Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
kernel/cgroup/cpuset.c | 53 +++++++++++++++++++++++++++++++-----------
1 file changed, 39 insertions(+), 14 deletions(-)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e3a081a07c6d..36f1d28f8ade 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3034,32 +3034,36 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
if (setsched_check) {
ret = security_task_setscheduler(task);
if (ret)
- goto out_unlock;
+ goto out_unlock_reset;
}
if (dl_task(task)) {
+ struct cpuset *old_cs = task_cs(task);
+
cs->nr_migrate_dl_tasks++;
- cs->sum_migrate_dl_bw += task->dl.dl_bw;
+ old_cs->nr_deadline_tasks--;
+ cs->nr_deadline_tasks++;
+
+ if (!cpumask_intersects(old_cs->effective_cpus,
+ cs->effective_cpus))
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
}
}
if (!cs->nr_migrate_dl_tasks)
goto out_success;
- if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+ if (cs->sum_migrate_dl_bw) {
int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
if (unlikely(cpu >= nr_cpu_ids)) {
- reset_migrate_dl_data(cs);
ret = -EINVAL;
- goto out_unlock;
+ goto out_unlock_reset;
}
ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
- if (ret) {
- reset_migrate_dl_data(cs);
- goto out_unlock;
- }
+ if (ret)
+ goto out_unlock_reset;
cs->dl_bw_cpu = cpu;
}
@@ -3070,6 +3074,22 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
+ goto out_unlock;
+
+out_unlock_reset:
+ if (cs->nr_migrate_dl_tasks) {
+ struct task_struct *t;
+
+ cgroup_taskset_for_each(t, css, tset) {
+ if (t == task)
+ break;
+ if (dl_task(t)) {
+ task_cs(t)->nr_deadline_tasks++;
+ cs->nr_deadline_tasks--;
+ }
+ }
+ reset_migrate_dl_data(cs);
+ }
out_unlock:
mutex_unlock(&cpuset_mutex);
return ret;
@@ -3079,6 +3099,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct cpuset *cs;
+ struct task_struct *task;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
@@ -3089,8 +3110,15 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
if (cs->dl_bw_cpu >= 0)
dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
- if (cs->nr_migrate_dl_tasks)
+ if (cs->nr_migrate_dl_tasks) {
+ cgroup_taskset_for_each(task, css, tset) {
+ if (dl_task(task)) {
+ task_cs(task)->nr_deadline_tasks++;
+ cs->nr_deadline_tasks--;
+ }
+ }
reset_migrate_dl_data(cs);
+ }
mutex_unlock(&cpuset_mutex);
}
@@ -3195,11 +3223,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
- if (cs->nr_migrate_dl_tasks) {
- cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
- oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
+ if (cs->nr_migrate_dl_tasks)
reset_migrate_dl_data(cs);
- }
dec_attach_in_progress_locked(cs);
--
2.51.0
reply other threads:[~2026-05-12 1:03 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260512010341.101419-1-atomlin@atomlin.com \
--to=atomlin@atomlin.com \
--cc=cgroups@vger.kernel.org \
--cc=chenridong@huaweicloud.com \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=longman@redhat.com \
--cc=mkoutny@suse.com \
--cc=neelx@suse.com \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox