Linux cgroups development
 help / color / mirror / Atom feed
From: Aaron Tomlin <atomlin@atomlin.com>
To: longman@redhat.com, tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com
Cc: chenridong@huaweicloud.com, neelx@suse.com,
	cgroups@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH] cpuset: Fix multi-source deadline task accounting and bandwidth bypass
Date: Mon, 11 May 2026 21:03:41 -0400	[thread overview]
Message-ID: <20260512010341.101419-1-atomlin@atomlin.com> (raw)

During a batch migration where threads in a taskset originate from
multiple source cpusets (e.g., via cgroup.procs), cpuset_can_attach()
and cpuset_attach() currently evaluate the source cpuset exactly once
by caching the first task's oldcs.

This creates two distinct critical flaws for SCHED_DEADLINE tasks:

    1.  oldcs->nr_deadline_tasks is decremented once per migrating
        deadline task, but always against the first task's source
        cpuset. Tasks that originated elsewhere therefore leak their
        counts on their real source cpusets, while the first cpuset's
        count permanently underflows.

    2.  cpumask_intersects() is evaluated only against the first
        task's source cpuset. If that cpuset happens to share CPUs
        with the destination, the dl_bw_alloc() admission control is
        skipped for the entire taskset, silently admitting tasks
        that originate from entirely disjoint root domains.

Refactor the deadline accounting to evaluate task_cs(task) on a
per-task basis inside the cgroup_taskset_for_each() loops. To keep
the accounting accurate before the core cgroup migration actually
executes, move the permanent nr_deadline_tasks increments/decrements
into cpuset_can_attach(). If the migration aborts, revert the counts
via an internal rollback loop in cpuset_can_attach() or via the
cpuset_cancel_attach() callback.

Signed-off-by: Aaron Tomlin <atomlin@atomlin.com>
---
 kernel/cgroup/cpuset.c | 53 +++++++++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 14 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e3a081a07c6d..36f1d28f8ade 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -3034,32 +3034,36 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 		if (setsched_check) {
 			ret = security_task_setscheduler(task);
 			if (ret)
-				goto out_unlock;
+				goto out_unlock_reset;
 		}
 
 		if (dl_task(task)) {
+			struct cpuset *old_cs = task_cs(task);
+
 			cs->nr_migrate_dl_tasks++;
-			cs->sum_migrate_dl_bw += task->dl.dl_bw;
+			old_cs->nr_deadline_tasks--;
+			cs->nr_deadline_tasks++;
+
+			if (!cpumask_intersects(old_cs->effective_cpus,
+						cs->effective_cpus))
+				cs->sum_migrate_dl_bw += task->dl.dl_bw;
 		}
 	}
 
 	if (!cs->nr_migrate_dl_tasks)
 		goto out_success;
 
-	if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+	if (cs->sum_migrate_dl_bw) {
 		int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
 
 		if (unlikely(cpu >= nr_cpu_ids)) {
-			reset_migrate_dl_data(cs);
 			ret = -EINVAL;
-			goto out_unlock;
+			goto out_unlock_reset;
 		}
 
 		ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
-		if (ret) {
-			reset_migrate_dl_data(cs);
-			goto out_unlock;
-		}
+		if (ret)
+			goto out_unlock_reset;
 
 		cs->dl_bw_cpu = cpu;
 	}
@@ -3070,6 +3074,22 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	 * changes which zero cpus/mems_allowed.
 	 */
 	cs->attach_in_progress++;
+	goto out_unlock;
+
+out_unlock_reset:
+	if (cs->nr_migrate_dl_tasks) {
+		struct task_struct *t;
+
+		cgroup_taskset_for_each(t, css, tset) {
+			if (t == task)
+				break;
+			if (dl_task(t)) {
+				task_cs(t)->nr_deadline_tasks++;
+				cs->nr_deadline_tasks--;
+			}
+		}
+		reset_migrate_dl_data(cs);
+	}
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return ret;
@@ -3079,6 +3099,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
 	struct cgroup_subsys_state *css;
 	struct cpuset *cs;
+	struct task_struct *task;
 
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
@@ -3089,8 +3110,15 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 	if (cs->dl_bw_cpu >= 0)
 		dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
 
-	if (cs->nr_migrate_dl_tasks)
+	if (cs->nr_migrate_dl_tasks) {
+		cgroup_taskset_for_each(task, css, tset) {
+			if (dl_task(task)) {
+				task_cs(task)->nr_deadline_tasks++;
+				cs->nr_deadline_tasks--;
+			}
+		}
 		reset_migrate_dl_data(cs);
+	}
 
 	mutex_unlock(&cpuset_mutex);
 }
@@ -3195,11 +3223,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		schedule_flush_migrate_mm();
 	cs->old_mems_allowed = cpuset_attach_nodemask_to;
 
-	if (cs->nr_migrate_dl_tasks) {
-		cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
-		oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
+	if (cs->nr_migrate_dl_tasks)
 		reset_migrate_dl_data(cs);
-	}
 
 	dec_attach_in_progress_locked(cs);
 
-- 
2.51.0


                 reply	other threads:[~2026-05-12  1:03 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260512010341.101419-1-atomlin@atomlin.com \
    --to=atomlin@atomlin.com \
    --cc=cgroups@vger.kernel.org \
    --cc=chenridong@huaweicloud.com \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=mkoutny@suse.com \
    --cc=neelx@suse.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox