All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ridong Chen <ridong.chen@linux.dev>
To: Waiman Long <longman@redhat.com>
Cc: cgroups@vger.kernel.org, Tejun Heo <tj@kernel.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	ridong.chen@linux.dev, linux-kernel@vger.kernel.org
Subject: [PATCH] cgroup/cpuset: Support multiple source/destination cpusets using pids pattern
Date: Wed,  3 Jun 2026 18:26:03 +0800	[thread overview]
Message-ID: <20260603102604.177503-1-ridong.chen@linux.dev> (raw)
In-Reply-To: <20260602023203.248077-7-longman@redhat.com>

The current cpuset_can_attach() and cpuset_attach() functions assume task
migration is from one source cpuset to one destination cpuset. This can be
wrong in several scenarios:
 - Moving a multi-threaded process with threads in different cpusets
 - Disabling the cpuset controller (many children to one parent)
 - Enabling the cpuset controller (one parent to many children)

Fix this by adopting the pids subsystem's per-task accounting pattern.
In cpuset_can_attach(), use task_cs(task) to get the correct source cpuset
for each task (like pids_can_attach uses task_css), adjust nr_deadline_tasks
and reserve DL bandwidth per-task, and increment attach_in_progress per-task
on the destination cpuset. In cpuset_attach(), handle destination cpuset
changes within the task iteration loop.

A shared helper cpuset_undo_attach() reverses the per-task operations for
both partial rollback in cpuset_can_attach() and full reversal in
cpuset_cancel_attach().

When multiple source cpusets are detected in can_attach(), set
attach_many_sources so that cpuset_attach() forces cpus_updated and
mems_updated to true, ensuring all tasks get properly updated regardless
of which source cpuset cpuset_attach_old_cs points to.

This eliminates the need for nr_migrate_dl_tasks, sum_migrate_dl_bw, and
dl_bw_cpu fields in struct cpuset.

Fixes: 4ec22e9c5a90 ("cpuset: Enable cpuset controller in default hierarchy")
Signed-off-by: Ridong Chen <ridong.chen@linux.dev>
---
 kernel/cgroup/cpuset-internal.h |   8 --
 kernel/cgroup/cpuset.c          | 177 ++++++++++++++++----------------
 2 files changed, 89 insertions(+), 96 deletions(-)

diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index f7aaf01f7cd5..601e38b3c75b 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -166,14 +166,6 @@ struct cpuset {
 	 * know when to rebuild associated root domain bandwidth information.
 	 */
 	int nr_deadline_tasks;
-	int nr_migrate_dl_tasks;
-	/* DL bandwidth that needs destination reservation for this attach. */
-	u64 sum_migrate_dl_bw;
-	/*
-	 * CPU used for temporary DL bandwidth allocation during attach;
-	 * -1 if no DL bandwidth was allocated in the current attach.
-	 */
-	int dl_bw_cpu;
 
 	/* Invalid partition error code, not lock protected */
 	enum prs_errcode prs_err;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e52a5a40d607..be222eb6078c 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -288,7 +288,6 @@ struct cpuset top_cpuset = {
 	.flags = BIT(CS_CPU_EXCLUSIVE) |
 		 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
 	.partition_root_state = PRS_ROOT,
-	.dl_bw_cpu = -1,
 };
 
 /**
@@ -580,8 +579,6 @@ static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
 	if (!trial)
 		return NULL;
 
-	trial->dl_bw_cpu = -1;
-
 	/* Setup cpumask pointer array */
 	cpumask_var_t *pmask[4] = {
 		&trial->cpus_allowed,
@@ -2984,6 +2981,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 static struct cpuset *cpuset_attach_old_cs;
 static bool attach_cpus_updated;
 static bool attach_mems_updated;
+static bool attach_many_sources;
 
 /*
  * Check to see if a cpuset can accept a new task
@@ -3026,30 +3024,36 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs,
 	return 0;
 }
 
-static int cpuset_reserve_dl_bw(struct cpuset *cs)
+/*
+ * Reverse per-task operations done in cpuset_can_attach().
+ * If @stop_at is non-NULL, only undo tasks before it (partial rollback).
+ * If @stop_at is NULL, undo all tasks (full reversal for cancel_attach).
+ * Must be called with cpuset_mutex held.
+ */
+static void cpuset_undo_attach(struct cgroup_taskset *tset,
+			       struct task_struct *stop_at)
 {
-	int cpu, ret;
-
-	if (!cs->sum_migrate_dl_bw)
-		return 0;
-
-	cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
-	if (unlikely(cpu >= nr_cpu_ids))
-		return -EINVAL;
+	struct cgroup_subsys_state *css;
+	struct task_struct *task;
 
-	ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
-	if (ret)
-		return ret;
+	cgroup_taskset_for_each(task, css, tset) {
+		struct cpuset *cs = css_cs(css);
+		struct cpuset *oldcs = task_cs(task);
 
-	cs->dl_bw_cpu = cpu;
-	return 0;
-}
+		if (task == stop_at)
+			break;
 
-static void reset_migrate_dl_data(struct cpuset *cs)
-{
-	cs->nr_migrate_dl_tasks = 0;
-	cs->sum_migrate_dl_bw = 0;
-	cs->dl_bw_cpu = -1;
+		if (dl_task(task)) {
+			cs->nr_deadline_tasks--;
+			oldcs->nr_deadline_tasks++;
+			if (dl_task_needs_bw_move(task, cs->effective_cpus)) {
+				int cpu = cpumask_any_and(cpu_active_mask,
+							 cs->effective_cpus);
+				dl_bw_free(cpu, task->dl.dl_bw);
+			}
+		}
+		dec_attach_in_progress_locked(cs);
+	}
 }
 
 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
@@ -3061,96 +3065,79 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 	bool setsched_check;
 	int ret;
 
-	/* used later by cpuset_attach() */
 	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
 	oldcs = cpuset_attach_old_cs;
 	cs = css_cs(css);
 
 	mutex_lock(&cpuset_mutex);
+	attach_many_sources = false;
 
-	/* Check to see if task is allowed in the cpuset */
 	ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
 	if (ret)
 		goto out_unlock;
 
-	/*
-	 * The cpuset_attach_old_cs is used mainly by cpuset_migrate_mm() to get
-	 * the old_mems_allowed value. There are two ways that many-to-one
-	 * cpuset migration can happen:
-	 * 1) A multithread application with threads in different cpusets is
-	 *    wholely migrated to a new cpuset.
-	 * 2) Disabling v2 cpuset controller will move all the tasks in child
-	 *    cpusets to the parent cpuset.
-	 *
-	 * In the former case, it is the mm setting of the group leader that
-	 * really matters. So cpuset_attach_old_cs should track the oldcs of the
-	 * group leader. It falls back to the oldcs of the first task if there
-	 * is no group leader in the taskset. In the latter case, effective_mems
-	 * of child cpusets must always be a subset of the parent. So no real
-	 * page migration will be necessary no matter which child cpuset is
-	 * selected as cpuset_attach_old_cs.
-	 */
 	cgroup_taskset_for_each(task, css, tset) {
+		struct cpuset *newcs = css_cs(css);
+		struct cpuset *new_oldcs = task_cs(task);
+
+		if (newcs != cs || new_oldcs != oldcs) {
+			if (new_oldcs != oldcs)
+				attach_many_sources = true;
+			cs = newcs;
+			oldcs = new_oldcs;
+			ret = cpuset_can_attach_check(cs, oldcs,
+						      &setsched_check);
+			if (ret)
+				goto out_rollback;
+		}
+
 		ret = task_can_attach(task);
 		if (ret)
-			goto out_unlock;
+			goto out_rollback;
 
-		/* Update cpuset_attach_old_cs to the latest group leader */
 		if (task == task->group_leader)
 			cpuset_attach_old_cs = task_cs(task);
 
 		if (setsched_check) {
 			ret = security_task_setscheduler(task);
 			if (ret)
-				goto out_unlock;
+				goto out_rollback;
 		}
 
 		if (dl_task(task)) {
-			/*
-			 * Count all migrating DL tasks for cpuset task accounting.
-			 * Only tasks that need a root-domain bandwidth move
-			 * contribute to sum_migrate_dl_bw.
-			 */
-			cs->nr_migrate_dl_tasks++;
-			if (dl_task_needs_bw_move(task, cs->effective_cpus))
-				cs->sum_migrate_dl_bw += task->dl.dl_bw;
+			cs->nr_deadline_tasks++;
+			oldcs->nr_deadline_tasks--;
+
+			if (dl_task_needs_bw_move(task, cs->effective_cpus)) {
+				int cpu = cpumask_any_and(cpu_active_mask,
+							 cs->effective_cpus);
+				if (unlikely(cpu >= nr_cpu_ids)) {
+					ret = -EINVAL;
+					goto out_rollback;
+				}
+				ret = dl_bw_alloc(cpu, task->dl.dl_bw);
+				if (ret)
+					goto out_rollback;
+			}
 		}
-	}
-
-	ret = cpuset_reserve_dl_bw(cs);
 
-out_unlock:
-	if (ret) {
-		reset_migrate_dl_data(cs);
-	} else {
-		/*
-		 * Mark attach is in progress.  This makes validate_change() fail
-		 * changes which zero cpus/mems_allowed.
-		 */
 		cs->attach_in_progress++;
 	}
 
+	goto out_unlock;
+
+out_rollback:
+	cpuset_undo_attach(tset, task);
+
+out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return ret;
 }
 
 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
 {
-	struct cgroup_subsys_state *css;
-	struct cpuset *cs;
-
-	cgroup_taskset_first(tset, &css);
-	cs = css_cs(css);
-
 	mutex_lock(&cpuset_mutex);
-	dec_attach_in_progress_locked(cs);
-
-	if (cs->dl_bw_cpu >= 0)
-		dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
-
-	if (cs->nr_migrate_dl_tasks)
-		reset_migrate_dl_data(cs);
-
+	cpuset_undo_attach(tset, NULL);
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -3232,8 +3219,15 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	mutex_lock(&cpuset_mutex);
 	queue_task_work = false;
 
-	attach_cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
-	attach_mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+	if (attach_many_sources) {
+		attach_cpus_updated = true;
+		attach_mems_updated = true;
+	} else {
+		attach_cpus_updated = !cpumask_equal(cs->effective_cpus,
+						     oldcs->effective_cpus);
+		attach_mems_updated = !nodes_equal(cs->effective_mems,
+						    oldcs->effective_mems);
+	}
 
 	/*
 	 * In the default hierarchy, enabling cpuset in the child cgroups
@@ -3249,21 +3243,28 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
 	}
 
-	cgroup_taskset_for_each(task, css, tset)
+	cgroup_taskset_for_each(task, css, tset) {
+		struct cpuset *newcs = css_cs(css);
+
+		if (newcs != cs) {
+			cs->old_mems_allowed = cpuset_attach_nodemask_to;
+			cs = newcs;
+			if (cpuset_v2())
+				cpuset_attach_nodemask_to = cs->effective_mems;
+			else
+				guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+		}
 		cpuset_attach_task(cs, task);
+	}
 
 out:
 	if (queue_task_work)
 		schedule_flush_migrate_mm();
 	cs->old_mems_allowed = cpuset_attach_nodemask_to;
 
-	if (cs->nr_migrate_dl_tasks) {
-		cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
-		oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
-		reset_migrate_dl_data(cs);
-	}
-
-	dec_attach_in_progress_locked(cs);
+	/* Decrement per-task attach_in_progress */
+	cgroup_taskset_for_each(task, css, tset)
+		dec_attach_in_progress_locked(css_cs(css));
 
 	mutex_unlock(&cpuset_mutex);
 }
-- 
2.43.0


  reply	other threads:[~2026-06-03 10:26 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-02  2:31 [PATCH-next v5 0/6] cgroup/cpuset: Support multiple source/destination cpusets for cpuset_*attach() Waiman Long
2026-06-02  2:31 ` [PATCH-next v5 1/6] cgroup/cpuset: Fix node inconsistencies between cpuset_update_tasks_nodemask() and cpuset_attach() Waiman Long
2026-06-02 13:37   ` Ridong Chen
2026-06-02 18:43     ` Waiman Long
2026-06-02  2:31 ` [PATCH-next v5 2/6] cgroup/cpuset: Add a cpuset_reserve_dl_bw() helper Waiman Long
2026-06-02 13:40   ` Ridong Chen
2026-06-02  2:32 ` [PATCH-next v5 3/6] cgroup/cpuset: Expand the scope of cpuset_can_attach_check() Waiman Long
2026-06-02 13:51   ` Ridong Chen
2026-06-02  2:32 ` [PATCH-next v5 4/6] cgroup/cpuset: Make cpuset_attach_old_cs track task group leaders Waiman Long
2026-06-02 13:58   ` Ridong Chen
2026-06-02  2:32 ` [PATCH-next v5 5/6] cgroup/cpuset: Move mpol_rebind_mm/cpuset_migrate_mm() calls inside cpuset_attach_task() Waiman Long
2026-06-02  2:32 ` [PATCH-next v5 6/6] cgroup/cpuset: Support multiple source/destination cpusets for cpuset_*attach() Waiman Long
2026-06-03 10:26   ` Ridong Chen [this message]
2026-06-03 10:32     ` [PATCH] cgroup/cpuset: Support multiple source/destination cpusets using pids pattern Ridong Chen
2026-06-03 18:47     ` Waiman Long
2026-06-05  7:35       ` Ridong Chen
2026-06-05 17:15         ` Waiman Long
2026-06-07  3:12           ` Ridong Chen
2026-06-08 18:49             ` Waiman Long
2026-06-07  3:22           ` Ridong Chen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260603102604.177503-1-ridong.chen@linux.dev \
    --to=ridong.chen@linux.dev \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.