From: Waiman Long <longman@redhat.com>
To: "Ridong Chen" <ridong.chen@linux.dev>,
"Tejun Heo" <tj@kernel.org>,
"Johannes Weiner" <hannes@cmpxchg.org>,
"Michal Koutný" <mkoutny@suse.com>,
"Li Zefan" <lizefan@huawei.com>,
"Farhad Alemi" <farhad.alemi@berkeley.edu>,
"Andrew Morton" <akpm@linux-foundation.org>
Cc: cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
Aaron Tomlin <atomlin@atomlin.com>,
Guopeng Zhang <guopeng.zhang@linux.dev>,
Gregory Price <gourry@gourry.net>,
David Hildenbrand <david@kernel.org>,
Waiman Long <longman@redhat.com>
Subject: [PATCH v7 9/9] cgroup/cpuset: Support multiple destination cpusets for cpuset_*attach()
Date: Sat, 20 Jun 2026 23:28:16 -0400 [thread overview]
Message-ID: <20260621032816.1806773-10-longman@redhat.com> (raw)
In-Reply-To: <20260621032816.1806773-1-longman@redhat.com>
The only case where the cgroup_taskset structure requires task migration
to multiple cpusets is when enabling a cpuset controller in cgroup v2
where the newly created child cpusets inherit the same effective CPUs
and memory nodes from the parent. In that case, task migration can happen
directly with no update to tasks' CPU and memory nodes assignment and no
further work needed from the cpuset side except updating nr_deadline_tasks
when DL tasks are involved and setting old_mems_allowed in the child
cpusets.
Do that by tracking all the destination cpusets with a new dst_cs_head
singly linked list, again with the setting of attach_in_progress
associated with the insertion into the list.
It is assumed that a given cpuset cannot be both a source and a
destination cpuset. If such a condition happens or when there are multiple
destination cpusets with CPU or memory nodes changes, the current code
will not handle it correctly. So it will print a warning and fail the
attach operation in these unexpected cases as we will have to enhance
the code to support this if such use cases are valid and not coding bugs.
Signed-off-by: Waiman Long <longman@redhat.com>
---
kernel/cgroup/cpuset-internal.h | 1 +
kernel/cgroup/cpuset.c | 121 +++++++++++++++++++-------------
2 files changed, 75 insertions(+), 47 deletions(-)
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 011993b1f756..900e74ac3538 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -151,6 +151,7 @@ struct cpuset {
*/
struct llist_node attach_node;
int attach_in_progress;
+ bool attach_source;
/* partition root state */
int partition_root_state;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c2d172873166..aff86acea701 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2986,11 +2986,16 @@ static int update_prstate(struct cpuset *cs, int new_prs)
*
* The attach_cpus_updated/attach_mems_updated flags are set in either
* cpuset_can_attach() or cpuset_fork() and used in cpuset_attach_task().
+ *
+ * The attach_many_dest_cs is set when there are multiple destination cpusets
+ * for task migration.
*/
static struct cpuset *cpuset_attach_old_cs;
static LLIST_HEAD(src_cs_head);
+static LLIST_HEAD(dst_cs_head);
static bool attach_cpus_updated;
static bool attach_mems_updated;
+static bool attach_many_dest_cs;
/*
* Check to see if a cpuset can accept a new task
@@ -3013,9 +3018,25 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs,
if (!oldcs)
return 0;
+ /*
+ * The same cpuset cannot be both a source and a destination.
+ * The current code does not support that, print a warning and
+ * fail the attach if so.
+ */
+ if (WARN_ON_ONCE((!oldcs->attach_source &&
+ llist_on_list(&oldcs->attach_node)) ||
+ cs->attach_source))
+ return -EINVAL;
+
if (!llist_on_list(&oldcs->attach_node)) {
llist_add(&oldcs->attach_node, &src_cs_head);
oldcs->attach_in_progress++;
+ oldcs->attach_source = true;
+ }
+
+ if (!llist_on_list(&cs->attach_node)) {
+ llist_add(&cs->attach_node, &dst_cs_head);
+ cs->attach_in_progress++;
}
cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
@@ -3046,35 +3067,31 @@ static int cpuset_can_attach_check(struct cpuset *cs, struct cpuset *oldcs,
return 0;
}
-static int cpuset_reserve_dl_bw(struct cpuset *cs)
+static int cpuset_reserve_dl_bw(void)
{
+ struct cpuset *cs;
int cpu, ret;
- if (!cs->sum_migrate_dl_bw)
- return 0;
+ llist_for_each_entry(cs, dst_cs_head.first, attach_node) {
+ if (!cs->sum_migrate_dl_bw)
+ continue;
- cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
- if (unlikely(cpu >= nr_cpu_ids))
- return -EINVAL;
+ cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+ if (unlikely(cpu >= nr_cpu_ids))
+ return -EINVAL;
- ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
- if (ret)
- return ret;
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret)
+ return ret;
- cs->dl_bw_cpu = cpu;
+ cs->dl_bw_cpu = cpu;
+ }
return 0;
}
-static void reset_migrate_dl_data(struct cpuset *cs)
-{
- cs->nr_migrate_dl_tasks = 0;
- cs->sum_migrate_dl_bw = 0;
- cs->dl_bw_cpu = -1;
-}
-
/*
* Clear and optionally apply (@cancel is false) the attach related data in the
- * source cpusets.
+ * source or destination cpuset.
*/
static void clear_attach_data(struct llist_head *head, bool cancel)
{
@@ -3086,9 +3103,14 @@ static void clear_attach_data(struct llist_head *head, bool cancel)
if (cs->nr_migrate_dl_tasks) {
if (!cancel)
cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+ else if (cs->dl_bw_cpu >= 0) /* && cancel */
+ dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
cs->nr_migrate_dl_tasks = 0;
+ cs->sum_migrate_dl_bw = 0;
+ cs->dl_bw_cpu = -1;
}
dec_attach_in_progress_locked(cs);
+ cs->attach_source = false;
}
}
@@ -3109,6 +3131,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
mutex_lock(&cpuset_mutex);
attach_cpus_updated = false;
attach_mems_updated = false;
+ attach_many_dest_cs = false;
/* Check to see if task is allowed in the cpuset */
ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
@@ -3133,9 +3156,13 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* selected as cpuset_attach_old_cs.
*/
cgroup_taskset_for_each(task, css, tset) {
+ struct cpuset *new_cs = css_cs(css);
struct cpuset *new_oldcs = task_cs(task);
- if (new_oldcs != oldcs) {
+ if ((new_oldcs != oldcs) || (new_cs != cs)) {
+ if (new_cs != cs)
+ attach_many_dest_cs = true;
+ cs = new_cs;
oldcs = new_oldcs;
ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
if (ret)
@@ -3169,14 +3196,28 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
}
}
- ret = cpuset_reserve_dl_bw(cs);
+ /*
+ * The only case where there are multiple destination cpusets for
+ * task migration is when enabling the v2 cpuset controller where
+ * tasks will be migrated to multiple child cpusets from a parent
+ * cpuset with the same effective CPUs and memory nodes. IOW,
+ * both attach_cpus_updated and attach_mems_updated should be false.
+ * If not, it is a condition that the current code cannot handle.
+ * Print a warning and abort the attach operation as further code
+ * change will be needed.
+ */
+ if (WARN_ON_ONCE(attach_many_dest_cs && (!cpuset_v2() ||
+ attach_cpus_updated || attach_mems_updated))) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = cpuset_reserve_dl_bw();
out_unlock:
if (ret) {
- reset_migrate_dl_data(cs);
clear_attach_data(&src_cs_head, true);
- } else {
- cs->attach_in_progress++;
+ clear_attach_data(&dst_cs_head, true);
}
mutex_unlock(&cpuset_mutex);
@@ -3185,22 +3226,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
- struct cgroup_subsys_state *css;
- struct cpuset *cs;
-
- cgroup_taskset_first(tset, &css);
- cs = css_cs(css);
-
mutex_lock(&cpuset_mutex);
- dec_attach_in_progress_locked(cs);
clear_attach_data(&src_cs_head, true);
-
- if (cs->dl_bw_cpu >= 0)
- dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
-
- if (cs->nr_migrate_dl_tasks)
- reset_migrate_dl_data(cs);
-
+ clear_attach_data(&dst_cs_head, true);
mutex_unlock(&cpuset_mutex);
}
@@ -3286,26 +3314,25 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* In the default hierarchy, enabling cpuset in the child cgroups
* will trigger a cpuset_attach() call with no change in effective cpus
* and mems. In that case, we can optimize out by skipping the task
- * iteration and update.
+ * iteration and update, but the destination cpuset list is iterated to
+ * set old_mems_allowed.
*/
- if (cpuset_v2() && !attach_cpus_updated && !attach_mems_updated)
+ if (cpuset_v2() && !attach_cpus_updated && !attach_mems_updated) {
+ llist_for_each_entry(cs, dst_cs_head.first, attach_node)
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
goto out;
+ }
+ /* Task iteration shouldn't happen with attach_many_dest_cs set */
cgroup_taskset_for_each(task, css, tset)
cpuset_attach_task(cs, task);
-out:
if (queue_task_work)
schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
-
- if (cs->nr_migrate_dl_tasks) {
- cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
- reset_migrate_dl_data(cs);
- }
-
+out:
clear_attach_data(&src_cs_head, false);
- dec_attach_in_progress_locked(cs);
+ clear_attach_data(&dst_cs_head, false);
mutex_unlock(&cpuset_mutex);
}
--
2.54.0
prev parent reply other threads:[~2026-06-21 3:29 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-21 3:28 [PATCH v7 0/9] cgroup/cpuset: Support multiple source/destination cpusets for cpuset_*attach() Waiman Long
2026-06-21 3:28 ` [PATCH v7 1/9] cgroup/cpuset: rebind mm mempolicy to effective_mems, not mems_allowed Waiman Long
2026-06-21 3:28 ` [PATCH v7 2/9] cgroup/cpuset: Fix node inconsistencies between cpuset_update_tasks_nodemask() and cpuset_attach() Waiman Long
2026-06-21 3:28 ` [PATCH v7 3/9] cgroup/cpuset: Prevent race between task attach and cpuset state change Waiman Long
2026-06-21 3:28 ` [PATCH v7 4/9] cgroup/cpuset: Add a cpuset_reserve_dl_bw() helper Waiman Long
2026-06-21 3:28 ` [PATCH v7 5/9] cgroup/cpuset: Expand the scope of cpuset_can_attach_check() Waiman Long
2026-06-21 3:28 ` [PATCH v7 6/9] cgroup/cpuset: Make cpuset_attach_old_cs track task group leaders Waiman Long
2026-06-21 3:28 ` [PATCH v7 7/9] cgroup/cpuset: Move mpol_rebind_mm/cpuset_migrate_mm() calls inside cpuset_attach_task() Waiman Long
2026-06-21 3:28 ` [PATCH v7 8/9] cgroup/cpuset: Support multiple source cpusets for cpuset_*attach() Waiman Long
2026-06-21 3:28 ` Waiman Long [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260621032816.1806773-10-longman@redhat.com \
--to=longman@redhat.com \
--cc=akpm@linux-foundation.org \
--cc=atomlin@atomlin.com \
--cc=cgroups@vger.kernel.org \
--cc=david@kernel.org \
--cc=farhad.alemi@berkeley.edu \
--cc=gourry@gourry.net \
--cc=guopeng.zhang@linux.dev \
--cc=hannes@cmpxchg.org \
--cc=linux-kernel@vger.kernel.org \
--cc=lizefan@huawei.com \
--cc=mkoutny@suse.com \
--cc=ridong.chen@linux.dev \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox