Linux cgroups development
 help / color / mirror / Atom feed
From: Ridong Chen <ridong.chen@linux.dev>
To: Waiman Long <longman@redhat.com>
Cc: cgroups@vger.kernel.org, Tejun Heo <tj@kernel.org>,
	Johannes Weiner <hannes@cmpxchg.org>,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH] cgroup/cpuset: Support multiple source/destination cpusets using pids pattern
Date: Thu, 11 Jun 2026 14:17:25 +0800	[thread overview]
Message-ID: <c81e3212-cc6d-41e8-b34a-b2e5b2d57db6@linux.dev> (raw)
In-Reply-To: <e7bbd0b7-6c44-4875-aaed-160791adc9b2@redhat.com>



On 6/9/2026 2:49 AM, Waiman Long wrote:
> 
> On 6/6/26 11:12 PM, Ridong Chen wrote:
>> When the cpuset controller is enabled/disabled in a parent cgroup, tasks
>> from multiple child cpusets need to be migrated. The current code only
>> handles a single source/destination pair.
>>
>> Support multiple source/destination cpusets by adopting the per-task
>> processing pattern similar to the pids controller:
>>
>> 1) Perform per-task DL bandwidth reservation (dl_bw_alloc) directly in
>>     cpuset_can_attach() instead of batching into sum_migrate_dl_bw. This
>>     eliminates the sum_migrate_dl_bw and dl_bw_cpu fields from the cpuset
>>     struct.
>>
>> 2) Track attach_in_progress per-task per-destination cpuset to properly
>>     guard all involved cpusets from having their cpus/mems zeroed.
>>
>> 3) Use a shared cpuset_undo_attach() helper for both rollback-on-error
>>     in cpuset_can_attach() and for cpuset_cancel_attach().
>>
>> 4) Detect many-source migrations and force cpus_updated/mems_updated
>>     to true so all tasks get properly updated during attach.
>>
>> 5) Defer nr_deadline_tasks updates to cpuset_attach() (after migration
>>     is committed) to avoid a race with dl_rebuild_rd_accounting() that
>>     could see inconsistent values between can_attach and attach.
>>
>> Signed-off-by: Ridong Chen <ridong.chen@linux.dev>
>> ---
>>   kernel/cgroup/cpuset-internal.h |   7 --
>>   kernel/cgroup/cpuset.c          | 167 ++++++++++++++++----------------
>>   2 files changed, 84 insertions(+), 90 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset- 
>> internal.h
>> index f7aaf01f7cd5..8f32cb97eb94 100644
>> --- a/kernel/cgroup/cpuset-internal.h
>> +++ b/kernel/cgroup/cpuset-internal.h
>> @@ -167,13 +167,6 @@ struct cpuset {
>>        */
>>       int nr_deadline_tasks;
>>       int nr_migrate_dl_tasks;
>> -    /* DL bandwidth that needs destination reservation for this 
>> attach. */
>> -    u64 sum_migrate_dl_bw;
>> -    /*
>> -     * CPU used for temporary DL bandwidth allocation during attach;
>> -     * -1 if no DL bandwidth was allocated in the current attach.
>> -     */
>> -    int dl_bw_cpu;
>>       /* Invalid partition error code, not lock protected */
>>       enum prs_errcode prs_err;
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index e52a5a40d607..a6d96a39cdb1 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -288,7 +288,6 @@ struct cpuset top_cpuset = {
>>       .flags = BIT(CS_CPU_EXCLUSIVE) |
>>            BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
>>       .partition_root_state = PRS_ROOT,
>> -    .dl_bw_cpu = -1,
>>   };
>>   /**
>> @@ -580,8 +579,6 @@ static struct cpuset *dup_or_alloc_cpuset(struct 
>> cpuset *cs)
>>       if (!trial)
>>           return NULL;
>> -    trial->dl_bw_cpu = -1;
>> -
>>       /* Setup cpumask pointer array */
>>       cpumask_var_t *pmask[4] = {
>>           &trial->cpus_allowed,
>> @@ -3026,31 +3023,36 @@ static int cpuset_can_attach_check(struct 
>> cpuset *cs, struct cpuset *oldcs,
>>       return 0;
>>   }
>> -static int cpuset_reserve_dl_bw(struct cpuset *cs)
>> +/*
>> + * Undo DL bandwidth reservations and attach_in_progress increments done
>> + * in cpuset_can_attach(). Used for both rollback on error and 
>> cancel_attach.
>> + * If @stop_at is non-NULL, undo only for tasks before @stop_at in 
>> the tset.
>> + */
>> +static void cpuset_undo_attach(struct cgroup_taskset *tset,
>> +                   struct task_struct *stop_at)
>>   {
>> -    int cpu, ret;
>> -
>> -    if (!cs->sum_migrate_dl_bw)
>> -        return 0;
>> +    struct cgroup_subsys_state *css;
>> +    struct task_struct *task;
>> -    cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
>> -    if (unlikely(cpu >= nr_cpu_ids))
>> -        return -EINVAL;
>> +    cgroup_taskset_for_each(task, css, tset) {
>> +        struct cpuset *cs = css_cs(css);
>> -    ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
>> -    if (ret)
>> -        return ret;
>> +        if (task == stop_at)
>> +            break;
>> -    cs->dl_bw_cpu = cpu;
>> -    return 0;
>> +        if (dl_task(task)) {
>> +            cs->nr_migrate_dl_tasks--;
>> +            if (dl_task_needs_bw_move(task, cs->effective_cpus)) {
>> +                int cpu = cpumask_any_and(cpu_active_mask,
>> +                             cs->effective_cpus);
>> +                dl_bw_free(cpu, task->dl.dl_bw);
>> +            }
>> +        }
>> +        dec_attach_in_progress_locked(cs);
>> +    }
>>   }
>> -static void reset_migrate_dl_data(struct cpuset *cs)
>> -{
>> -    cs->nr_migrate_dl_tasks = 0;
>> -    cs->sum_migrate_dl_bw = 0;
>> -    cs->dl_bw_cpu = -1;
>> -}
>> +static bool attach_many_sources;
>>   /* Called by cgroups to determine if a cpuset is usable; 
>> cpuset_mutex held */
>>   static int cpuset_can_attach(struct cgroup_taskset *tset)
>> @@ -3067,90 +3069,73 @@ static int cpuset_can_attach(struct 
>> cgroup_taskset *tset)
>>       cs = css_cs(css);
>>       mutex_lock(&cpuset_mutex);
>> +    attach_many_sources = false;
>>       /* Check to see if task is allowed in the cpuset */
>>       ret = cpuset_can_attach_check(cs, oldcs, &setsched_check);
>>       if (ret)
>>           goto out_unlock;
>> -    /*
>> -     * The cpuset_attach_old_cs is used mainly by cpuset_migrate_mm() 
>> to get
>> -     * the old_mems_allowed value. There are two ways that many-to-one
>> -     * cpuset migration can happen:
>> -     * 1) A multithread application with threads in different cpusets is
>> -     *    wholely migrated to a new cpuset.
>> -     * 2) Disabling v2 cpuset controller will move all the tasks in 
>> child
>> -     *    cpusets to the parent cpuset.
>> -     *
>> -     * In the former case, it is the mm setting of the group leader that
>> -     * really matters. So cpuset_attach_old_cs should track the oldcs 
>> of the
>> -     * group leader. It falls back to the oldcs of the first task if 
>> there
>> -     * is no group leader in the taskset. In the latter case, 
>> effective_mems
>> -     * of child cpusets must always be a subset of the parent. So no 
>> real
>> -     * page migration will be necessary no matter which child cpuset is
>> -     * selected as cpuset_attach_old_cs.
>> -     */
>>       cgroup_taskset_for_each(task, css, tset) {
>> +        struct cpuset *newcs = css_cs(css);
>> +        struct cpuset *new_oldcs = task_cs(task);
>> +
>> +        if (newcs != cs || new_oldcs != oldcs) {
>> +            if (new_oldcs != oldcs)
>> +                attach_many_sources = true;
>> +            cs = newcs;
>> +            oldcs = new_oldcs;
>> +            ret = cpuset_can_attach_check(cs, oldcs,
>> +                              &setsched_check);
>> +            if (ret)
>> +                goto out_rollback;
>> +        }
>> +
>>           ret = task_can_attach(task);
>>           if (ret)
>> -            goto out_unlock;
>> +            goto out_rollback;
>> -        /* Update cpuset_attach_old_cs to the latest group leader */
>>           if (task == task->group_leader)
>>               cpuset_attach_old_cs = task_cs(task);
>>           if (setsched_check) {
>>               ret = security_task_setscheduler(task);
>>               if (ret)
>> -                goto out_unlock;
>> +                goto out_rollback;
>>           }
>>           if (dl_task(task)) {
>> -            /*
>> -             * Count all migrating DL tasks for cpuset task accounting.
>> -             * Only tasks that need a root-domain bandwidth move
>> -             * contribute to sum_migrate_dl_bw.
>> -             */
>>               cs->nr_migrate_dl_tasks++;
>> -            if (dl_task_needs_bw_move(task, cs->effective_cpus))
>> -                cs->sum_migrate_dl_bw += task->dl.dl_bw;
>> +            if (dl_task_needs_bw_move(task, cs->effective_cpus)) {
>> +                int cpu = cpumask_any_and(cpu_active_mask,
>> +                             cs->effective_cpus);
>> +                if (unlikely(cpu >= nr_cpu_ids)) {
>> +                    ret = -EINVAL;
>> +                    goto out_rollback;
>> +                }
>> +                ret = dl_bw_alloc(cpu, task->dl.dl_bw);
>> +                if (ret)
>> +                    goto out_rollback;
>> +            }
>>           }
>> -    }
>> -
>> -    ret = cpuset_reserve_dl_bw(cs);
>> -out_unlock:
>> -    if (ret) {
>> -        reset_migrate_dl_data(cs);
>> -    } else {
>> -        /*
>> -         * Mark attach is in progress.  This makes validate_change() 
>> fail
>> -         * changes which zero cpus/mems_allowed.
>> -         */
>>           cs->attach_in_progress++;
>>       }
>> +    goto out_unlock;
>> +
>> +out_rollback:
>> +    cpuset_undo_attach(tset, task);
>> +
>> +out_unlock:
>>       mutex_unlock(&cpuset_mutex);
>>       return ret;
>>   }
>>   static void cpuset_cancel_attach(struct cgroup_taskset *tset)
>>   {
>> -    struct cgroup_subsys_state *css;
>> -    struct cpuset *cs;
>> -
>> -    cgroup_taskset_first(tset, &css);
>> -    cs = css_cs(css);
>> -
>>       mutex_lock(&cpuset_mutex);
>> -    dec_attach_in_progress_locked(cs);
>> -
>> -    if (cs->dl_bw_cpu >= 0)
>> -        dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw);
>> -
>> -    if (cs->nr_migrate_dl_tasks)
>> -        reset_migrate_dl_data(cs);
>> -
>> +    cpuset_undo_attach(tset, NULL);
>>       mutex_unlock(&cpuset_mutex);
>>   }
>> @@ -3232,8 +3217,15 @@ static void cpuset_attach(struct cgroup_taskset 
>> *tset)
>>       mutex_lock(&cpuset_mutex);
>>       queue_task_work = false;
>> -    attach_cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs- 
>> >effective_cpus);
>> -    attach_mems_updated = !nodes_equal(cs->effective_mems, oldcs- 
>> >effective_mems);
>> +    if (attach_many_sources) {
>> +        attach_cpus_updated = true;
>> +        attach_mems_updated = true;
>> +    } else {
>> +        attach_cpus_updated = !cpumask_equal(cs->effective_cpus,
>> +                            oldcs->effective_cpus);
>> +        attach_mems_updated = !nodes_equal(cs->effective_mems,
>> +                           oldcs->effective_mems);
>> +    }
>>       /*
>>        * In the default hierarchy, enabling cpuset in the child cgroups
>> @@ -3250,20 +3242,29 @@ static void cpuset_attach(struct 
>> cgroup_taskset *tset)
>>       }
>>       cgroup_taskset_for_each(task, css, tset)
>> -        cpuset_attach_task(cs, task);
>> +        cpuset_attach_task(css_cs(css), task);
>>   out:
>>       if (queue_task_work)
>>           schedule_flush_migrate_mm();
>>       cs->old_mems_allowed = cpuset_attach_nodemask_to;
>> -    if (cs->nr_migrate_dl_tasks) {
>> -        cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
>> -        oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
>> -        reset_migrate_dl_data(cs);
>> -    }
>> +    /*
>> +     * Update nr_deadline_tasks now that migration is committed.
>> +     * nr_migrate_dl_tasks was accumulated per-dst in can_attach but
>> +     * nr_deadline_tasks is deferred to here to avoid a race with
>> +     * dl_rebuild_rd_accounting() between can_attach and attach.
>> +     */
>> +    cgroup_taskset_for_each(task, css, tset) {
>> +        struct cpuset *dst_cs = css_cs(css);
>> -    dec_attach_in_progress_locked(cs);
>> +        if (dst_cs->nr_migrate_dl_tasks) {
>> +            dst_cs->nr_deadline_tasks += dst_cs->nr_migrate_dl_tasks;
>> +            oldcs->nr_deadline_tasks -= dst_cs->nr_migrate_dl_tasks;
>> +            dst_cs->nr_migrate_dl_tasks = 0;
>> +        }
>> +        dec_attach_in_progress_locked(dst_cs);
>> +    }
> 
> You are assuming that there is only one source cpuset. That may not be 
> true. I suppose that we may not track the set of destination cpusets and 
> they will be hit during task iteration. We will still need to track all 
> the source cpusets.
> 
> Cheers,
> Longman
> 
>>       mutex_unlock(&cpuset_mutex);
>>   }
> 

Yeah, we still need the per-src here. I agree that we can just track the 
source cpusets.

-- 
Best regards
Ridong


  reply	other threads:[~2026-06-11  6:17 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-02  2:31 [PATCH-next v5 0/6] cgroup/cpuset: Support multiple source/destination cpusets for cpuset_*attach() Waiman Long
2026-06-02  2:31 ` [PATCH-next v5 1/6] cgroup/cpuset: Fix node inconsistencies between cpuset_update_tasks_nodemask() and cpuset_attach() Waiman Long
2026-06-02 13:37   ` Ridong Chen
2026-06-02 18:43     ` Waiman Long
2026-06-02  2:31 ` [PATCH-next v5 2/6] cgroup/cpuset: Add a cpuset_reserve_dl_bw() helper Waiman Long
2026-06-02 13:40   ` Ridong Chen
2026-06-02  2:32 ` [PATCH-next v5 3/6] cgroup/cpuset: Expand the scope of cpuset_can_attach_check() Waiman Long
2026-06-02 13:51   ` Ridong Chen
2026-06-02  2:32 ` [PATCH-next v5 4/6] cgroup/cpuset: Make cpuset_attach_old_cs track task group leaders Waiman Long
2026-06-02 13:58   ` Ridong Chen
2026-06-02  2:32 ` [PATCH-next v5 5/6] cgroup/cpuset: Move mpol_rebind_mm/cpuset_migrate_mm() calls inside cpuset_attach_task() Waiman Long
2026-06-02  2:32 ` [PATCH-next v5 6/6] cgroup/cpuset: Support multiple source/destination cpusets for cpuset_*attach() Waiman Long
2026-06-03 10:26   ` [PATCH] cgroup/cpuset: Support multiple source/destination cpusets using pids pattern Ridong Chen
2026-06-03 10:32     ` Ridong Chen
2026-06-03 18:47     ` Waiman Long
2026-06-05  7:35       ` Ridong Chen
2026-06-05 17:15         ` Waiman Long
2026-06-07  3:12           ` Ridong Chen
2026-06-08 18:49             ` Waiman Long
2026-06-11  6:17               ` Ridong Chen [this message]
2026-06-07  3:22           ` Ridong Chen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c81e3212-cc6d-41e8-b34a-b2e5b2d57db6@linux.dev \
    --to=ridong.chen@linux.dev \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=longman@redhat.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox