* [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
@ 2026-01-22 11:29 Mateusz Guzik
  2026-01-27 14:30 ` Mateusz Guzik
  0 siblings, 1 reply; 11+ messages in thread
From: Mateusz Guzik @ 2026-01-22 11:29 UTC (permalink / raw)
  To: tj, hannes, mkoutny; +Cc: brauner, linux-kernel, cgroups, Mateusz Guzik

In the stock kernel the css_set_lock is taken three times during the
thread life cycle, turning it into the primary bottleneck in fork-heavy
workloads.

The acquire in preparation for clone can be avoided with a sequence
counter, which in turn pushes the lock down the profile.

This accounts for only a 6% speedup when creating threads in parallel
on 20 cores, as most of the contention shifts to pidmap_lock.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

v2:
- change the comment about clone_seq
- raw_write_seqcount* -> write_seqcount
- just loop on a failed seq check
- don't bump it on task exit

 kernel/cgroup/cgroup-internal.h | 11 +++++--
 kernel/cgroup/cgroup.c          | 54 +++++++++++++++++++++++++--------
 2 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 22051b4f1ccb..04a3aadcbc7f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -194,6 +194,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
+/*
+ * refcounted get/put for css_set objects
+ */
 void put_css_set_locked(struct css_set *cset);
 
 static inline void put_css_set(struct css_set *cset)
@@ -213,14 +216,16 @@ static inline void put_css_set(struct css_set *cset)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
-/*
- * refcounted get/put for css_set objects
- */
 static inline void get_css_set(struct css_set *cset)
 {
 	refcount_inc(&cset->refcount);
 }
 
+static inline bool get_css_set_not_zero(struct css_set *cset)
+{
+	return refcount_inc_not_zero(&cset->refcount);
+}
+
 bool cgroup_ssid_enabled(int ssid);
 bool cgroup_on_dfl(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 94788bd1fdf0..0053582b9b56 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -87,7 +87,14 @@
  * cgroup.h can use them for lockdep annotations.
  */
 DEFINE_MUTEX(cgroup_mutex);
-DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
+
+/*
+ * css_set_for_clone_seq synchronizes access to task_struct::cgroups
+ * and cgroup::kill_seq used on the clone path.
+ */
+static __cacheline_aligned seqcount_spinlock_t css_set_for_clone_seq =
+	SEQCNT_SPINLOCK_ZERO(css_set_for_clone_seq, &css_set_lock);
 
 #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
@@ -907,6 +914,7 @@ static void css_set_skip_task_iters(struct css_set *cset,
  * @from_cset: css_set @task currently belongs to (may be NULL)
  * @to_cset: new css_set @task is being moved to (may be NULL)
  * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ * @skip_clone_seq: don't bump css_set_for_clone_seq
  *
  * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
  * css_set, @from_cset can be NULL.  If @task is being disassociated
@@ -918,13 +926,16 @@ static void css_set_skip_task_iters(struct css_set *cset,
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
-			      bool use_mg_tasks)
+			      bool use_mg_tasks, bool skip_clone_seq)
 {
 	lockdep_assert_held(&css_set_lock);
 
 	if (to_cset && !css_set_populated(to_cset))
 		css_set_update_populated(to_cset, true);
 
+	if (!skip_clone_seq)
+		write_seqcount_begin(&css_set_for_clone_seq);
+
 	if (from_cset) {
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
@@ -949,6 +960,9 @@ static void css_set_move_task(struct task_struct *task,
 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 							     &to_cset->tasks);
 	}
+
+	if (!skip_clone_seq)
+		write_seqcount_end(&css_set_for_clone_seq);
 }
 
 /*
@@ -2723,7 +2737,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 
 		get_css_set(to_cset);
 		to_cset->nr_tasks++;
-		css_set_move_task(task, from_cset, to_cset, true);
+		css_set_move_task(task, from_cset, to_cset, true, false);
 		from_cset->nr_tasks--;
 		/*
 		 * If the source or destination cgroup is frozen,
@@ -4183,7 +4197,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
+	write_seqcount_begin(&css_set_for_clone_seq);
 	cgrp->kill_seq++;
+	write_seqcount_end(&css_set_for_clone_seq);
 	spin_unlock_irq(&css_set_lock);
 
 	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -6696,14 +6712,26 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 
 	cgroup_threadgroup_change_begin(current);
 
-	spin_lock_irq(&css_set_lock);
-	cset = task_css_set(current);
-	get_css_set(cset);
-	if (kargs->cgrp)
-		kargs->kill_seq = kargs->cgrp->kill_seq;
-	else
-		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
-	spin_unlock_irq(&css_set_lock);
+	for (;;) {
+		unsigned seq = raw_read_seqcount_begin(&css_set_for_clone_seq);
+		bool got_ref = false;
+		rcu_read_lock();
+		cset = task_css_set(current);
+		if (kargs->cgrp)
+			kargs->kill_seq = kargs->cgrp->kill_seq;
+		else
+			kargs->kill_seq = cset->dfl_cgrp->kill_seq;
+		if (get_css_set_not_zero(cset))
+			got_ref = true;
+		rcu_read_unlock();
+		if (unlikely(!got_ref || read_seqcount_retry(&css_set_for_clone_seq, seq))) {
+			if (got_ref)
+				put_css_set(cset);
+			cpu_relax();
+			continue;
+		}
+		break;
+	}
 
 	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
 		kargs->cset = cset;
@@ -6907,7 +6935,7 @@ void cgroup_post_fork(struct task_struct *child,
 
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
-		css_set_move_task(child, NULL, cset, false);
+		css_set_move_task(child, NULL, cset, false, true);
 	} else {
 		put_css_set(cset);
 		cset = NULL;
@@ -6995,7 +7023,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
-	css_set_move_task(tsk, cset, NULL, false);
+	css_set_move_task(tsk, cset, NULL, false, true);
 	cset->nr_tasks--;
 	/* matches the signal->live check in css_task_iter_advance() */
 	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
-- 
2.48.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread
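The retry loop added above is the classic seqcount read pattern combined
with a get-not-zero reference grab. For reference, here is a minimal
userspace C sketch of that combination; all names are invented for
illustration, and every atomic deliberately uses the default seq_cst
ordering for simplicity, whereas the kernel's seqcount_spinlock_t and
refcount_inc_not_zero() primitives use weaker, carefully chosen
orderings.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint clone_seq;    /* even: stable, odd: writer in flight */
static atomic_int  obj_refcount; /* 0 means the object is being freed */
static atomic_int  obj_kill_seq; /* stand-in for cgroup::kill_seq */

/* refcount_inc_not_zero() equivalent: never resurrect a dying object. */
static bool ref_get_not_zero(atomic_int *ref)
{
	int old = atomic_load(ref);

	do {
		if (old == 0)
			return false;
	} while (!atomic_compare_exchange_weak(ref, &old, old + 1));
	return true;
}

/* Writer: externally serialized (the kernel holds css_set_lock here). */
static void bump_kill_seq(void)
{
	atomic_fetch_add(&clone_seq, 1); /* odd: readers will retry */
	atomic_fetch_add(&obj_kill_seq, 1);
	atomic_fetch_add(&clone_seq, 1); /* even: stable again */
}

/* Reader: the lockless fast path, shaped like the loop in the patch. */
static int snapshot_kill_seq(void)
{
	for (;;) {
		unsigned seq = atomic_load(&clone_seq);
		int kseq;

		if (seq & 1)
			continue; /* writer in flight, go again */
		kseq = atomic_load(&obj_kill_seq);
		if (!ref_get_not_zero(&obj_refcount))
			continue; /* object dying under us, retry */
		if (atomic_load(&clone_seq) == seq)
			return kseq; /* consistent snapshot, ref held */
		/* A writer raced with us: drop the ref and retry. */
		atomic_fetch_sub(&obj_refcount, 1);
	}
}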
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-01-22 11:29 [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork() Mateusz Guzik
@ 2026-01-27 14:30 ` Mateusz Guzik
  2026-01-27 17:27   ` Michal Koutný
  0 siblings, 1 reply; 11+ messages in thread
From: Mateusz Guzik @ 2026-01-27 14:30 UTC (permalink / raw)
  To: tj, hannes, mkoutny; +Cc: brauner, linux-kernel, cgroups

ping? I need cgroups out of the way for further scalability work in fork + exit.

On Thu, Jan 22, 2026 at 12:29 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> In the stock kernel the css_set_lock is taken three times during the
> thread life cycle, turning it into the primary bottleneck in fork-heavy
> workloads.
>
> [...]

^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-01-27 14:30 ` Mateusz Guzik
@ 2026-01-27 17:27   ` Michal Koutný
  2026-01-27 18:18     ` Mateusz Guzik
  0 siblings, 1 reply; 11+ messages in thread
From: Michal Koutný @ 2026-01-27 17:27 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: tj, hannes, brauner, linux-kernel, cgroups

On Tue, Jan 27, 2026 at 03:30:14PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> ping? I need cgroups out of the way for further scalability work in fork + exit.

I got stuck on the following:
- possible implementation with (simpler?) rwlock,
- effect of css_set_lock in cgroup_post_fork().

I want to try some measurements of the latter since I assume that
limits the effect of the elision in cgroup_css_set_fork(), doesn't it?
(IIUC, you'd see it again if you reduced the pidmap_lock contention.)

Regards,
Michal

^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-01-27 17:27 ` Michal Koutný
@ 2026-01-27 18:18   ` Mateusz Guzik
  2026-01-29 13:22     ` Michal Koutný
  0 siblings, 1 reply; 11+ messages in thread
From: Mateusz Guzik @ 2026-01-27 18:18 UTC (permalink / raw)
  To: Michal Koutný; +Cc: tj, hannes, brauner, linux-kernel, cgroups

On Tue, Jan 27, 2026 at 6:27 PM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Tue, Jan 27, 2026 at 03:30:14PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> > ping? I need cgroups out of the way for further scalability work in fork + exit.
>
> I got stuck on the following:
> - possible implementation with (simpler?) rwlock,
> - effect of css_set_lock in cgroup_post_fork().
>
> I want to try some measurements of the latter since I assume that
> limits the effect of the elision in cgroup_css_set_fork(), doesn't it?
> (IIUC, you'd see it again if you reduced the pidmap_lock contention.)
>

Not sure what you mean here.

If what you mean is merely converting css_set_lock into a rwlock and
read-locking in the spot handled with seq in my patch, that's a no-go
-- frequent reader vs writer transitions kill perf in their own right
and you still have 3 contention spots, i.e., this will remain the
primary bottleneck.

To reiterate, in the kernel as currently found in -next, the top of
the profile is still the pidmap lock. There is a patch to remove most
of the overhead under the lock:
https://lore.kernel.org/linux-fsdevel/20260120-work-pidfs-rhashtable-v2-1-d593c4d0f576@kernel.org/

It may need some tidy ups but for the purpose of this discussion we
can pretend it landed.

With that in place, the top of the profile is the css_set lock. With
*this* patch in place we are back to pidmap, which is then followed by
tasklist.

clone + exit codepaths are globally serialized as follows:
- pidmap -- once per side
- tasklist -- once on clone, twice on exit
- cgroups -- twice on clone, once on exit
- some lock for random harvesting, can probably just go

In principle with enough effort(tm) one can introduce finer-grained
locking for all of these, but I suspect it is not warranted and I'm
not going to do it, especially so for the tasklist lock.

So I very much expect the clone + exit pair will remain globally
serialized; it's a question of the nature of said serialization. I
think a sensible goal is serialization at most once per side (if
possible) and even then with sanitized hold times. The tasklist thing
may be too problematic to only take twice, but even then I can
trivially reduce the hold time.

If 3 spots have to remain, it will be the new top. If I work it down
to two, who knows.

So yes, css very much can be considered a problem here.

This comment:
> - effect of css_set_lock in cgroup_post_fork().
... I don't get at all.

Stability of cgroup placement aside, to my reading the lock is needed
in part to serialize addition of the task to the cgroup list. No
matter what, this will have to be serialized both ways with the same
thing.

Perhaps said stability can be assured in other ways and the list can
be decomposed, but that's some complexity which is not warranted.

^ permalink raw reply	[flat|nested] 11+ messages in thread
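To make the "frequent reader vs writer transitions kill perf" point
concrete: even an uncontended pthread_rwlock_rdlock()/unlock() pair
performs two atomic read-modify-writes on the lock word, so with many
concurrent forkers the lock cacheline keeps bouncing between CPUs,
while a seqcount-style read is only a pair of plain loads that leave
the line in shared state. The userspace microbenchmark sketch below
(build with something like gcc -O2 -pthread) illustrates this; the
thread count, iteration count, and the complete absence of writers are
arbitrary simplifying assumptions, and absolute numbers are
machine-dependent.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS 8
#define ITERS    (1L << 22)

static pthread_rwlock_t rwl = PTHREAD_RWLOCK_INITIALIZER;
static atomic_uint seq;    /* stand-in for css_set_for_clone_seq */
static int data;

/* Read via rwlock: two atomic RMWs on a shared line per iteration. */
static void *rwlock_reader(void *arg)
{
	long sink = 0;

	(void)arg;
	for (long i = 0; i < ITERS; i++) {
		pthread_rwlock_rdlock(&rwl);
		sink += data;
		pthread_rwlock_unlock(&rwl);
	}
	return (void *)sink;
}

/* Seqcount-style read: plain loads, the counter line is never dirtied. */
static void *seq_reader(void *arg)
{
	long sink = 0;

	(void)arg;
	for (long i = 0; i < ITERS; i++) {
		unsigned s;

		do {
			s = atomic_load(&seq);
			sink += data;
		} while (atomic_load(&seq) != s);
	}
	return (void *)sink;
}

static double run(void *(*fn)(void *))
{
	pthread_t tid[NTHREADS];
	struct timespec a, b;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, fn, NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &b);
	return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
}

int main(void)
{
	printf("rwlock readers:   %.2fs\n", run(rwlock_reader));
	printf("seqcount readers: %.2fs\n", run(seq_reader));
	return 0;
}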
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-01-27 18:18 ` Mateusz Guzik
@ 2026-01-29 13:22   ` Michal Koutný
  2026-02-10 10:43     ` Michal Koutný
  0 siblings, 1 reply; 11+ messages in thread
From: Michal Koutný @ 2026-01-29 13:22 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: tj, hannes, brauner, linux-kernel, cgroups

On Tue, Jan 27, 2026 at 07:18:44PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> Not sure what you mean here.

Let me add a patch for illustration. Does that clarify?
(How would that change your watched metric?)

...
> It may need some tidy ups but for the purpose of this discussion we
> can pretend it landed.

That I follow.

> clone + exit codepaths are globally serialized as follows:
> - pidmap -- once per side
> - tasklist -- once on clone, twice on exit
> - cgroups -- twice on clone, once on exit
> - some lock for random harvesting, can probably just go
...
> So yes, css very much can be considered a problem here.

Acknowledged.

> This comment:
> > - effect of css_set_lock in cgroup_post_fork().
> ... I don't get at all.

I meant that css_set_lock is taken a 2nd time in cgroup_post_fork()
(and it remains there also after this rework). And I'm wondering
whether removal only in cgroup_css_set_fork() improves parallelism
because the tasks (before patching) are queued on the first
css_set_lock, serialized through the first critical section and when
they arrive at the second critical section in cgroup_post_fork() their
arrival rate is already reduced because they had to pass through the
first critical section. Hence the 2nd pass through the critical section
should be less contended (w/out waiting).

I understand it's good to reduce the overall hold time of (every
mentioned) lock, but I'm unsure how much eliminating css_set_lock from
two passes to one on the clone path helps.

> Stability of cgroup placement aside, to my reading the lock is needed
> in part to serialize addition of the task to the cgroup list. No
> matter what, this will have to be serialized both ways with the same
> thing.

Yes, the modification of the css_set->tasks list still needs
css_set_lock in post fork.

> Perhaps said stability can be assured in other ways and the list can
> be decomposed, but that's some complexity which is not warranted.

The decomposition is not obvious to me :-/

--- 8< ---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bc892e3b37eea..a176fd60ba08f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -413,11 +413,13 @@ static inline void cgroup_unlock(void)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
+extern rwlock_t css_set_clone_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
 		rcu_read_lock_sched_held() ||				\
 		lockdep_is_held(&cgroup_mutex) ||			\
 		lockdep_is_held(&css_set_lock) ||			\
+		lockdep_is_held(&css_set_clone_lock) ||			\
 		((task)->flags & PF_EXITING) || (__c))
 #else
 #define task_css_set_check(task, __c)					\
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b049102..4e28e922e5668 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -83,15 +83,21 @@
  * css_set_lock protects task->cgroups pointer, the list of css_set
  * objects, and the chain of tasks off each css_set.
  *
+ * css_set_clone_lock synchronizes access to task->cgroups and cgroup->kill_seq
+ * instead of css_set_lock on the clone path.
+ *
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
 DEFINE_MUTEX(cgroup_mutex);
-DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_RWLOCK(css_set_clone_lock);
+
 
 #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
+EXPORT_SYMBOL_GPL(css_set_clone_lock);
 #endif
 
 struct blocking_notifier_head cgroup_lifetime_notifier =
@@ -901,6 +907,10 @@ static void css_set_skip_task_iters(struct css_set *cset,
 		css_task_iter_skip(it, task);
 }
 
+enum css_set_move_flags {
+	CSET_MOVE_USE_MG_TASKS = (1 << 0),
+	CSET_MOVE_SKIP_LOCK = (1 << 1),
+};
 /**
  * css_set_move_task - move a task from one css_set to another
  * @task: task being moved
@@ -918,13 +928,23 @@ static void css_set_skip_task_iters(struct css_set *cset,
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
-			      bool use_mg_tasks)
+			      enum css_set_move_flags flags)
 {
+	/*
+	 * A task cannot migrate and clone concurrently, and disassociation
+	 * doesn't modify the task->cgroups pointer relevant for clone.
+	 */
+	bool skip_clone_lock = task == current || to_cset == NULL;
+	skip_clone_lock |= flags & CSET_MOVE_SKIP_LOCK;
+
 	lockdep_assert_held(&css_set_lock);
 
 	if (to_cset && !css_set_populated(to_cset))
 		css_set_update_populated(to_cset, true);
 
+	if (!skip_clone_lock)
+		write_lock(&css_set_clone_lock);
+
 	if (from_cset) {
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
@@ -946,9 +966,13 @@ static void css_set_move_task(struct task_struct *task,
 		WARN_ON_ONCE(task->flags & PF_EXITING);
 
 		cgroup_move_task(task, to_cset);
-		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
-							     &to_cset->tasks);
+		list_add_tail(&task->cg_list,
+			      (flags & CSET_MOVE_USE_MG_TASKS) ? &to_cset->mg_tasks :
+								 &to_cset->tasks);
 	}
+
+	if (!skip_clone_lock)
+		write_unlock(&css_set_clone_lock);
 }
 
 /*
@@ -2723,7 +2747,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 
 		get_css_set(to_cset);
 		to_cset->nr_tasks++;
-		css_set_move_task(task, from_cset, to_cset, true);
+		css_set_move_task(task, from_cset, to_cset, CSET_MOVE_USE_MG_TASKS);
 		from_cset->nr_tasks--;
 		/*
 		 * If the source or destination cgroup is frozen,
@@ -4183,7 +4207,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
+	write_lock(&css_set_clone_lock);
 	cgrp->kill_seq++;
+	write_unlock(&css_set_clone_lock);
 	spin_unlock_irq(&css_set_lock);
 
 	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -6696,14 +6722,15 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 
 	cgroup_threadgroup_change_begin(current);
 
-	spin_lock_irq(&css_set_lock);
+	read_lock(&css_set_clone_lock);
+
 	cset = task_css_set(current);
 	get_css_set(cset);
 	if (kargs->cgrp)
 		kargs->kill_seq = kargs->cgrp->kill_seq;
 	else
 		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
-	spin_unlock_irq(&css_set_lock);
+	read_unlock(&css_set_clone_lock);
 
 	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
 		kargs->cset = cset;
@@ -6893,6 +6920,7 @@ void cgroup_post_fork(struct task_struct *child,
 
 	cset = kargs->cset;
 	kargs->cset = NULL;
 
+	/* XXX could this be read_lock(&css_set_clone_lock)? */
 	spin_lock_irq(&css_set_lock);
 
 	/* init tasks are special, only link regular threads */
@@ -6907,7 +6935,8 @@ void cgroup_post_fork(struct task_struct *child,
 
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
-		css_set_move_task(child, NULL, cset, false);
+		/* child cannot run (another) clone, skip lock */
+		css_set_move_task(child, NULL, cset, CSET_MOVE_SKIP_LOCK);
 	} else {
 		put_css_set(cset);
 		cset = NULL;
@@ -6995,7 +7024,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
-	css_set_move_task(tsk, cset, NULL, false);
+	css_set_move_task(tsk, cset, NULL, CSET_MOVE_SKIP_LOCK);
 	cset->nr_tasks--;
 	/* matches the signal->live check in css_task_iter_advance() */
 	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))

^ permalink raw reply related	[flat|nested] 11+ messages in thread
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-01-29 13:22 ` Michal Koutný
@ 2026-02-10 10:43   ` Michal Koutný
  2026-02-10 11:19     ` Mateusz Guzik
  0 siblings, 1 reply; 11+ messages in thread
From: Michal Koutný @ 2026-02-10 10:43 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: tj, hannes, brauner, linux-kernel, cgroups

Hello Mateusz.

On Thu, Jan 29, 2026 at 02:22:32PM +0100, Michal Koutný <mkoutny@suse.com> wrote:
> And I'm wondering whether removal only in cgroup_css_set_fork() improves
> parallelism because the tasks (before patching) are queued on the first
> css_set_lock, serialized through the first critical section and when
> they arrive at the second critical section in cgroup_post_fork() their
> arrival rate is already reduced because they had to pass through the
> first critical section. Hence the 2nd pass through the critical section
> should be less contended (w/out waiting).

I was still curious about this, so I tried my own measurement.
I ran your cloning will-it-scale testcase [1]. Basically it was

	clone_processes -s 1000 -t 40

on a 40 CPUs/80 SMTs machine. I periodically watched the `total:`
iteration counts reported by wis.

6.18.8-0-default (baseline := stable + pidmap patches [2][3])
	2.9383e+05 ± 1135.5

6.18.8-1.g886f4c4-default (baseline + rwlock impl (previous message))
	2.9363e+05 ± 1219.8

6.18.8-1.gb21e8f8-default (baseline + seqcount impl (your patch))
	2.9147e+05 ± 1125.6

So I could not reproduce any non-random change with this css_set_lock
split (I consider even the apparent difference between implementations
rather random).

At this point, I should look into profiles to see whether the
bottleneck really is css_set_lock in cgroup_post_fork(), but I'm
sharing what I have -- glad for any insights.

Regards,
Michal

[1] Only the clone_processes variant; clone_threads randomly hung.
    will-it-scale/glibc (2.42-3.1) likely doesn't work well with the
    cancellation/(no) join (but I got hangs even with pthread cleanup
    handlers that joined the child thread)

    #0  futex_wait (futex_word=0x7ffff7ffd840 <_rtld_local+2112>, expected=2, private=0) at ../sysdeps/nptl/futex-internal.h:146
    #1  __GI___lll_lock_wait_private (futex=0x7ffff7ffd840 <_rtld_local+2112>) at lowlevellock.c:34
    #2  0x00007ffff7c98d69 in __GI___nptl_deallocate_stack (pd=0x7ffff7ab16c0) at nptl-stack.c:113
    ...
    #5  0x00000000004029ca in kill_tasks () at main.c:151

[2] https://lore.kernel.org/linux-mm/20251206131955.780557-1-mjguzik@gmail.com/
[3] Those patches improved the metric by about 10% (but I haven't
    measured that difference as thoroughly).

^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-02-10 10:43 ` Michal Koutný
@ 2026-02-10 11:19   ` Mateusz Guzik
  2026-02-10 16:55     ` Michal Koutný
  0 siblings, 1 reply; 11+ messages in thread
From: Mateusz Guzik @ 2026-02-10 11:19 UTC (permalink / raw)
  To: Michal Koutný; +Cc: tj, hannes, brauner, linux-kernel, cgroups

On Tue, Feb 10, 2026 at 11:43 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> Hello Mateusz.
>

Ouch, terribly sorry for the "hurry up and wait" -- real life suddenly
got in the way and I have not looked into this since.

> On Thu, Jan 29, 2026 at 02:22:32PM +0100, Michal Koutný <mkoutny@suse.com> wrote:
> > And I'm wondering whether removal only in cgroup_css_set_fork() improves
> > parallelism because the tasks (before patching) are queued on the first
> > css_set_lock, serialized through the first critical section and when
> > they arrive at the second critical section in cgroup_post_fork() their
> > arrival rate is already reduced because they had to pass through the
> > first critical section. Hence the 2nd pass through the critical section
> > should be less contended (w/out waiting).
>

It improves parallelism because the total hold time goes down.

First, there is a little less work to do with the lock in the first
place, even absent any contention.

Second, there is less total overhead in terms of bouncing the lock and
the cachelines used by the code protected by it. Note that any
contention means the bouncing already happens.

You can see the second effect in my patch, which does not reduce the
amount of work per se, but merely avoids a case where someone is
halfway through alloc_pid and has to wait.

Ignoring some single-threaded overhead from the atomics in the rwlock,
I very much expect scalability to be about the same as with the
seqlock, but only because of the bottlenecks elsewhere. While I don't
understand why you would go for a rwlock here, I'm not going to
protest -- it still moves the css lock out of the picture.

> I was still curious about this, so I tried my own measurement.
> [...]
> So I could not reproduce any non-random change with this css_set_lock
> split (I consider even the apparent difference between implementations
> rather random).

This is going to depend on the scale you test on. I was testing on
south of 32. But I also got a minuscule win from removing the css_set
lock, as for me everything instead shifted to tasklist.

Per my other e-mail, the tasklist lock retains the terrible 3-times
locking and rather expensive work is done while holding it. It is
plausible it happens to be at the top at that scale, but that's only
an argument for fixing it. Even if you don't see the css thing at the
top at the moment, it will be there once someone(tm) sorts out the
tasklist problem.

> At this point, I should look into profiles to see whether the
> bottleneck really is css_set_lock in cgroup_post_fork(), but I'm
> sharing what I have -- glad for any insights.
>
> [...]

^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-02-10 11:19 ` Mateusz Guzik
@ 2026-02-10 16:55   ` Michal Koutný
  2026-02-10 17:33     ` Mateusz Guzik
  0 siblings, 1 reply; 11+ messages in thread
From: Michal Koutný @ 2026-02-10 16:55 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: tj, hannes, brauner, linux-kernel, cgroups

On Tue, Feb 10, 2026 at 12:19:27PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> This is going to depend on the scale you test on. I was testing on
> south of 32. But I also got a minuscule win from removing the css_set
> lock, as for me everything instead shifted to tasklist.

To be on the same page -- that means you have nr_cpus >= 32?

> Per my other e-mail, the tasklist lock retains the terrible 3-times
> locking and rather expensive work is done while holding it. It is
> plausible it happens to be at the top at that scale, but that's only
> an argument for fixing it. Even if you don't see the css thing at the
> top at the moment, it will be there once someone(tm) sorts out the
> tasklist problem.

I did a quick test (with 6.18.8-1.g886f4c4-default), first `perf top`
while will-it-scale was running:

  74.23%  [kernel]  [k] native_queued_spin_lock_slowpath
   6.91%  [kernel]  [k] intel_idle_irq
   0.87%  [kernel]  [k] update_sd_lb_stats.constprop.0
   0.68%  [kernel]  [k] _raw_spin_lock
   0.63%  [kernel]  [k] clear_page_erms
   0.56%  [kernel]  [k] sched_balance_find_dst_group
   0.40%  [kernel]  [k] alloc_vmap_area

and then bpftrace for the waiters:

$ bpftrace -e 'kprobe:native_queued_spin_lock_slowpath {@[arg0]=count();}
  END {for($kv : @) {printf("%s\t%d\n", ksym($kv.0), (int64)$kv.1);} clear(@); }'\
  >bpftrace.out
$ sort -k2 -r -n bpftrace.out | head | column -t
pidmap_lock         10482583
nft_pcpu_tun_ctx    3693517
css_set_lock        1511164
input_pool          976252
tasklist_lock       798578
nft_pcpu_tun_ctx    481962
0xffff8abc3ffd55b0  95371
0xffff8a6d3ffd65b0  93686
0xffff8a5e218f0840  29501
0xffff8a5e451dca40  29421

or measured by cumulative waiting time:

$ bpftrace -e 'kprobe:native_queued_spin_lock_slowpath {@[cpu]=arg0; @st[cpu]=nsecs;}
  kretprobe:native_queued_spin_lock_slowpath /@[cpu]/ {$lat=nsecs-@st[cpu]; @lats[@[cpu]]=sum($lat);}
  END {for($kv : @lats) {printf("%s\t%d\n", ksym($kv.0), (int64)$kv.1);} clear(@lats); clear(@st); clear(@) }'\
  >bpftrace2.out
$ sort -k2 -r -n bpftrace2.out | head -n15 | column -t
pidmap_lock         1931209805
rcu_state           1823286316
rcu_state           1581455156
rcu_state           1328804835
rcu_state           1299517157
rcu_state           1134101627
nft_pcpu_tun_ctx    1027837665
0xffff8abc3ffd55b0  861441978
0xffff8a6d3ffd65b0  850732998
css_set_lock        520009479
input_pool          316598763
tasklist_lock       127161061
0xffff8aac40023200  32380418
0xffff8a5e002ab600  30194951
rcu_state           18334578

Hm, it's interesting -- that is suggestive of why I saw no big change
with css_set_lock in my setup.

Michal

^ permalink raw reply	[flat|nested] 11+ messages in thread
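A note on the methodology above: keying the map on arg0 identifies
which lock is contended but not where it is taken, and
native_queued_spin_lock_slowpath only fires for queued spinlocks, so
sleeping locks and rwlock fast paths stay invisible. Assuming a
similar bpftrace vintage, adding the kernel stack to the map key
splits the counts per call site, and a new enough perf (roughly v6.0+,
where `perf lock contention` exists) can cross-check without a custom
script:

$ bpftrace -e 'kprobe:native_queued_spin_lock_slowpath { @[arg0, kstack(5)] = count(); }'
$ perf lock contention -a -b sleep 10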
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-02-10 16:55 ` Michal Koutný
@ 2026-02-10 17:33   ` Mateusz Guzik
  2026-03-11 14:41     ` Mateusz Guzik
  0 siblings, 1 reply; 11+ messages in thread
From: Mateusz Guzik @ 2026-02-10 17:33 UTC (permalink / raw)
  To: Michal Koutný; +Cc: tj, hannes, brauner, linux-kernel, cgroups

On Tue, Feb 10, 2026 at 5:55 PM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Tue, Feb 10, 2026 at 12:19:27PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> > This is going to depend on the scale you test on. I was testing on
> > south of 32. But I also got a minuscule win from removing the css_set
> > lock, as for me everything instead shifted to tasklist.
>
> To be on the same page -- that means you have nr_cpus >= 32?
>

south means less

> I did a quick test (with 6.18.8-1.g886f4c4-default), first `perf top`
> while will-it-scale was running:

I don't know what this hash corresponds to.

> 74.23% [kernel] [k] native_queued_spin_lock_slowpath
> [...]

If the only thing you applied is the patchset over at
https://lore.kernel.org/linux-mm/20251206131955.780557-1-mjguzik@gmail.com/
then this lines up with my own measurements, where I said the pidmap
lock remains dominant.

That thing gets unclogged with a patch by Christian to move pidmap
handling out, which can be found here:
https://lore.kernel.org/all/20260120-work-pidfs-rhashtable-v2-1-d593c4d0f576@kernel.org/

Afterwards it is css_set_lock at the top of the profile.

> Hm, it's interesting -- that is suggestive of why I saw no big change
> with css_set_lock in my setup.
>

Regardless of the above, I noted sorting out this lock does not
meaningfully improve performance; it merely shifts contention to
tasklist afterwards.

>
> Michal

^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-02-10 17:33 ` Mateusz Guzik
@ 2026-03-11 14:41   ` Mateusz Guzik
  2026-03-11 17:53     ` Michal Koutný
  0 siblings, 1 reply; 11+ messages in thread
From: Mateusz Guzik @ 2026-03-11 14:41 UTC (permalink / raw)
  To: Michal Koutný; +Cc: tj, hannes, brauner, linux-kernel, cgroups

So I booted up a VM with 80 hw threads and the cgroup lock is still at
the top of the profile for me when rolling with
./threadspawn1_processes -t 80

While I prefer my patch on the grounds that it reduces overhead to
begin with (fewer locking trips), I won't argue against yours. My
primary goal here is to get cgroups out of the way.

Or to put it differently: can you either ack my patch or push yours?

On Tue, Feb 10, 2026 at 6:33 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> [...]

^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
  2026-03-11 14:41 ` Mateusz Guzik
@ 2026-03-11 17:53   ` Michal Koutný
  0 siblings, 0 replies; 11+ messages in thread
From: Michal Koutný @ 2026-03-11 17:53 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: tj, hannes, brauner, linux-kernel, cgroups

On Wed, Mar 11, 2026 at 03:41:56PM +0100, Mateusz Guzik <mjguzik@gmail.com> wrote:
> So I booted up a VM with 80 hw threads and the cgroup lock is still at
> the top of the profile for me when rolling with
> ./threadspawn1_processes -t 80
>
> While I prefer my patch on the grounds that it reduces overhead to
> begin with (fewer locking trips), I won't argue against yours. My
> primary goal here is to get cgroups out of the way.

I filed this under "there are still other locks above css_set_lock".
Has this changed with current mainline?

Furthermore, there's still:
a) css_set_lock in post_fork and
b) the tasklist lock, which is a much harder problem.

> Or to put it differently: can you either ack my patch or push yours?

Without a convincing measurement, I'd say either makes synchronization
more complex for nothing. What contention numbers do you see before and
after the patch (on what base)?

(Sorry, I dismantled my measuring environment in the meantime (and I
wouldn't know what other non-mainline patches I would need).)

Thanks,
Michal

^ permalink raw reply	[flat|nested] 11+ messages in thread
end of thread, other threads: [~2026-03-11 17:53 UTC | newest]

Thread overview: 11+ messages:
2026-01-22 11:29 [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork() Mateusz Guzik
2026-01-27 14:30 ` Mateusz Guzik
2026-01-27 17:27   ` Michal Koutný
2026-01-27 18:18     ` Mateusz Guzik
2026-01-29 13:22       ` Michal Koutný
2026-02-10 10:43         ` Michal Koutný
2026-02-10 11:19           ` Mateusz Guzik
2026-02-10 16:55             ` Michal Koutný
2026-02-10 17:33               ` Mateusz Guzik
2026-03-11 14:41                 ` Mateusz Guzik
2026-03-11 17:53                   ` Michal Koutný