public inbox for cgroups@vger.kernel.org
 help / color / mirror / Atom feed
From: Mateusz Guzik <mjguzik@gmail.com>
To: tj@kernel.org, hannes@cmpxchg.org, mkoutny@suse.com
Cc: brauner@kernel.org, linux-kernel@vger.kernel.org,
	cgroups@vger.kernel.org, Mateusz Guzik <mjguzik@gmail.com>
Subject: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
Date: Thu, 22 Jan 2026 12:29:51 +0100	[thread overview]
Message-ID: <20260122112951.1854124-1-mjguzik@gmail.com> (raw)

In the stock kernel, css_set_lock is taken three times during a thread's
life cycle, turning it into the primary bottleneck in fork-heavy
workloads.

The acquisition in preparation for clone can be avoided with a sequence
counter, which in turn pushes the lock down.

This accounts for only a 6% speedup when creating threads in parallel on
20 cores, as most of the contention shifts to pidmap_lock.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

v2:
- change comment about clone_seq
- raw_write_seqcount* -> write_seqcount
- just loop on failed seq check
- don't bump it on task exit

 kernel/cgroup/cgroup-internal.h | 11 +++++--
 kernel/cgroup/cgroup.c          | 54 +++++++++++++++++++++++++--------
 2 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 22051b4f1ccb..04a3aadcbc7f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -194,6 +194,9 @@ static inline bool notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
+/*
+ * refcounted get/put for css_set objects
+ */
 void put_css_set_locked(struct css_set *cset);
 
 static inline void put_css_set(struct css_set *cset)
@@ -213,14 +216,16 @@ static inline void put_css_set(struct css_set *cset)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
-/*
- * refcounted get/put for css_set objects
- */
 static inline void get_css_set(struct css_set *cset)
 {
 	refcount_inc(&cset->refcount);
 }
 
+static inline bool get_css_set_not_zero(struct css_set *cset)
+{
+	return refcount_inc_not_zero(&cset->refcount);
+}
+
 bool cgroup_ssid_enabled(int ssid);
 bool cgroup_on_dfl(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 94788bd1fdf0..0053582b9b56 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -87,7 +87,14 @@
  * cgroup.h can use them for lockdep annotations.
  */
 DEFINE_MUTEX(cgroup_mutex);
-DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
+
+/*
+ * css_set_for_clone_seq synchronizes access to task_struct::cgroup
+ * and cgroup::kill_seq used on clone path
+ */
+static __cacheline_aligned seqcount_spinlock_t css_set_for_clone_seq =
+	SEQCNT_SPINLOCK_ZERO(css_set_for_clone_seq, &css_set_lock);
 
 #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
@@ -907,6 +914,7 @@ static void css_set_skip_task_iters(struct css_set *cset,
  * @from_cset: css_set @task currently belongs to (may be NULL)
  * @to_cset: new css_set @task is being moved to (may be NULL)
  * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ * @skip_clone_seq: don't bump css_set_for_clone_seq
  *
  * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
  * css_set, @from_cset can be NULL.  If @task is being disassociated
@@ -918,13 +926,16 @@ static void css_set_skip_task_iters(struct css_set *cset,
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
-			      bool use_mg_tasks)
+			      bool use_mg_tasks, bool skip_clone_seq)
 {
 	lockdep_assert_held(&css_set_lock);
 
 	if (to_cset && !css_set_populated(to_cset))
 		css_set_update_populated(to_cset, true);
 
+	if (!skip_clone_seq)
+		write_seqcount_begin(&css_set_for_clone_seq);
+
 	if (from_cset) {
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
@@ -949,6 +960,9 @@ static void css_set_move_task(struct task_struct *task,
 		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 							     &to_cset->tasks);
 	}
+
+	if (!skip_clone_seq)
+		write_seqcount_end(&css_set_for_clone_seq);
 }
 
 /*
@@ -2723,7 +2737,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 
 			get_css_set(to_cset);
 			to_cset->nr_tasks++;
-			css_set_move_task(task, from_cset, to_cset, true);
+			css_set_move_task(task, from_cset, to_cset, true, false);
 			from_cset->nr_tasks--;
 			/*
 			 * If the source or destination cgroup is frozen,
@@ -4183,7 +4197,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
+	write_seqcount_begin(&css_set_for_clone_seq);
 	cgrp->kill_seq++;
+	write_seqcount_end(&css_set_for_clone_seq);
 	spin_unlock_irq(&css_set_lock);
 
 	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -6696,14 +6712,26 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 
 	cgroup_threadgroup_change_begin(current);
 
-	spin_lock_irq(&css_set_lock);
-	cset = task_css_set(current);
-	get_css_set(cset);
-	if (kargs->cgrp)
-		kargs->kill_seq = kargs->cgrp->kill_seq;
-	else
-		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
-	spin_unlock_irq(&css_set_lock);
+	for (;;) {
+		unsigned seq = raw_read_seqcount_begin(&css_set_for_clone_seq);
+		bool got_ref = false;
+		rcu_read_lock();
+		cset = task_css_set(current);
+		if (kargs->cgrp)
+			kargs->kill_seq = kargs->cgrp->kill_seq;
+		else
+			kargs->kill_seq = cset->dfl_cgrp->kill_seq;
+		if (get_css_set_not_zero(cset))
+			got_ref = true;
+		rcu_read_unlock();
+		if (unlikely(!got_ref || read_seqcount_retry(&css_set_for_clone_seq, seq))) {
+			if (got_ref)
+				put_css_set(cset);
+			cpu_relax();
+			continue;
+		}
+		break;
+	}
 
 	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
 		kargs->cset = cset;
@@ -6907,7 +6935,7 @@ void cgroup_post_fork(struct task_struct *child,
 
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
-		css_set_move_task(child, NULL, cset, false);
+		css_set_move_task(child, NULL, cset, false, true);
 	} else {
 		put_css_set(cset);
 		cset = NULL;
@@ -6995,7 +7023,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
-	css_set_move_task(tsk, cset, NULL, false);
+	css_set_move_task(tsk, cset, NULL, false, true);
 	cset->nr_tasks--;
 	/* matches the signal->live check in css_task_iter_advance() */
 	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
-- 
2.48.1


             reply	other threads:[~2026-01-22 11:30 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-22 11:29 Mateusz Guzik [this message]
2026-01-27 14:30 ` [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork() Mateusz Guzik
2026-01-27 17:27   ` Michal Koutný
2026-01-27 18:18     ` Mateusz Guzik
2026-01-29 13:22       ` Michal Koutný
2026-02-10 10:43         ` Michal Koutný
2026-02-10 11:19           ` Mateusz Guzik
2026-02-10 16:55             ` Michal Koutný
2026-02-10 17:33               ` Mateusz Guzik
2026-03-11 14:41                 ` Mateusz Guzik
2026-03-11 17:53                   ` Michal Koutný

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260122112951.1854124-1-mjguzik@gmail.com \
    --to=mjguzik@gmail.com \
    --cc=brauner@kernel.org \
    --cc=cgroups@vger.kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mkoutny@suse.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox