From: Thomas Gleixner <tglx@kernel.org>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Matthieu Baerts <matttbe@kernel.org>,
Jiri Slaby <jirislaby@kernel.org>
Subject: [patch 4/4] sched/mmcid: Avoid full tasklist walks
Date: Tue, 10 Mar 2026 21:29:09 +0100 [thread overview]
Message-ID: <20260310202526.183824481@kernel.org> (raw)
In-Reply-To: <20260310201009.257617049@kernel.org>
Chasing vfork()'ed tasks on a CID ownership mode switch requires a full
task list walk, which is obviously expensive on large systems.
Avoid that by keeping a list of the tasks which use a MM's MMCID entity in
mm::mm_cid and walking this list instead. This removes the counting logic,
which has proven to be flaky, and avoids a full task list walk in the case
of vfork()'ed tasks.
Fixes: fbd0e71dc370 ("sched/mmcid: Provide CID ownership mode fixup functions")
Signed-off-by: Thomas Gleixner <tglx@kernel.org>
---
include/linux/rseq_types.h | 6 ++++-
kernel/fork.c | 1
kernel/sched/core.c | 54 ++++++++++-----------------------------------
3 files changed, 18 insertions(+), 43 deletions(-)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -133,10 +133,12 @@ struct rseq_data { };
* @active: MM CID is active for the task
* @cid: The CID associated to the task either permanently or
* borrowed from the CPU
+ * @node: Queued in the per MM MMCID list
*/
struct sched_mm_cid {
unsigned int active;
unsigned int cid;
+ struct hlist_node node;
};
/**
@@ -157,6 +159,7 @@ struct mm_cid_pcpu {
* @work: Regular work to handle the affinity mode change case
* @lock: Spinlock to protect against affinity setting which can't take @mutex
* @mutex: Mutex to serialize forks and exits related to this mm
+ * @user_list: List of the MM CID users of a MM
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
* is growth only.
* @users: The number of tasks sharing this MM. Separate from mm::mm_users
@@ -177,13 +180,14 @@ struct mm_mm_cid {
raw_spinlock_t lock;
struct mutex mutex;
+ struct hlist_head user_list;
/* Low frequency modified */
unsigned int nr_cpus_allowed;
unsigned int users;
unsigned int pcpu_thrs;
unsigned int update_deferred;
-}____cacheline_aligned_in_smp;
+} ____cacheline_aligned;
#else /* CONFIG_SCHED_MM_CID */
struct mm_mm_cid { };
struct sched_mm_cid { };
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1000,6 +1000,7 @@ static struct task_struct *dup_task_stru
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid.cid = MM_CID_UNSET;
tsk->mm_cid.active = 0;
+ INIT_HLIST_NODE(&tsk->mm_cid.node);
#endif
return tsk;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10621,13 +10621,10 @@ static inline void mm_cid_transit_to_cpu
}
}
-static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
{
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
guard(task_rq_lock)(t);
- /* If the task is not active it is not in the users count */
- if (!t->mm_cid.active)
- return false;
if (cid_on_task(t->mm_cid.cid)) {
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
@@ -10635,51 +10632,21 @@ static bool mm_cid_fixup_task_to_cpu(str
else
mm_unset_cid_on_task(t);
}
- return true;
}
-static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
+static void mm_cid_fixup_tasks_to_cpus(void)
{
- struct task_struct *p, *t;
- unsigned int users;
-
- /*
- * This can obviously race with a concurrent affinity change, which
- * increases the number of allowed CPUs for this mm, but that does
- * not affect the mode and only changes the CID constraints. A
- * possible switch back to per task mode happens either in the
- * deferred handler function or in the next fork()/exit().
- *
- * The caller has already transferred so remove it from the users
- * count. The incoming task is already visible and has mm_cid.active,
- * but has task::mm_cid::cid == UNSET. Still it needs to be accounted
- * for. Concurrent fork()s might add more threads, but all of them have
- * task::mm_cid::active = 0, so they don't affect the accounting here.
- */
- users = mm->mm_cid.users - 1;
-
- guard(rcu)();
- for_other_threads(current, t) {
- if (mm_cid_fixup_task_to_cpu(t, mm))
- users--;
- }
+ struct mm_struct *mm = current->mm;
+ struct task_struct *t;
- if (!users)
- return;
+ lockdep_assert_held(&mm->mm_cid.mutex);
- /* Happens only for VM_CLONE processes. */
- for_each_process_thread(p, t) {
- if (t == current || t->mm != mm)
- continue;
- mm_cid_fixup_task_to_cpu(t, mm);
+ hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
+ /* Current has already transferred before invoking the fixup. */
+ if (t != current)
+ mm_cid_fixup_task_to_cpu(t, mm);
}
-}
-
-static void mm_cid_fixup_tasks_to_cpus(void)
-{
- struct mm_struct *mm = current->mm;
- mm_cid_do_fixup_tasks_to_cpus(mm);
mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
@@ -10688,6 +10655,7 @@ static bool sched_mm_cid_add_user(struct
lockdep_assert_held(&mm->mm_cid.lock);
t->mm_cid.active = 1;
+ hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
mm->mm_cid.users++;
return mm_update_max_cids(mm);
}
@@ -10745,6 +10713,7 @@ static bool sched_mm_cid_remove_user(str
/* Clear the transition bit */
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
mm_unset_cid_on_task(t);
+ hlist_del_init(&t->mm_cid.node);
t->mm->mm_cid.users--;
return mm_update_max_cids(t->mm);
}
@@ -10887,6 +10856,7 @@ void mm_init_cid(struct mm_struct *mm, s
mutex_init(&mm->mm_cid.mutex);
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ INIT_HLIST_HEAD(&mm->mm_cid.user_list);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
}
next prev parent reply other threads:[~2026-03-10 20:29 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-10 20:28 [patch 0/4] sched/mmcid: Cure fork()/vfork() related problems Thomas Gleixner
2026-03-10 20:28 ` [patch 1/4] sched/mmcid: Prevent CID stalls due to concurrent forks Thomas Gleixner
2026-03-11 7:33 ` Jiri Slaby
2026-03-11 7:49 ` Peter Zijlstra
2026-03-11 11:04 ` [tip: sched/urgent] " tip-bot2 for Thomas Gleixner
2026-03-10 20:28 ` [patch 2/4] sched/mmcid: Handle vfork()/CLONE_VM correctly Thomas Gleixner
2026-03-11 11:04 ` [tip: sched/urgent] " tip-bot2 for Thomas Gleixner
2026-03-10 20:29 ` [patch 3/4] sched/mmcid: Remove pointless preempt guard Thomas Gleixner
2026-03-11 11:04 ` [tip: sched/urgent] " tip-bot2 for Thomas Gleixner
2026-03-10 20:29 ` Thomas Gleixner [this message]
2026-03-11 11:04 ` [tip: sched/urgent] sched/mmcid: Avoid full tasklist walks tip-bot2 for Thomas Gleixner
2026-03-11 9:43 ` [patch 0/4] sched/mmcid: Cure fork()/vfork() related problems Matthieu Baerts
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260310202526.183824481@kernel.org \
--to=tglx@kernel.org \
--cc=jirislaby@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=matttbe@kernel.org \
--cc=peterz@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox