* [patch 01/19] sched/mmcid: Revert the complex CID management
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 02/19] sched/mmcid: Use proper data structures Thomas Gleixner
` (19 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
The CID management is a complex beast, which affects both scheduling and
task migration. The compaction mechanism forces random tasks of a process
into task work on exit to user space, causing latency spikes.
Revert to the initial simple bitmap allocation mechanics, which are
known to have scalability issues, as that allows the replacement
functionality to be built up gradually in a reviewable way.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/mm_types.h | 53 ----
kernel/fork.c | 5
kernel/sched/core.c | 514 +----------------------------------------------
kernel/sched/sched.h | 291 +++-----------------------
4 files changed, 64 insertions(+), 799 deletions(-)
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/87tt0k3oks.ffs@tglx
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -924,13 +924,9 @@ struct vm_area_struct {
#define vma_policy(vma) NULL
#endif
-#ifdef CONFIG_SCHED_MM_CID
struct mm_cid {
- u64 time;
- int cid;
- int recent_cid;
+ unsigned int cid;
};
-#endif
struct kioctx_table;
struct iommu_mm_data;
@@ -993,12 +989,6 @@ struct mm_struct {
* runqueue locks.
*/
struct mm_cid __percpu *pcpu_cid;
- /*
- * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
- *
- * When the next mm_cid scan is due (in jiffies).
- */
- unsigned long mm_cid_next_scan;
/**
* @nr_cpus_allowed: Number of CPUs allowed for mm.
*
@@ -1007,14 +997,6 @@ struct mm_struct {
*/
unsigned int nr_cpus_allowed;
/**
- * @max_nr_cid: Maximum number of allowed concurrency
- * IDs allocated.
- *
- * Track the highest number of allowed concurrency IDs
- * allocated for the mm.
- */
- atomic_t max_nr_cid;
- /**
* @cpus_allowed_lock: Lock protecting mm cpus_allowed.
*
* Provide mutual exclusion for mm cpus_allowed and
@@ -1325,35 +1307,7 @@ static inline void vma_iter_init(struct
#ifdef CONFIG_SCHED_MM_CID
-enum mm_cid_state {
- MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
- MM_CID_LAZY_PUT = (1U << 31),
-};
-
-static inline bool mm_cid_is_unset(int cid)
-{
- return cid == MM_CID_UNSET;
-}
-
-static inline bool mm_cid_is_lazy_put(int cid)
-{
- return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
-}
-
-static inline bool mm_cid_is_valid(int cid)
-{
- return !(cid & MM_CID_LAZY_PUT);
-}
-
-static inline int mm_cid_set_lazy_put(int cid)
-{
- return cid | MM_CID_LAZY_PUT;
-}
-
-static inline int mm_cid_clear_lazy_put(int cid)
-{
- return cid & ~MM_CID_LAZY_PUT;
-}
+#define MM_CID_UNSET (~0U)
/*
* mm_cpus_allowed: Union of all mm's threads allowed CPUs.
@@ -1386,11 +1340,8 @@ static inline void mm_init_cid(struct mm
struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
pcpu_cid->cid = MM_CID_UNSET;
- pcpu_cid->recent_cid = MM_CID_UNSET;
- pcpu_cid->time = 0;
}
mm->nr_cpus_allowed = p->nr_cpus_allowed;
- atomic_set(&mm->max_nr_cid, 0);
raw_spin_lock_init(&mm->cpus_allowed_lock);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
cpumask_clear(mm_cidmask(mm));
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -955,10 +955,9 @@ static struct task_struct *dup_task_stru
#endif
#ifdef CONFIG_SCHED_MM_CID
- tsk->mm_cid = -1;
- tsk->last_mm_cid = -1;
+ tsk->mm_cid = MM_CID_UNSET;
+ tsk->last_mm_cid = MM_CID_UNSET;
tsk->mm_cid_active = 0;
- tsk->migrate_from_cpu = -1;
#endif
return tsk;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2126,8 +2126,6 @@ void activate_task(struct rq *rq, struct
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
- if (flags & ENQUEUE_MIGRATED)
- sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
@@ -3364,7 +3362,6 @@ void set_task_cpu(struct task_struct *p,
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -5344,8 +5341,7 @@ context_switch(struct rq *rq, struct tas
}
}
- /* switch_mm_cid() requires the memory barriers above. */
- switch_mm_cid(rq, prev, next);
+ switch_mm_cid(prev, next);
/*
* Tell rseq that the task was scheduled in. Must be after
@@ -5636,7 +5632,6 @@ void sched_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- task_tick_mm_cid(rq, donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -10408,522 +10403,47 @@ void call_trace_sched_update_nr_running(
}
#ifdef CONFIG_SCHED_MM_CID
-
/*
- * @cid_lock: Guarantee forward-progress of cid allocation.
- *
- * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
- * is only used when contention is detected by the lock-free allocation so
- * forward progress can be guaranteed.
- */
-DEFINE_RAW_SPINLOCK(cid_lock);
-
-/*
- * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
- *
- * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
- * detected, it is set to 1 to ensure that all newly coming allocations are
- * serialized by @cid_lock until the allocation which detected contention
- * completes and sets @use_cid_lock back to 0. This guarantees forward progress
- * of a cid allocation.
- */
-int use_cid_lock;
-
-/*
- * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
- * concurrently with respect to the execution of the source runqueue context
- * switch.
- *
- * There is one basic properties we want to guarantee here:
- *
- * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
- * used by a task. That would lead to concurrent allocation of the cid and
- * userspace corruption.
- *
- * Provide this guarantee by introducing a Dekker memory ordering to guarantee
- * that a pair of loads observe at least one of a pair of stores, which can be
- * shown as:
- *
- * X = Y = 0
- *
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
- *
- * Which guarantees that x==0 && y==0 is impossible. But rather than using
- * values 0 and 1, this algorithm cares about specific state transitions of the
- * runqueue current task (as updated by the scheduler context switch), and the
- * per-mm/cpu cid value.
- *
- * Let's introduce task (Y) which has task->mm == mm and task (N) which has
- * task->mm != mm for the rest of the discussion. There are two scheduler state
- * transitions on context switch we care about:
- *
- * (TSA) Store to rq->curr with transition from (N) to (Y)
- *
- * (TSB) Store to rq->curr with transition from (Y) to (N)
- *
- * On the remote-clear side, there is one transition we care about:
- *
- * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
- *
- * There is also a transition to UNSET state which can be performed from all
- * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
- * guarantees that only a single thread will succeed:
- *
- * (TMB) cmpxchg to *pcpu_cid to mark UNSET
- *
- * Just to be clear, what we do _not_ want to happen is a transition to UNSET
- * when a thread is actively using the cid (property (1)).
- *
- * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
- *
- * Scenario A) (TSA)+(TMA) (from next task perspective)
- *
- * CPU0 CPU1
- *
- * Context switch CS-1 Remote-clear
- * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
- * (implied barrier after cmpxchg)
- * - switch_mm_cid()
- * - memory barrier (see switch_mm_cid()
- * comment explaining how this barrier
- * is combined with other scheduler
- * barriers)
- * - mm_cid_get (next)
- * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
- *
- * This Dekker ensures that either task (Y) is observed by the
- * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
- * observed.
- *
- * If task (Y) store is observed by rcu_dereference(), it means that there is
- * still an active task on the cpu. Remote-clear will therefore not transition
- * to UNSET, which fulfills property (1).
- *
- * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
- * it will move its state to UNSET, which clears the percpu cid perhaps
- * uselessly (which is not an issue for correctness). Because task (Y) is not
- * observed, CPU1 can move ahead to set the state to UNSET. Because moving
- * state to UNSET is done with a cmpxchg expecting that the old state has the
- * LAZY flag set, only one thread will successfully UNSET.
- *
- * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
- * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
- * CPU1 will observe task (Y) and do nothing more, which is fine.
- *
- * What we are effectively preventing with this Dekker is a scenario where
- * neither LAZY flag nor store (Y) are observed, which would fail property (1)
- * because this would UNSET a cid which is actively used.
+ * When a task exits, the MM CID held by the task is no longer required as
+ * the task cannot return to user space.
*/
-
-void sched_mm_cid_migrate_from(struct task_struct *t)
-{
- t->migrate_from_cpu = task_cpu(t);
-}
-
-static
-int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid)
-{
- struct mm_struct *mm = t->mm;
- struct task_struct *src_task;
- int src_cid, last_mm_cid;
-
- if (!mm)
- return -1;
-
- last_mm_cid = t->last_mm_cid;
- /*
- * If the migrated task has no last cid, or if the current
- * task on src rq uses the cid, it means the source cid does not need
- * to be moved to the destination cpu.
- */
- if (last_mm_cid == -1)
- return -1;
- src_cid = READ_ONCE(src_pcpu_cid->cid);
- if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
- return -1;
-
- /*
- * If we observe an active task using the mm on this rq, it means we
- * are not the last task to be migrated from this cpu for this mm, so
- * there is no need to move src_cid to the destination cpu.
- */
- guard(rcu)();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- t->last_mm_cid = -1;
- return -1;
- }
-
- return src_cid;
-}
-
-static
-int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid,
- int src_cid)
-{
- struct task_struct *src_task;
- struct mm_struct *mm = t->mm;
- int lazy_cid;
-
- if (src_cid == -1)
- return -1;
-
- /*
- * Attempt to clear the source cpu cid to move it to the destination
- * cpu.
- */
- lazy_cid = mm_cid_set_lazy_put(src_cid);
- if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
- return -1;
-
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
-
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, this task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
- }
- }
-
- /*
- * The src_cid is unused, so it can be unset.
- */
- if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- return -1;
- WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
- return src_cid;
-}
-
-/*
- * Migration to dst cpu. Called with dst_rq lock held.
- * Interrupts are disabled, which keeps the window of cid ownership without the
- * source rq lock held small.
- */
-void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
-{
- struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
- struct mm_struct *mm = t->mm;
- int src_cid, src_cpu;
- bool dst_cid_is_set;
- struct rq *src_rq;
-
- lockdep_assert_rq_held(dst_rq);
-
- if (!mm)
- return;
- src_cpu = t->migrate_from_cpu;
- if (src_cpu == -1) {
- t->last_mm_cid = -1;
- return;
- }
- /*
- * Move the src cid if the dst cid is unset. This keeps id
- * allocation closest to 0 in cases where few threads migrate around
- * many CPUs.
- *
- * If destination cid or recent cid is already set, we may have
- * to just clear the src cid to ensure compactness in frequent
- * migrations scenarios.
- *
- * It is not useful to clear the src cid when the number of threads is
- * greater or equal to the number of allowed CPUs, because user-space
- * can expect that the number of allowed cids can reach the number of
- * allowed CPUs.
- */
- dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
- dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
- !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
- if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
- return;
- src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
- src_rq = cpu_rq(src_cpu);
- src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
- if (src_cid == -1)
- return;
- src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
- src_cid);
- if (src_cid == -1)
- return;
- if (dst_cid_is_set) {
- __mm_cid_put(mm, src_cid);
- return;
- }
- /* Move src_cid to dst cpu. */
- mm_cid_snapshot_time(dst_rq, mm);
- WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
- WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
-}
-
-static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
- int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- struct task_struct *t;
- int cid, lazy_cid;
-
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid))
- return;
-
- /*
- * Clear the cpu cid if it is set to keep cid allocation compact. If
- * there happens to be other tasks left on the source cpu using this
- * mm, the next task using this mm will reallocate its cid on context
- * switch.
- */
- lazy_cid = mm_cid_set_lazy_put(cid);
- if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
- return;
-
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
-
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, that task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
- return;
- }
-
- /*
- * The cid is unused, so it can be unset.
- * Disable interrupts to keep the window of cid ownership without rq
- * lock small.
- */
- scoped_guard (irqsave) {
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
- }
-}
-
-static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- struct mm_cid *pcpu_cid;
- struct task_struct *curr;
- u64 rq_clock;
-
- /*
- * rq->clock load is racy on 32-bit but one spurious clear once in a
- * while is irrelevant.
- */
- rq_clock = READ_ONCE(rq->clock);
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
-
- /*
- * In order to take care of infrequently scheduled tasks, bump the time
- * snapshot associated with this cid if an active task using the mm is
- * observed on this rq.
- */
- scoped_guard (rcu) {
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- return;
- }
- }
-
- if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
- return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
-}
-
-static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
- int weight)
-{
- struct mm_cid *pcpu_cid;
- int cid;
-
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid) || cid < weight)
- return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
-}
-
-static void task_mm_cid_work(struct callback_head *work)
-{
- unsigned long now = jiffies, old_scan, next_scan;
- struct task_struct *t = current;
- struct cpumask *cidmask;
- struct mm_struct *mm;
- int weight, cpu;
-
- WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
-
- work->next = work; /* Prevent double-add */
- if (t->flags & PF_EXITING)
- return;
- mm = t->mm;
- if (!mm)
- return;
- old_scan = READ_ONCE(mm->mm_cid_next_scan);
- next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
- if (!old_scan) {
- unsigned long res;
-
- res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
- if (res != old_scan)
- old_scan = res;
- else
- old_scan = next_scan;
- }
- if (time_before(now, old_scan))
- return;
- if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
- return;
- cidmask = mm_cidmask(mm);
- /* Clear cids that were not recently used. */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_old(mm, cpu);
- weight = cpumask_weight(cidmask);
- /*
- * Clear cids that are greater or equal to the cidmask weight to
- * recompact it.
- */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_weight(mm, cpu, weight);
-}
-
-void init_sched_mm_cid(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- int mm_users = 0;
-
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1)
- mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
- }
- t->cid_work.next = &t->cid_work; /* Protect against double add */
- init_task_work(&t->cid_work, task_mm_cid_work);
-}
-
-void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
-{
- struct callback_head *work = &curr->cid_work;
- unsigned long now = jiffies;
-
- if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
- work->next != work)
- return;
- if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
- return;
-
- /* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME);
-}
-
void sched_mm_cid_exit_signals(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
+ if (!mm || !t->mm_cid_active)
return;
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ guard(preempt)();
+ t->mm_cid_active = 0;
+ if (t->mm_cid != MM_CID_UNSET) {
+ cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm));
+ t->mm_cid = MM_CID_UNSET;
+ }
}
+/* Deactivate MM CID allocation across execve() */
void sched_mm_cid_before_execve(struct task_struct *t)
{
- struct mm_struct *mm = t->mm;
- struct rq *rq;
-
- if (!mm)
- return;
-
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ sched_mm_cid_exit_signals(t);
}
+/* Reactivate MM CID after successful execve() */
void sched_mm_cid_after_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
if (!mm)
return;
- preempt_disable();
- rq = this_rq();
- scoped_guard (rq_lock_irqsave, rq) {
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
- }
+ guard(preempt)();
+ t->mm_cid_active = 1;
+ mm_cid_select(t);
}
void sched_mm_cid_fork(struct task_struct *t)
{
- WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+ WARN_ON_ONCE(!t->mm || t->mm_cid != MM_CID_UNSET);
t->mm_cid_active = 1;
}
#endif /* CONFIG_SCHED_MM_CID */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3534,288 +3534,83 @@ extern void sched_dynamic_update(int mod
extern const char *preempt_modes[];
#ifdef CONFIG_SCHED_MM_CID
-
-#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
-#define MM_CID_SCAN_DELAY 100 /* 100ms */
-
-extern raw_spinlock_t cid_lock;
-extern int use_cid_lock;
-
-extern void sched_mm_cid_migrate_from(struct task_struct *t);
-extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
-extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
-extern void init_sched_mm_cid(struct task_struct *t);
-
-static inline void __mm_cid_put(struct mm_struct *mm, int cid)
-{
- if (cid < 0)
- return;
- cpumask_clear_cpu(cid, mm_cidmask(mm));
-}
-
-/*
- * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
- * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
- * be held to transition to other states.
- *
- * State transitions synchronized with cmpxchg or try_cmpxchg need to be
- * consistent across CPUs, which prevents use of this_cpu_cmpxchg.
- */
-static inline void mm_cid_put_lazy(struct task_struct *t)
+static inline void init_sched_mm_cid(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid;
+ unsigned int max_cid;
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- if (!mm_cid_is_lazy_put(cid) ||
- !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ if (!mm)
return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
-}
-
-static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
-{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, res;
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- for (;;) {
- if (mm_cid_is_unset(cid))
- return MM_CID_UNSET;
- /*
- * Attempt transition from valid or lazy-put to unset.
- */
- res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
- if (res == cid)
- break;
- cid = res;
- }
- return cid;
+ /* Preset last_mm_cid */
+ max_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users));
+ t->last_mm_cid = max_cid - 1;
}
-static inline void mm_cid_put(struct mm_struct *mm)
+static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
{
- int cid;
+ struct mm_struct *mm = t->mm;
- lockdep_assert_irqs_disabled();
- cid = mm_cid_pcpu_unset(mm);
- if (cid == MM_CID_UNSET)
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ if (cid >= max_cids)
+ return false;
+ if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm)))
+ return false;
+ t->mm_cid = t->last_mm_cid = cid;
+ __this_cpu_write(mm->pcpu_cid->cid, cid);
+ return true;
}
-static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
+static inline bool mm_cid_get(struct task_struct *t)
{
- struct cpumask *cidmask = mm_cidmask(mm);
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, max_nr_cid, allowed_max_nr_cid;
+ struct mm_struct *mm = t->mm;
+ unsigned int max_cids;
- /*
- * After shrinking the number of threads or reducing the number
- * of allowed cpus, reduce the value of max_nr_cid so expansion
- * of cid allocation will preserve cache locality if the number
- * of threads or allowed cpus increase again.
- */
- max_nr_cid = atomic_read(&mm->max_nr_cid);
- while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed),
- atomic_read(&mm->mm_users))),
- max_nr_cid > allowed_max_nr_cid) {
- /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */
- if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) {
- max_nr_cid = allowed_max_nr_cid;
- break;
- }
- }
- /* Try to re-use recent cid. This improves cache locality. */
- cid = __this_cpu_read(pcpu_cid->recent_cid);
- if (!mm_cid_is_unset(cid) && cid < max_nr_cid &&
- !cpumask_test_and_set_cpu(cid, cidmask))
- return cid;
- /*
- * Expand cid allocation if the maximum number of concurrency
- * IDs allocated (max_nr_cid) is below the number cpus allowed
- * and number of threads. Expanding cid allocation as much as
- * possible improves cache locality.
- */
- cid = max_nr_cid;
- while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
- /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */
- if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
- continue;
- if (!cpumask_test_and_set_cpu(cid, cidmask))
- return cid;
- }
- /*
- * Find the first available concurrency id.
- * Retry finding first zero bit if the mask is temporarily
- * filled. This only happens during concurrent remote-clear
- * which owns a cid without holding a rq lock.
- */
- for (;;) {
- cid = cpumask_first_zero(cidmask);
- if (cid < READ_ONCE(mm->nr_cpus_allowed))
- break;
- cpu_relax();
- }
- if (cpumask_test_and_set_cpu(cid, cidmask))
- return -1;
+ max_cids = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users));
- return cid;
-}
+ /* Try to reuse the last CID of this task */
+ if (__mm_cid_get(t, t->last_mm_cid, max_cids))
+ return true;
-/*
- * Save a snapshot of the current runqueue time of this cpu
- * with the per-cpu cid value, allowing to estimate how recently it was used.
- */
-static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
-{
- struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+ /* Try to reuse the last CID of this mm on this CPU */
+ if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids))
+ return true;
- lockdep_assert_rq_held(rq);
- WRITE_ONCE(pcpu_cid->time, rq->clock);
+ /* Try the first zero bit in the cidmask. */
+ return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids);
}
-static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
- struct mm_struct *mm)
+static inline void mm_cid_select(struct task_struct *t)
{
- int cid;
-
- /*
- * All allocations (even those using the cid_lock) are lock-free. If
- * use_cid_lock is set, hold the cid_lock to perform cid allocation to
- * guarantee forward progress.
- */
- if (!READ_ONCE(use_cid_lock)) {
- cid = __mm_cid_try_get(t, mm);
- if (cid >= 0)
- goto end;
- raw_spin_lock(&cid_lock);
- } else {
- raw_spin_lock(&cid_lock);
- cid = __mm_cid_try_get(t, mm);
- if (cid >= 0)
- goto unlock;
- }
-
- /*
- * cid concurrently allocated. Retry while forcing following
- * allocations to use the cid_lock to ensure forward progress.
- */
- WRITE_ONCE(use_cid_lock, 1);
/*
- * Set use_cid_lock before allocation. Only care about program order
- * because this is only required for forward progress.
+ * mm_cid_get() can fail when the maximum CID, which is determined
+ * by min(mm->nr_cpus_allowed, mm->mm_users), changes concurrently.
+ * That's a transient failure as there cannot be more tasks
+ * concurrently on a CPU (or about to be scheduled in) than that.
*/
- barrier();
- /*
- * Retry until it succeeds. It is guaranteed to eventually succeed once
- * all newcoming allocations observe the use_cid_lock flag set.
- */
- do {
- cid = __mm_cid_try_get(t, mm);
- cpu_relax();
- } while (cid < 0);
- /*
- * Allocate before clearing use_cid_lock. Only care about
- * program order because this is for forward progress.
- */
- barrier();
- WRITE_ONCE(use_cid_lock, 0);
-unlock:
- raw_spin_unlock(&cid_lock);
-end:
- mm_cid_snapshot_time(rq, mm);
-
- return cid;
-}
-
-static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
- struct mm_struct *mm)
-{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- struct cpumask *cpumask;
- int cid;
-
- lockdep_assert_rq_held(rq);
- cpumask = mm_cidmask(mm);
- cid = __this_cpu_read(pcpu_cid->cid);
- if (mm_cid_is_valid(cid)) {
- mm_cid_snapshot_time(rq, mm);
- return cid;
- }
- if (mm_cid_is_lazy_put(cid)) {
- if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ for (;;) {
+ if (mm_cid_get(t))
+ break;
}
- cid = __mm_cid_get(rq, t, mm);
- __this_cpu_write(pcpu_cid->cid, cid);
- __this_cpu_write(pcpu_cid->recent_cid, cid);
-
- return cid;
}
-static inline void switch_mm_cid(struct rq *rq,
- struct task_struct *prev,
- struct task_struct *next)
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
{
- /*
- * Provide a memory barrier between rq->curr store and load of
- * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
- *
- * Should be adapted if context_switch() is modified.
- */
- if (!next->mm) { // to kernel
- /*
- * user -> kernel transition does not guarantee a barrier, but
- * we can use the fact that it performs an atomic operation in
- * mmgrab().
- */
- if (prev->mm) // from user
- smp_mb__after_mmgrab();
- /*
- * kernel -> kernel transition does not change rq->curr->mm
- * state. It stays NULL.
- */
- } else { // to user
- /*
- * kernel -> user transition does not provide a barrier
- * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
- * Provide it here.
- */
- if (!prev->mm) { // from kernel
- smp_mb();
- } else { // from user
- /*
- * user->user transition relies on an implicit
- * memory barrier in switch_mm() when
- * current->mm changes. If the architecture
- * switch_mm() does not have an implicit memory
- * barrier, it is emitted here. If current->mm
- * is unchanged, no barrier is needed.
- */
- smp_mb__after_switch_mm();
- }
- }
if (prev->mm_cid_active) {
- mm_cid_snapshot_time(rq, prev->mm);
- mm_cid_put_lazy(prev);
- prev->mm_cid = -1;
+ if (prev->mm_cid != MM_CID_UNSET)
+ cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm));
+ prev->mm_cid = MM_CID_UNSET;
}
+
if (next->mm_cid_active) {
- next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
+ mm_cid_select(next);
rseq_sched_set_task_mm_cid(next, next->mm_cid);
}
}
#else /* !CONFIG_SCHED_MM_CID: */
-static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
-static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
-static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
-static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
+static inline void mm_cid_select(struct task_struct *t) { }
+static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 02/19] sched/mmcid: Use proper data structures
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
2025-10-15 17:29 ` [patch 01/19] sched/mmcid: Revert the complex " Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 03/19] sched/mmcid: Cacheline align MM CID storage Thomas Gleixner
` (18 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Having a lot of CID functionality specific members in struct task_struct
and struct mm_struct is not really making the code easier to read.
Encapsulate the CID specific parts in data structures and keep them
separate from the stuff they are embedded in.
No functional change.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/mm_types.h | 56 +++++++++++----------------------------------
include/linux/rseq_types.h | 42 +++++++++++++++++++++++++++++++++
include/linux/sched.h | 11 +-------
init/init_task.c | 3 ++
kernel/fork.c | 6 ++--
kernel/sched/core.c | 16 ++++++------
kernel/sched/sched.h | 26 ++++++++++----------
7 files changed, 85 insertions(+), 75 deletions(-)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -20,6 +20,7 @@
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
#include <linux/types.h>
+#include <linux/rseq_types.h>
#include <asm/mmu.h>
@@ -924,10 +925,6 @@ struct vm_area_struct {
#define vma_policy(vma) NULL
#endif
-struct mm_cid {
- unsigned int cid;
-};
-
struct kioctx_table;
struct iommu_mm_data;
struct mm_struct {
@@ -980,30 +977,9 @@ struct mm_struct {
*/
atomic_t mm_users;
-#ifdef CONFIG_SCHED_MM_CID
- /**
- * @pcpu_cid: Per-cpu current cid.
- *
- * Keep track of the currently allocated mm_cid for each cpu.
- * The per-cpu mm_cid values are serialized by their respective
- * runqueue locks.
- */
- struct mm_cid __percpu *pcpu_cid;
- /**
- * @nr_cpus_allowed: Number of CPUs allowed for mm.
- *
- * Number of CPUs allowed in the union of all mm's
- * threads allowed CPUs.
- */
- unsigned int nr_cpus_allowed;
- /**
- * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
- *
- * Provide mutual exclusion for mm cpus_allowed and
- * mm nr_cpus_allowed updates.
- */
- raw_spinlock_t cpus_allowed_lock;
-#endif
+ /* MM CID related storage */
+ struct mm_mm_cid mm_cid;
+
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
#endif
@@ -1306,9 +1282,6 @@ static inline void vma_iter_init(struct
}
#ifdef CONFIG_SCHED_MM_CID
-
-#define MM_CID_UNSET (~0U)
-
/*
* mm_cpus_allowed: Union of all mm's threads allowed CPUs.
*/
@@ -1337,20 +1310,20 @@ static inline void mm_init_cid(struct mm
int i;
for_each_possible_cpu(i) {
- struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
+ struct mm_cid_pcpu *pcpu = per_cpu_ptr(mm->mm_cid.pcpu, i);
- pcpu_cid->cid = MM_CID_UNSET;
+ pcpu->cid = MM_CID_UNSET;
}
- mm->nr_cpus_allowed = p->nr_cpus_allowed;
- raw_spin_lock_init(&mm->cpus_allowed_lock);
+ mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ raw_spin_lock_init(&mm->mm_cid.lock);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
cpumask_clear(mm_cidmask(mm));
}
static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
{
- mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
- if (!mm->pcpu_cid)
+ mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu);
+ if (!mm->mm_cid.pcpu)
return -ENOMEM;
mm_init_cid(mm, p);
return 0;
@@ -1359,8 +1332,8 @@ static inline int mm_alloc_cid_noprof(st
static inline void mm_destroy_cid(struct mm_struct *mm)
{
- free_percpu(mm->pcpu_cid);
- mm->pcpu_cid = NULL;
+ free_percpu(mm->mm_cid.pcpu);
+ mm->mm_cid.pcpu = NULL;
}
static inline unsigned int mm_cid_size(void)
@@ -1375,10 +1348,9 @@ static inline void mm_set_cpus_allowed(s
if (!mm)
return;
/* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
- raw_spin_lock(&mm->cpus_allowed_lock);
+ guard(raw_spinlock)(&mm->mm_cid.lock);
cpumask_or(mm_allowed, mm_allowed, cpumask);
- WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
- raw_spin_unlock(&mm->cpus_allowed_lock);
+ WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed));
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -90,4 +90,46 @@ struct rseq_data {
struct rseq_data { };
#endif /* !CONFIG_RSEQ */
+#ifdef CONFIG_SCHED_MM_CID
+
+#define MM_CID_UNSET (~0U)
+
+/**
+ * struct sched_mm_cid - Storage for per task MM CID data
+ * @active: MM CID is active for the task
+ * @cid: The CID associated to the task
+ * @last_cid: The last CID associated to the task
+ */
+struct sched_mm_cid {
+ unsigned int active;
+ unsigned int cid;
+ unsigned int last_cid;
+};
+
+/**
+ * struct mm_cid_pcpu - Storage for per CPU MM_CID data
+ * @cid: The CID associated to the CPU
+ */
+struct mm_cid_pcpu {
+ unsigned int cid;
+};
+
+/**
+ * struct mm_mm_cid - Storage for per MM CID data
+ * @pcpu: Per CPU storage for CIDs associated to a CPU
+ * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
+ * is growth only.
+ * @lock: Spinlock to protect all fields except @pcpu. It also protects
+ * the MM cid cpumask and the MM cidmask bitmap.
+ */
+struct mm_mm_cid {
+ struct mm_cid_pcpu __percpu *pcpu;
+ unsigned int nr_cpus_allowed;
+ raw_spinlock_t lock;
+};
+#else /* CONFIG_SCHED_MM_CID */
+struct mm_cid { };
+struct sched_mm_cid { };
+#endif /* !CONFIG_SCHED_MM_CID */
+
#endif
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1399,14 +1399,7 @@ struct task_struct {
#endif /* CONFIG_NUMA_BALANCING */
struct rseq_data rseq;
-
-#ifdef CONFIG_SCHED_MM_CID
- int mm_cid; /* Current cid in mm */
- int last_mm_cid; /* Most recent cid in mm */
- int migrate_from_cpu;
- int mm_cid_active; /* Whether cid bitmap is active */
- struct callback_head cid_work;
-#endif
+ struct sched_mm_cid mm_cid;
struct tlbflush_unmap_batch tlb_ubc;
@@ -2300,7 +2293,7 @@ void sched_mm_cid_fork(struct task_struc
void sched_mm_cid_exit_signals(struct task_struct *t);
static inline int task_mm_cid(struct task_struct *t)
{
- return t->mm_cid;
+ return t->mm_cid.cid;
}
#else
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -220,6 +220,9 @@ struct task_struct init_task __aligned(L
#ifdef CONFIG_SECCOMP_FILTER
.seccomp = { .filter_count = ATOMIC_INIT(0) },
#endif
+#ifdef CONFIG_SCHED_MM_CID
+ .mm_cid = { .cid = MM_CID_UNSET, },
+#endif
};
EXPORT_SYMBOL(init_task);
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -955,9 +955,9 @@ static struct task_struct *dup_task_stru
#endif
#ifdef CONFIG_SCHED_MM_CID
- tsk->mm_cid = MM_CID_UNSET;
- tsk->last_mm_cid = MM_CID_UNSET;
- tsk->mm_cid_active = 0;
+ tsk->mm_cid.cid = MM_CID_UNSET;
+ tsk->mm_cid.last_cid = MM_CID_UNSET;
+ tsk->mm_cid.active = 0;
#endif
return tsk;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10411,14 +10411,14 @@ void sched_mm_cid_exit_signals(struct ta
{
struct mm_struct *mm = t->mm;
- if (!mm || !t->mm_cid_active)
+ if (!mm || !t->mm_cid.active)
return;
guard(preempt)();
- t->mm_cid_active = 0;
- if (t->mm_cid != MM_CID_UNSET) {
- cpumask_clear_cpu(t->mm_cid, mm_cidmask(mm));
- t->mm_cid = MM_CID_UNSET;
+ t->mm_cid.active = 0;
+ if (t->mm_cid.cid != MM_CID_UNSET) {
+ cpumask_clear_cpu(t->mm_cid.cid, mm_cidmask(mm));
+ t->mm_cid.cid = MM_CID_UNSET;
}
}
@@ -10437,14 +10437,14 @@ void sched_mm_cid_after_execve(struct ta
return;
guard(preempt)();
- t->mm_cid_active = 1;
+ t->mm_cid.active = 1;
mm_cid_select(t);
}
void sched_mm_cid_fork(struct task_struct *t)
{
- WARN_ON_ONCE(!t->mm || t->mm_cid != MM_CID_UNSET);
- t->mm_cid_active = 1;
+ WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
+ t->mm_cid.active = 1;
}
#endif /* CONFIG_SCHED_MM_CID */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3543,8 +3543,8 @@ static inline void init_sched_mm_cid(str
return;
/* Preset last_mm_cid */
- max_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users));
- t->last_mm_cid = max_cid - 1;
+ max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
+ t->mm_cid.last_cid = max_cid - 1;
}
static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
@@ -3555,8 +3555,8 @@ static inline bool __mm_cid_get(struct t
return false;
if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm)))
return false;
- t->mm_cid = t->last_mm_cid = cid;
- __this_cpu_write(mm->pcpu_cid->cid, cid);
+ t->mm_cid.cid = t->mm_cid.last_cid = cid;
+ __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
return true;
}
@@ -3565,14 +3565,14 @@ static inline bool mm_cid_get(struct tas
struct mm_struct *mm = t->mm;
unsigned int max_cids;
- max_cids = min_t(int, READ_ONCE(mm->nr_cpus_allowed), atomic_read(&mm->mm_users));
+ max_cids = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
/* Try to reuse the last CID of this task */
- if (__mm_cid_get(t, t->last_mm_cid, max_cids))
+ if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
return true;
/* Try to reuse the last CID of this mm on this CPU */
- if (__mm_cid_get(t, __this_cpu_read(mm->pcpu_cid->cid), max_cids))
+ if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
return true;
/* Try the first zero bit in the cidmask. */
@@ -3595,15 +3595,15 @@ static inline void mm_cid_select(struct
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
{
- if (prev->mm_cid_active) {
- if (prev->mm_cid != MM_CID_UNSET)
- cpumask_clear_cpu(prev->mm_cid, mm_cidmask(prev->mm));
- prev->mm_cid = MM_CID_UNSET;
+ if (prev->mm_cid.active) {
+ if (prev->mm_cid.cid != MM_CID_UNSET)
+ cpumask_clear_cpu(prev->mm_cid.cid, mm_cidmask(prev->mm));
+ prev->mm_cid.cid = MM_CID_UNSET;
}
- if (next->mm_cid_active) {
+ if (next->mm_cid.active) {
mm_cid_select(next);
- rseq_sched_set_task_mm_cid(next, next->mm_cid);
+ rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
}
}
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 03/19] sched/mmcid: Cacheline align MM CID storage
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
2025-10-15 17:29 ` [patch 01/19] sched/mmcid: Revert the complex " Thomas Gleixner
2025-10-15 17:29 ` [patch 02/19] sched/mmcid: Use proper data structures Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 04/19] sched: Fixup whitespace damage Thomas Gleixner
` (17 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Both the per CPU storage and the data in mm_struct are heavily used in
context switch. As they can end up next to other frequently modified data,
they are subject to false sharing.
Make them cache line aligned.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq_types.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -112,7 +112,7 @@ struct sched_mm_cid {
*/
struct mm_cid_pcpu {
unsigned int cid;
-};
+}____cacheline_aligned_in_smp;
/**
* struct mm_mm_cid - Storage for per MM CID data
@@ -126,7 +126,7 @@ struct mm_mm_cid {
struct mm_cid_pcpu __percpu *pcpu;
unsigned int nr_cpus_allowed;
raw_spinlock_t lock;
-};
+}____cacheline_aligned_in_smp;
#else /* CONFIG_SCHED_MM_CID */
struct mm_cid { };
struct sched_mm_cid { };
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 04/19] sched: Fixup whitespace damage
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (2 preceding siblings ...)
2025-10-15 17:29 ` [patch 03/19] sched/mmcid: Cacheline align MM CID storage Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 05/19] sched/mmcid: Move scheduler code out of global header Thomas Gleixner
` (16 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
With whitespace checks enabled in the editor this makes eyes bleed.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/sched/core.c | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5309,19 +5309,16 @@ context_switch(struct rq *rq, struct tas
*
* kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
- *
- * switch_mm_cid() needs to be updated if the barriers provided
- * by context_switch() are modified.
*/
- if (!next->mm) { // to kernel
+ if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
- if (prev->mm) // from user
+ if (prev->mm) // from user
mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
- } else { // to user
+ } else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
@@ -5334,7 +5331,7 @@ context_switch(struct rq *rq, struct tas
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
- if (!prev->mm) { // from kernel
+ if (!prev->mm) { // from kernel
/* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 05/19] sched/mmcid: Move scheduler code out of global header
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (3 preceding siblings ...)
2025-10-15 17:29 ` [patch 04/19] sched: Fixup whitespace damage Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed() Thomas Gleixner
` (15 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
This is only used in the scheduler core code, so there is no point to have
it in a global header.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/mm_types.h | 13 -------------
kernel/sched/core.c | 20 ++++++++++++++++++--
2 files changed, 18 insertions(+), 15 deletions(-)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1341,27 +1341,14 @@ static inline unsigned int mm_cid_size(v
return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */
}
-static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
-{
- struct cpumask *mm_allowed = mm_cpus_allowed(mm);
-
- if (!mm)
- return;
- /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
- guard(raw_spinlock)(&mm->mm_cid.lock);
- cpumask_or(mm_allowed, mm_allowed, cpumask);
- WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed));
-}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
static inline void mm_destroy_cid(struct mm_struct *mm) { }
-
static inline unsigned int mm_cid_size(void)
{
return 0;
}
-static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
#endif /* CONFIG_SCHED_MM_CID */
struct mmu_gather;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2704,6 +2704,8 @@ int push_cpu_stop(void *arg)
return 0;
}
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask);
+
/*
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
@@ -2763,7 +2765,7 @@ static void
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
+ mm_update_cpus_allowed(p->mm, ctx->new_mask);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -10404,6 +10406,18 @@ void call_trace_sched_update_nr_running(
* When a task exits, the MM CID held by the task is not longer required as
* the task cannot return to user space.
*/
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
+{
+ struct cpumask *mm_allowed = mm_cpus_allowed(mm);
+
+ if (!mm)
+ return;
+ /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
+ guard(raw_spinlock)(&mm->mm_cid.lock);
+ cpumask_or(mm_allowed, mm_allowed, affmsk);
+ WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed));
+}
+
void sched_mm_cid_exit_signals(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
@@ -10443,7 +10457,9 @@ void sched_mm_cid_fork(struct task_struc
WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
t->mm_cid.active = 1;
}
-#endif /* CONFIG_SCHED_MM_CID */
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+#endif /* !CONFIG_SCHED_MM_CID */
#ifdef CONFIG_SCHED_CLASS_EXT
void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (4 preceding siblings ...)
2025-10-15 17:29 ` [patch 05/19] sched/mmcid: Move scheduler code out of global header Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-17 11:12 ` Peter Zijlstra
2025-10-15 17:29 ` [patch 07/19] cpumask: Introduce cpumask_or_weight() Thomas Gleixner
` (14 subsequent siblings)
20 siblings, 1 reply; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
The @nr_cpus_allowed management does way too much useless work for the
common case where a process starts with unrestricted affinity,
i.e. @nr_cpus_allowed is equal to the number of possible CPUs right
away.
Add a check whether that limit is reached already and then avoid the whole
cpumask update and evaluation.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/sched/core.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2719,6 +2719,7 @@ void set_cpus_allowed_common(struct task
cpumask_copy(&p->cpus_mask, ctx->new_mask);
p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+ mm_update_cpus_allowed(p->mm, ctx->new_mask);
/*
* Swap in a new user_cpus_ptr if SCA_USER flag set
@@ -2765,7 +2766,6 @@ static void
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, ctx);
- mm_update_cpus_allowed(p->mm, ctx->new_mask);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -10408,12 +10408,20 @@ void call_trace_sched_update_nr_running(
*/
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
- struct cpumask *mm_allowed = mm_cpus_allowed(mm);
+ struct cpumask *mm_allowed;
- if (!mm)
+ if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids)
return;
- /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
+
+ /*
+ * mm::mm_cid::mm_cpus_allowed is the superset of each threads
+ * allowed CPUs mask which means it can only grow.
+ */
guard(raw_spinlock)(&mm->mm_cid.lock);
+ /* Check again under the lock */
+ if (mm->mm_cid.nr_cpus_allowed == nr_cpu_ids)
+ return;
+ mm_allowed = mm_cpus_allowed(mm);
cpumask_or(mm_allowed, mm_allowed, affmsk);
WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed));
}
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
2025-10-15 17:29 ` [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed() Thomas Gleixner
@ 2025-10-17 11:12 ` Peter Zijlstra
2025-10-17 12:49 ` Thomas Gleixner
0 siblings, 1 reply; 39+ messages in thread
From: Peter Zijlstra @ 2025-10-17 11:12 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Wed, Oct 15, 2025 at 07:29:34PM +0200, Thomas Gleixner wrote:
> + if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids)
> return;
FWIW this doesn't work on architectures that change their
cpu_possible_mask around (eg. Power).
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
2025-10-17 11:12 ` Peter Zijlstra
@ 2025-10-17 12:49 ` Thomas Gleixner
2025-10-17 17:58 ` Peter Zijlstra
0 siblings, 1 reply; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-17 12:49 UTC (permalink / raw)
To: Peter Zijlstra
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Fri, Oct 17 2025 at 13:12, Peter Zijlstra wrote:
> On Wed, Oct 15, 2025 at 07:29:34PM +0200, Thomas Gleixner wrote:
>
>> + if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids)
>> return;
>
> FWIW this doesn't work on architectures that change their
> cpu_possible_mask around (eg. Power).
No. Power does not change it after boot either. Half of the kernel would
explode if that'd be the case.
Thanks,
tglx
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
2025-10-17 12:49 ` Thomas Gleixner
@ 2025-10-17 17:58 ` Peter Zijlstra
2025-10-17 18:19 ` Peter Zijlstra
0 siblings, 1 reply; 39+ messages in thread
From: Peter Zijlstra @ 2025-10-17 17:58 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Fri, Oct 17, 2025 at 02:49:29PM +0200, Thomas Gleixner wrote:
> On Fri, Oct 17 2025 at 13:12, Peter Zijlstra wrote:
> > On Wed, Oct 15, 2025 at 07:29:34PM +0200, Thomas Gleixner wrote:
> >
> >> + if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids)
> >> return;
> >
> > FWIW this doesn't work on architectures that change their
> > cpu_possible_mask around (eg. Power).
>
> No. Power does not change it after boot either. Half of the kernel would
> explode if that'd be the case.
Power very much does change cpu_possible_mask; it doesn't change
nr_cpu_ids. Anyway, the point is that a full mask won't be nr_cpu_ids.
Same is true when you offline a CPU come to think of it.
Same is true if the cpumask is sparse.
Anyway, just saying, checking against nr_cpu_ids might not be the best
shortcut here.
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
2025-10-17 17:58 ` Peter Zijlstra
@ 2025-10-17 18:19 ` Peter Zijlstra
2025-10-19 20:32 ` Thomas Gleixner
0 siblings, 1 reply; 39+ messages in thread
From: Peter Zijlstra @ 2025-10-17 18:19 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Fri, Oct 17, 2025 at 07:58:53PM +0200, Peter Zijlstra wrote:
> On Fri, Oct 17, 2025 at 02:49:29PM +0200, Thomas Gleixner wrote:
> > On Fri, Oct 17 2025 at 13:12, Peter Zijlstra wrote:
> > > On Wed, Oct 15, 2025 at 07:29:34PM +0200, Thomas Gleixner wrote:
> > >
> > >> + if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids)
> > >> return;
> > >
> > > FWIW this doesn't work on architectures that change their
> > > cpu_possible_mask around (eg. Power).
> >
> > No. Power does not change it after boot either. Half of the kernel would
> > explode if that'd be the case.
>
> Power very much does changes cpu_possible_mask; it doesn't change
> nr_cpu_ids. Anyway, the point is that a full mask won't be nr_cpu_ids.
Gah, bad memories, it is cpu_present_mask they change.
> Same is true when you offline a CPU come to think of it.
>
> Same is true if the cpumask is sparse.
>
> Anyway, just saying, checking against nr_cpu_ids might not be the best
> shortcut here.
Put another way, nr_cpus_allowed == nr_cpu_ids only work when none of
the masks involved have holes. The moment anything {possible, present,
online} has holes in, it goes sideways.
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
2025-10-17 18:19 ` Peter Zijlstra
@ 2025-10-19 20:32 ` Thomas Gleixner
2025-10-20 8:22 ` Peter Zijlstra
0 siblings, 1 reply; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-19 20:32 UTC (permalink / raw)
To: Peter Zijlstra
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Fri, Oct 17 2025 at 20:19, Peter Zijlstra wrote:
> On Fri, Oct 17, 2025 at 07:58:53PM +0200, Peter Zijlstra wrote:
>> Same is true when you offline a CPU come to think of it.
>>
>> Same is true if the cpumask is sparse.
>>
>> Anyway, just saying, checking against nr_cpu_ids might not be the best
>> shortcut here.
>
> Put another way, nr_cpus_allowed == nr_cpu_ids only work when none of
> the masks involved have holes. The moment anything {possible, present,
> online} has holes in, it goes sideways.
You're right. I was too narrowly focussed on the normal x86 case, where
nr_cpu_ids == num_possible_cpus ....
Let me think about that.
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
2025-10-19 20:32 ` Thomas Gleixner
@ 2025-10-20 8:22 ` Peter Zijlstra
2025-10-21 18:25 ` Thomas Gleixner
0 siblings, 1 reply; 39+ messages in thread
From: Peter Zijlstra @ 2025-10-20 8:22 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Sun, Oct 19, 2025 at 10:32:47PM +0200, Thomas Gleixner wrote:
> On Fri, Oct 17 2025 at 20:19, Peter Zijlstra wrote:
> > On Fri, Oct 17, 2025 at 07:58:53PM +0200, Peter Zijlstra wrote:
> >> Same is true when you offline a CPU come to think of it.
> >>
> >> Same is true if the cpumask is sparse.
> >>
> >> Anyway, just saying, checking against nr_cpu_ids might not be the best
> >> shortcut here.
> >
> > Put another way, nr_cpus_allowed == nr_cpu_ids only work when none of
> > the masks involved have holes. The moment anything {possible, present,
> > online} has holes in, it goes sideways.
>
> You're right. I was too narrowly focussed on the normal x86 case, where
> nr_cpu_ids == num_possible_cpus ....
>
> Let me think about that.
So the obvious idea would be to grow hotplug hooks, such that you can
do:
nr_cpus_allowed == num_online_cpus()
But then hotplug will have to iterate all mm's. Doable, but not really
nice.
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
2025-10-20 8:22 ` Peter Zijlstra
@ 2025-10-21 18:25 ` Thomas Gleixner
0 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-21 18:25 UTC (permalink / raw)
To: Peter Zijlstra
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Mon, Oct 20 2025 at 10:22, Peter Zijlstra wrote:
> On Sun, Oct 19, 2025 at 10:32:47PM +0200, Thomas Gleixner wrote:
>> On Fri, Oct 17 2025 at 20:19, Peter Zijlstra wrote:
>> > On Fri, Oct 17, 2025 at 07:58:53PM +0200, Peter Zijlstra wrote:
>> >> Same is true when you offline a CPU come to think of it.
>> >>
>> >> Same is true if the cpumask is sparse.
>> >>
>> >> Anyway, just saying, checking against nr_cpu_ids might not be the best
>> >> shortcut here.
>> >
>> > Put another way, nr_cpus_allowed == nr_cpu_ids only work when none of
>> > the masks involved have holes. The moment anything {possible, present,
>> > online} has holes in, it goes sideways.
>>
>> You're right. I was too narrowly focussed on the normal x86 case, where
>> nr_cpu_ids == num_possible_cpus ....
>>
>> Let me think about that.
>
> So the obvious idea would be to grow hotplug hooks, such that you can
> do:
>
> nr_cpus_allowed == num_online_cpus()
>
> But then hotplug will have to iterate all mm's. Doable, but not really
> nice.
Right, but that can be done once the dust settled and if there is
actually a need for it.
Thanks,
tglx
^ permalink raw reply [flat|nested] 39+ messages in thread
* [patch 07/19] cpumask: Introduce cpumask_or_weight()
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (5 preceding siblings ...)
2025-10-15 17:29 ` [patch 06/19] sched/mmcid: Prevent pointless work in mm_update_cpus_allowed() Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:41 ` Yury Norov
2025-10-15 17:29 ` [patch 08/19] sched/mmcid: Use cpumask_or_weight() Thomas Gleixner
` (13 subsequent siblings)
20 siblings, 1 reply; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team, Yury Norov
CID management OR's two cpumasks and then calculates the weight on the
result. That's inefficient as that has to walk the same stuff twice. As
this is done with runqueue lock held, there is a real benefit of speeding
this up.
Provide cpumask_or_weight() and the corresponding bitmap functions which
return the weight of the OR result right away.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Yury Norov <yury.norov@gmail.com>
---
include/linux/bitmap.h | 15 +++++++++++++++
include/linux/cpumask.h | 16 ++++++++++++++++
lib/bitmap.c | 17 +++++++++++++++++
3 files changed, 48 insertions(+)
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -45,6 +45,7 @@ struct device;
* bitmap_copy(dst, src, nbits) *dst = *src
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
+ * bitmap_or_weight(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
@@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, co
const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
+unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
@@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const
}
static __always_inline
+unsigned int bitmap_or_weight(unsigned long *dst, const unsigned long *src1,
+ const unsigned long *src2, unsigned int nbits)
+{
+ if (small_const_nbits(nbits)) {
+ *dst = *src1 | *src2;
+ return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
+ } else {
+ return __bitmap_or_weight(dst, src1, src2, nbits);
+ }
+}
+
+static __always_inline
void bitmap_xor(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -729,6 +729,22 @@ void cpumask_or(struct cpumask *dstp, co
}
/**
+ * cpumask_or_weight - *dstp = *src1p | *src2p and return the weight of the result
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Return: The number of bits set in the resulting cpumask @dstp
+ */
+static __always_inline
+unsigned int cpumask_or_weight(struct cpumask *dstp, const struct cpumask *src1p,
+ const struct cpumask *src2p)
+{
+ return bitmap_or_weight(cpumask_bits(dstp), cpumask_bits(src1p),
+ cpumask_bits(src2p), small_cpumask_bits);
+}
+
+/**
* cpumask_xor - *dstp = *src1p ^ *src2p
* @dstp: the cpumask result
* @src1p: the first input
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -253,6 +253,23 @@ void __bitmap_or(unsigned long *dst, con
}
EXPORT_SYMBOL(__bitmap_or);
+unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int bits)
+{
+ unsigned int k, w = 0;
+
+ for (k = 0; k < bits / BITS_PER_LONG; k++) {
+ dst[k] = bitmap1[k] | bitmap2[k];
+ w += hweight_long(dst[k]);
+ }
+
+ if (bits % BITS_PER_LONG) {
+ dst[k] = bitmap1[k] | bitmap2[k];
+ w += hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits));
+ }
+ return w;
+}
+
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int bits)
{
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 07/19] cpumask: Introduce cpumask_or_weight()
2025-10-15 17:29 ` [patch 07/19] cpumask: Introduce cpumask_or_weight() Thomas Gleixner
@ 2025-10-15 17:41 ` Yury Norov
2025-10-15 18:06 ` Yury Norov
2025-10-21 19:34 ` Thomas Gleixner
0 siblings, 2 replies; 39+ messages in thread
From: Yury Norov @ 2025-10-15 17:41 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Hi Tomas,
On Wed, Oct 15, 2025 at 07:29:36PM +0200, Thomas Gleixner wrote:
> CID management OR's two cpumasks and then calculates the weight on the
> result. That's inefficient as that has to walk the same stuff twice. As
> this is done with runqueue lock held, there is a real benefit of speeding
> this up.
>
> Provide cpumask_or_weight() and the corresponding bitmap functions which
> return the weight of the OR result right away.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> Cc: Yury Norov <yury.norov@gmail.com>
> ---
> include/linux/bitmap.h | 15 +++++++++++++++
> include/linux/cpumask.h | 16 ++++++++++++++++
> lib/bitmap.c | 17 +++++++++++++++++
> 3 files changed, 48 insertions(+)
>
> --- a/include/linux/bitmap.h
> +++ b/include/linux/bitmap.h
> @@ -45,6 +45,7 @@ struct device;
> * bitmap_copy(dst, src, nbits) *dst = *src
> * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
> * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
> + * bitmap_or_weight(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst
> * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
> * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
> * bitmap_complement(dst, src, nbits) *dst = ~(*src)
> @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, co
> const unsigned long *bitmap2, unsigned int nbits);
> void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
> const unsigned long *bitmap2, unsigned int nbits);
> +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
> + const unsigned long *bitmap2, unsigned int nbits);
> void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
> const unsigned long *bitmap2, unsigned int nbits);
> bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
> @@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const
> }
>
> static __always_inline
> +unsigned int bitmap_or_weight(unsigned long *dst, const unsigned long *src1,
> + const unsigned long *src2, unsigned int nbits)
> +{
> + if (small_const_nbits(nbits)) {
> + *dst = *src1 | *src2;
> + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
> + } else {
> + return __bitmap_or_weight(dst, src1, src2, nbits);
> + }
> +}
> +
> +static __always_inline
> void bitmap_xor(unsigned long *dst, const unsigned long *src1,
> const unsigned long *src2, unsigned int nbits)
> {
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -729,6 +729,22 @@ void cpumask_or(struct cpumask *dstp, co
> }
>
> /**
> + * cpumask_or_weight - *dstp = *src1p | *src2p and return the weight of the result
> + * @dstp: the cpumask result
> + * @src1p: the first input
> + * @src2p: the second input
> + *
> + * Return: The number of bits set in the resulting cpumask @dstp
> + */
> +static __always_inline
> +unsigned int cpumask_or_weight(struct cpumask *dstp, const struct cpumask *src1p,
> + const struct cpumask *src2p)
> +{
> + return bitmap_or_weight(cpumask_bits(dstp), cpumask_bits(src1p),
> + cpumask_bits(src2p), small_cpumask_bits);
> +}
> +
> +/**
> * cpumask_xor - *dstp = *src1p ^ *src2p
> * @dstp: the cpumask result
> * @src1p: the first input
> --- a/lib/bitmap.c
> +++ b/lib/bitmap.c
> @@ -253,6 +253,23 @@ void __bitmap_or(unsigned long *dst, con
> }
> EXPORT_SYMBOL(__bitmap_or);
>
> +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
> + const unsigned long *bitmap2, unsigned int bits)
> +{
> + unsigned int k, w = 0;
> +
> + for (k = 0; k < bits / BITS_PER_LONG; k++) {
> + dst[k] = bitmap1[k] | bitmap2[k];
> + w += hweight_long(dst[k]);
> + }
> +
> + if (bits % BITS_PER_LONG) {
> + dst[k] = bitmap1[k] | bitmap2[k];
> + w += hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits));
> + }
> + return w;
> +}
We've got bitmap_weight_and() and bitmap_weight_andnot() already. Can
you align naming with the existing scheme: bitmap_weight_or().
Also, for outline implementation, can you employ the BITMAP_WEIGHT()
macro?
Thanks,
Yury
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 07/19] cpumask: Introduce cpumask_or_weight()
2025-10-15 17:41 ` Yury Norov
@ 2025-10-15 18:06 ` Yury Norov
2025-10-21 20:21 ` Thomas Gleixner
2025-10-21 19:34 ` Thomas Gleixner
1 sibling, 1 reply; 39+ messages in thread
From: Yury Norov @ 2025-10-15 18:06 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
On Wed, Oct 15, 2025 at 01:41:50PM -0400, Yury Norov wrote:
> Hi Tomas,
>
> On Wed, Oct 15, 2025 at 07:29:36PM +0200, Thomas Gleixner wrote:
> > CID management OR's two cpumasks and then calculates the weight on the
> > result. That's inefficient as that has to walk the same stuff twice. As
> > this is done with runqueue lock held, there is a real benefit of speeding
> > this up.
> >
> > Provide cpumask_or_weight() and the corresponding bitmap functions which
> > return the weight of the OR result right away.
> >
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Yury Norov <yury.norov@gmail.com>
> > ---
> > include/linux/bitmap.h | 15 +++++++++++++++
> > include/linux/cpumask.h | 16 ++++++++++++++++
> > lib/bitmap.c | 17 +++++++++++++++++
> > 3 files changed, 48 insertions(+)
> >
> > --- a/include/linux/bitmap.h
> > +++ b/include/linux/bitmap.h
> > @@ -45,6 +45,7 @@ struct device;
> > * bitmap_copy(dst, src, nbits) *dst = *src
> > * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
> > * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
> > + * bitmap_or_weight(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst
> > * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
> > * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
> > * bitmap_complement(dst, src, nbits) *dst = ~(*src)
> > @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, co
> > const unsigned long *bitmap2, unsigned int nbits);
> > void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
> > const unsigned long *bitmap2, unsigned int nbits);
> > +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
> > + const unsigned long *bitmap2, unsigned int nbits);
> > void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
> > const unsigned long *bitmap2, unsigned int nbits);
> > bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
> > @@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const
> > }
> >
> > static __always_inline
> > +unsigned int bitmap_or_weight(unsigned long *dst, const unsigned long *src1,
> > + const unsigned long *src2, unsigned int nbits)
> > +{
> > + if (small_const_nbits(nbits)) {
> > + *dst = *src1 | *src2;
> > + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
> > + } else {
> > + return __bitmap_or_weight(dst, src1, src2, nbits);
> > + }
> > +}
> > +
> > +static __always_inline
> > void bitmap_xor(unsigned long *dst, const unsigned long *src1,
> > const unsigned long *src2, unsigned int nbits)
> > {
> > --- a/include/linux/cpumask.h
> > +++ b/include/linux/cpumask.h
> > @@ -729,6 +729,22 @@ void cpumask_or(struct cpumask *dstp, co
> > }
> >
> > /**
> > + * cpumask_or_weight - *dstp = *src1p | *src2p and return the weight of the result
> > + * @dstp: the cpumask result
> > + * @src1p: the first input
> > + * @src2p: the second input
> > + *
> > + * Return: The number of bits set in the resulting cpumask @dstp
> > + */
> > +static __always_inline
> > +unsigned int cpumask_or_weight(struct cpumask *dstp, const struct cpumask *src1p,
> > + const struct cpumask *src2p)
> > +{
> > + return bitmap_or_weight(cpumask_bits(dstp), cpumask_bits(src1p),
> > + cpumask_bits(src2p), small_cpumask_bits);
> > +}
> > +
> > +/**
> > * cpumask_xor - *dstp = *src1p ^ *src2p
> > * @dstp: the cpumask result
> > * @src1p: the first input
> > --- a/lib/bitmap.c
> > +++ b/lib/bitmap.c
> > @@ -253,6 +253,23 @@ void __bitmap_or(unsigned long *dst, con
> > }
> > EXPORT_SYMBOL(__bitmap_or);
> >
> > +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
> > + const unsigned long *bitmap2, unsigned int bits)
> > +{
> > + unsigned int k, w = 0;
> > +
> > + for (k = 0; k < bits / BITS_PER_LONG; k++) {
> > + dst[k] = bitmap1[k] | bitmap2[k];
> > + w += hweight_long(dst[k]);
> > + }
> > +
> > + if (bits % BITS_PER_LONG) {
> > + dst[k] = bitmap1[k] | bitmap2[k];
> > + w += hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits));
> > + }
> > + return w;
> > +}
>
> We've got bitmap_weight_and() and bitmap_weight_andnot() already. Can
> you align naming with the existing scheme: bitmap_weight_or().
>
> Also, for outline implementation, can you employ the BITMAP_WEIGHT()
> macro?
Ok, I see now. You want to do a regular cpumask_or(), but return the
hweight() of the result, instead of a boolean.
The cpumask_or_weight() may be really confused with cpumask_weight_or().
Can you try considering a different naming? (I seemingly can't come up with one.)
Can you describe the performance impact you've mentioned in the commit
message in more details?
Anyways, for the approach:
Acked-by: Yury Norov (NVIDIA) <yury.norov@gmail.com>
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 07/19] cpumask: Introduce cpumask_or_weight()
2025-10-15 18:06 ` Yury Norov
@ 2025-10-21 20:21 ` Thomas Gleixner
0 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-21 20:21 UTC (permalink / raw)
To: Yury Norov
Cc: LKML, Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Yury!
On Wed, Oct 15 2025 at 14:06, Yury Norov wrote:
> On Wed, Oct 15, 2025 at 01:41:50PM -0400, Yury Norov wrote:
> Ok, I see now. You want to do a regular cpumask_or(), but return the
> hweight() of the result, instead of a boolean.
>
> The cpumask_or_weight() may be really confused with cpumask_weight_or().
> Can you try considering a different naming? (I am seemingly can't.)
the only thing I came up with was cpumask_or_and_weight(), but that
sounded odd too. cpumask_or_and_calc_weight() perhaps.
> Can you describe the performance impact you've mentioned in the commit
> message in more details?
It's sparing the second loop with the related memory reads. It's about
10-20% faster for a 4k CPU mask (64 iterations) depending on the machine
I test on.
As this is invoked with runqueue lock held, there is definitely a desire
to spare as many cycles as possible.
Thanks,
tglx
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [patch 07/19] cpumask: Introduce cpumask_or_weight()
2025-10-15 17:41 ` Yury Norov
2025-10-15 18:06 ` Yury Norov
@ 2025-10-21 19:34 ` Thomas Gleixner
1 sibling, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-21 19:34 UTC (permalink / raw)
To: Yury Norov
Cc: LKML, Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Yury!
On Wed, Oct 15 2025 at 13:41, Yury Norov wrote:
> On Wed, Oct 15, 2025 at 07:29:36PM +0200, Thomas Gleixner wrote:
>> +unsigned int __bitmap_or_weight(unsigned long *dst, const unsigned long *bitmap1,
>> + const unsigned long *bitmap2, unsigned int bits)
>> +{
>> + unsigned int k, w = 0;
>> +
>> + for (k = 0; k < bits / BITS_PER_LONG; k++) {
>> + dst[k] = bitmap1[k] | bitmap2[k];
>> + w += hweight_long(dst[k]);
>> + }
>> +
>> + if (bits % BITS_PER_LONG) {
>> + dst[k] = bitmap1[k] | bitmap2[k];
>> + w += hweight_long(dst[k] & BITMAP_LAST_WORD_MASK(bits));
>> + }
>> + return w;
>> +}
>
> We've got bitmap_weight_and() and bitmap_weight_andnot() already. Can
> you align naming with the existing scheme: bitmap_weight_or().
That's not the same thing. bitmap_weight_and/not() calculate the weight
of the AND resp. ANDNOT of the two bitmaps w/o modifying them:
for (...)
w += hweight(map1[k] & map2[k]);
While the above does:
for (...) {
dst[k] = map1[k] | map2[k];
w += hweight(dst[k]);
}
The whole point of this as explained in the change log is to avoid
walking the resulting bitmap after doing the OR operation. The compiler
is clever enough to do the or operation in a register, write it to dst
and then do the hweight calculation with it.
> Also, for outline implementation, can you employ the BITMAP_WEIGHT()
> macro?
If you insist on this ugly:
return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits);
Sure.
Thanks,
tglx
^ permalink raw reply [flat|nested] 39+ messages in thread
* [patch 08/19] sched/mmcid: Use cpumask_or_weight()
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (6 preceding siblings ...)
2025-10-15 17:29 ` [patch 07/19] cpumask: Introduce cpumask_or_weight() Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 09/19] sched/mmcid: Convert mm CID mask to a bitmap Thomas Gleixner
` (12 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Use cpumask_or_weight() instead of cpumask_or() and cpumask_weight() on the
result, which walks the same bitmap twice.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/sched/core.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10409,6 +10409,7 @@ void call_trace_sched_update_nr_running(
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
struct cpumask *mm_allowed;
+ unsigned int weight;
if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids)
return;
@@ -10422,8 +10423,8 @@ static inline void mm_update_cpus_allowe
if (mm->mm_cid.nr_cpus_allowed == nr_cpu_ids)
return;
mm_allowed = mm_cpus_allowed(mm);
- cpumask_or(mm_allowed, mm_allowed, affmsk);
- WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed));
+ weight = cpumask_or_weight(mm_allowed, mm_allowed, affmsk);
+ WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
}
void sched_mm_cid_exit_signals(struct task_struct *t)
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 09/19] sched/mmcid: Convert mm CID mask to a bitmap
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (7 preceding siblings ...)
2025-10-15 17:29 ` [patch 08/19] sched/mmcid: Use cpumask_or_weight() Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 10/19] signal: Move MMCID exit out of sighand lock Thomas Gleixner
` (11 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
This is truly a bitmap and just conveniently uses a cpumask because the
maximum size of the bitmap is nr_cpu_ids.
But that prevents doing searches for a zero bit in a limited range, which
is helpful to provide an efficient mechanism to consolidate the CID space
when the number of users decreases.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/mm_types.h | 6 +++---
kernel/sched/core.c | 2 +-
kernel/sched/sched.h | 6 +++---
3 files changed, 7 insertions(+), 7 deletions(-)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1296,13 +1296,13 @@ static inline cpumask_t *mm_cpus_allowed
}
/* Accessor for struct mm_struct's cidmask. */
-static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+static inline unsigned long *mm_cidmask(struct mm_struct *mm)
{
unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm);
/* Skip mm_cpus_allowed */
cid_bitmap += cpumask_size();
- return (struct cpumask *)cid_bitmap;
+ return (unsigned long *)cid_bitmap;
}
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
@@ -1317,7 +1317,7 @@ static inline void mm_init_cid(struct mm
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
raw_spin_lock_init(&mm->mm_cid.lock);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
- cpumask_clear(mm_cidmask(mm));
+ bitmap_zero(mm_cidmask(mm), nr_cpu_ids);
}
static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10437,7 +10437,7 @@ void sched_mm_cid_exit_signals(struct ta
guard(preempt)();
t->mm_cid.active = 0;
if (t->mm_cid.cid != MM_CID_UNSET) {
- cpumask_clear_cpu(t->mm_cid.cid, mm_cidmask(mm));
+ clear_bit(t->mm_cid.cid, mm_cidmask(mm));
t->mm_cid.cid = MM_CID_UNSET;
}
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3553,7 +3553,7 @@ static inline bool __mm_cid_get(struct t
if (cid >= max_cids)
return false;
- if (cpumask_test_and_set_cpu(cid, mm_cidmask(mm)))
+ if (test_and_set_bit(cid, mm_cidmask(mm)))
return false;
t->mm_cid.cid = t->mm_cid.last_cid = cid;
__this_cpu_write(mm->mm_cid.pcpu->cid, cid);
@@ -3576,7 +3576,7 @@ static inline bool mm_cid_get(struct tas
return true;
/* Try the first zero bit in the cidmask. */
- return __mm_cid_get(t, cpumask_first_zero(mm_cidmask(mm)), max_cids);
+ return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), nr_cpu_ids), max_cids);
}
static inline void mm_cid_select(struct task_struct *t)
@@ -3597,7 +3597,7 @@ static inline void switch_mm_cid(struct
{
if (prev->mm_cid.active) {
if (prev->mm_cid.cid != MM_CID_UNSET)
- cpumask_clear_cpu(prev->mm_cid.cid, mm_cidmask(prev->mm));
+ clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
prev->mm_cid.cid = MM_CID_UNSET;
}
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 10/19] signal: Move MMCID exit out of sighand lock
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (8 preceding siblings ...)
2025-10-15 17:29 ` [patch 09/19] sched/mmcid: Convert mm CID mask to a bitmap Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 11/19] sched/mmcid: Move initialization out of line Thomas Gleixner
` (10 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
There is no need anymore to keep this under sighand lock as the current
code and the upcoming replacement are not depending on the exit state of a
task anymore.
That allows to use a mutex in the exit path.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/sched.h | 4 ++--
kernel/exit.c | 1 +
kernel/sched/core.c | 4 ++--
kernel/signal.c | 2 --
4 files changed, 5 insertions(+), 6 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2290,7 +2290,7 @@ static __always_inline void alloc_tag_re
void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
-void sched_mm_cid_exit_signals(struct task_struct *t);
+void sched_mm_cid_exit(struct task_struct *t);
static inline int task_mm_cid(struct task_struct *t)
{
return t->mm_cid.cid;
@@ -2299,7 +2299,7 @@ static inline int task_mm_cid(struct tas
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
-static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline void sched_mm_cid_exit(struct task_struct *t) { }
static inline int task_mm_cid(struct task_struct *t)
{
/*
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -905,6 +905,7 @@ void __noreturn do_exit(long code)
user_events_exit(tsk);
io_uring_files_cancel();
+ sched_mm_cid_exit(tsk);
exit_signals(tsk); /* sets PF_EXITING */
seccomp_filter_release(tsk);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10427,7 +10427,7 @@ static inline void mm_update_cpus_allowe
WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
}
-void sched_mm_cid_exit_signals(struct task_struct *t)
+void sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
@@ -10445,7 +10445,7 @@ void sched_mm_cid_exit_signals(struct ta
/* Deactivate MM CID allocation across execve() */
void sched_mm_cid_before_execve(struct task_struct *t)
{
- sched_mm_cid_exit_signals(t);
+ sched_mm_cid_exit(t);
}
/* Reactivate MM CID after successful execve() */
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *ts
cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
- sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *ts
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
- sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 11/19] sched/mmcid: Move initialization out of line
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (9 preceding siblings ...)
2025-10-15 17:29 ` [patch 10/19] signal: Move MMCID exit out of sighand lock Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 12/19] sched/mmcid: Provide precomputed maximal value Thomas Gleixner
` (9 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
It's getting bigger soon, so just move it out of line to the rest of the
code.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/mm_types.h | 15 +--------------
kernel/sched/core.c | 14 ++++++++++++++
2 files changed, 15 insertions(+), 14 deletions(-)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1305,20 +1305,7 @@ static inline unsigned long *mm_cidmask(
return (unsigned long *)cid_bitmap;
}
-static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
-{
- int i;
-
- for_each_possible_cpu(i) {
- struct mm_cid_pcpu *pcpu = per_cpu_ptr(mm->mm_cid.pcpu, i);
-
- pcpu->cid = MM_CID_UNSET;
- }
- mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
- raw_spin_lock_init(&mm->mm_cid.lock);
- cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
- bitmap_zero(mm_cidmask(mm), nr_cpu_ids);
-}
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p);
static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
{
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10466,6 +10466,20 @@ void sched_mm_cid_fork(struct task_struc
WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
t->mm_cid.active = 1;
}
+
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
+ struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
+
+ mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ raw_spin_lock_init(&mm->mm_cid.lock);
+ cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+ bitmap_zero(mm_cidmask(mm), nr_cpu_ids);
+}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
#endif /* !CONFIG_SCHED_MM_CID */
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 12/19] sched/mmcid: Provide precomputed maximal value
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (10 preceding siblings ...)
2025-10-15 17:29 ` [patch 11/19] sched/mmcid: Move initialization out of line Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 13/19] sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex Thomas Gleixner
` (8 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Reading mm::mm_users and mm::mm_cid::nr_cpus_allowed every time to compute
the maximal CID value is just wasteful as that value is only changing on
fork(), exit() and eventually when the affinity changes.
So it can be easily precomputed at those points and provided in mm::mm_cid
for consumption in the hot path.
But there is an issue with using mm::mm_users for accounting because that
does not necessarily reflect the number of user space tasks as other kernel
code can take temporary references on the MM which skew the picture.
Solve that by adding a users counter to struct mm_mm_cid, which is modified
by fork() and exit() and used for precomputing under mm_mm_cid::lock.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq_types.h | 6 ++++
kernel/fork.c | 1
kernel/sched/core.c | 66 ++++++++++++++++++++++++++++++++-------------
kernel/sched/sched.h | 3 --
4 files changed, 56 insertions(+), 20 deletions(-)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -117,14 +117,20 @@ struct mm_cid_pcpu {
/**
* struct mm_mm_cid - Storage for per MM CID data
* @pcpu: Per CPU storage for CIDs associated to a CPU
+ * @max_cids: The exclusive maximum CID value for allocation and convergence
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
* is growth only.
+ * @users: The number of tasks sharing this MM. Separate from mm::mm_users
+ * as that is modified by mmget()/mm_put() by other entities which
+ * do not actually share the MM.
* @lock: Spinlock to protect all fields except @pcpu. It also protects
* the MM cid cpumask and the MM cidmask bitmap.
*/
struct mm_mm_cid {
struct mm_cid_pcpu __percpu *pcpu;
+ unsigned int max_cids;
unsigned int nr_cpus_allowed;
+ unsigned int users;
raw_spinlock_t lock;
}____cacheline_aligned_in_smp;
#else /* CONFIG_SCHED_MM_CID */
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2449,6 +2449,7 @@ static bool need_futex_hash_allocate_def
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
+ sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4517,7 +4517,6 @@ static void __sched_fork(unsigned long c
init_numa_balancing(clone_flags, p);
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
- init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -10403,15 +10402,32 @@ void call_trace_sched_update_nr_running(
#ifdef CONFIG_SCHED_MM_CID
/*
- * When a task exits, the MM CID held by the task is not longer required as
- * the task cannot return to user space.
+ * Update the CID range properties when the constraints change. Invoked via
+ * fork(), exit() and affinity changes
*/
+static void mm_update_max_cids(struct mm_struct *mm)
+{
+ struct mm_mm_cid *mc = &mm->mm_cid;
+ unsigned int max_cids;
+
+ lockdep_assert_held(&mm->mm_cid.lock);
+
+ /* Calculate the new maximum constraint */
+ max_cids = min(mc->nr_cpus_allowed, mc->users);
+ WRITE_ONCE(mc->max_cids, max_cids);
+}
+
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
struct cpumask *mm_allowed;
unsigned int weight;
- if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids)
+ /*
+ * Nothing to do when the mask is already maxed out or the user
+ * count dropped to zero.
+ */
+ if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids ||
+ !READ_ONCE(mm->mm_cid.users))
return;
/*
@@ -10420,13 +10436,34 @@ static inline void mm_update_cpus_allowe
*/
guard(raw_spinlock)(&mm->mm_cid.lock);
/* Check again under the lock */
- if (mm->mm_cid.nr_cpus_allowed == nr_cpu_ids)
+ if (mm->mm_cid.nr_cpus_allowed == nr_cpu_ids || !mm->mm_cid.users)
return;
mm_allowed = mm_cpus_allowed(mm);
weight = cpumask_or_weight(mm_allowed, mm_allowed, affmsk);
+ if (weight == mm->mm_cid.nr_cpus_allowed)
+ return;
WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
+ mm_update_max_cids(mm);
}
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+
+ WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+
+ guard(raw_spinlock)(&mm->mm_cid.lock);
+ t->mm_cid.active = 1;
+ mm->mm_cid.users++;
+ /* Preset last_cid for mm_cid_select() */
+ t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1;
+ mm_update_max_cids(mm);
+}
+
+/*
+ * When a task exits, the MM CID held by the task is no longer required as
+ * the task cannot return to user space.
+ */
void sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
@@ -10434,12 +10471,14 @@ void sched_mm_cid_exit(struct task_struc
if (!mm || !t->mm_cid.active)
return;
- guard(preempt)();
+ guard(raw_spinlock)(&mm->mm_cid.lock);
t->mm_cid.active = 0;
+ mm->mm_cid.users--;
if (t->mm_cid.cid != MM_CID_UNSET) {
clear_bit(t->mm_cid.cid, mm_cidmask(mm));
t->mm_cid.cid = MM_CID_UNSET;
}
+ mm_update_max_cids(mm);
}
/* Deactivate MM CID allocation across execve() */
@@ -10451,22 +10490,11 @@ void sched_mm_cid_before_execve(struct t
/* Reactivate MM CID after successful execve() */
void sched_mm_cid_after_execve(struct task_struct *t)
{
- struct mm_struct *mm = t->mm;
-
- if (!mm)
- return;
-
+ sched_mm_cid_fork(t);
guard(preempt)();
- t->mm_cid.active = 1;
mm_cid_select(t);
}
-void sched_mm_cid_fork(struct task_struct *t)
-{
- WARN_ON_ONCE(!t->mm || t->mm_cid.cid != MM_CID_UNSET);
- t->mm_cid.active = 1;
-}
-
void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
@@ -10475,7 +10503,9 @@ void mm_init_cid(struct mm_struct *mm, s
for_each_possible_cpu(cpu)
per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
+ mm->mm_cid.max_cids = 0;
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ mm->mm_cid.users = 0;
raw_spin_lock_init(&mm->mm_cid.lock);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
bitmap_zero(mm_cidmask(mm), nr_cpu_ids);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3565,7 +3565,7 @@ static inline bool mm_cid_get(struct tas
struct mm_struct *mm = t->mm;
unsigned int max_cids;
- max_cids = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
/* Try to reuse the last CID of this task */
if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
@@ -3608,7 +3608,6 @@ static inline void switch_mm_cid(struct
}
#else /* !CONFIG_SCHED_MM_CID: */
-static inline void init_sched_mm_cid(struct task_struct *t) { }
static inline void mm_cid_select(struct task_struct *t) { }
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 13/19] sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (11 preceding siblings ...)
2025-10-15 17:29 ` [patch 12/19] sched/mmcid: Provide precomputed maximal value Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 14/19] sched/mmcid: Introduce per task/CPU ownership infrastructure Thomas Gleixner
` (7 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Prepare for the new CID management scheme which puts the CID ownership
transition into the fork() and exit() slow path by serializing
sched_mm_cid_fork()/exit() with it, so task list and cpu mask walks can be
done in interruptible and preemptible code.
The contention on it is not worse than on other concurrency controls in the
fork()/exit() machinery.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq_types.h | 2 ++
kernel/sched/core.c | 22 ++++++++++++++++++++++
2 files changed, 24 insertions(+)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -125,6 +125,7 @@ struct mm_cid_pcpu {
* do not actually share the MM.
* @lock: Spinlock to protect all fields except @pcpu. It also protects
* the MM cid cpumask and the MM cidmask bitmap.
+ * @mutex: Mutex to serialize forks and exits related to this mm
*/
struct mm_mm_cid {
struct mm_cid_pcpu __percpu *pcpu;
@@ -132,6 +133,7 @@ struct mm_mm_cid {
unsigned int nr_cpus_allowed;
unsigned int users;
raw_spinlock_t lock;
+ struct mutex mutex;
}____cacheline_aligned_in_smp;
#else /* CONFIG_SCHED_MM_CID */
struct mm_cid { };
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10402,6 +10402,25 @@ void call_trace_sched_update_nr_running(
#ifdef CONFIG_SCHED_MM_CID
/*
+ * Concurrency IDentifier management
+ *
+ * Serialization rules:
+ *
+ * mm::mm_cid::mutex: Serializes fork() and exit() and therefore
+ * protects mm::mm_cid::users.
+ *
+ * mm::mm_cid::lock: Serializes mm_update_max_cids() and
+ * mm_update_cpus_allowed(). Nests in mm_cid::mutex
+ * and runqueue lock.
+ *
+ * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks
+ * and can only be modified with atomic operations.
+ *
+ * The mm::mm_cid::pcpu per CPU storage is protected by the CPU's runqueue
+ * lock.
+ */
+
+/*
* Update the CID range properties when the constraints change. Invoked via
* fork(), exit() and affinity changes
*/
@@ -10452,6 +10471,7 @@ void sched_mm_cid_fork(struct task_struc
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+ guard(mutex)(&mm->mm_cid.mutex);
guard(raw_spinlock)(&mm->mm_cid.lock);
t->mm_cid.active = 1;
mm->mm_cid.users++;
@@ -10471,6 +10491,7 @@ void sched_mm_cid_exit(struct task_struc
if (!mm || !t->mm_cid.active)
return;
+ guard(mutex)(&mm->mm_cid.mutex);
guard(raw_spinlock)(&mm->mm_cid.lock);
t->mm_cid.active = 0;
mm->mm_cid.users--;
@@ -10507,6 +10528,7 @@ void mm_init_cid(struct mm_struct *mm, s
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
mm->mm_cid.users = 0;
raw_spin_lock_init(&mm->mm_cid.lock);
+ mutex_init(&mm->mm_cid.mutex);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
bitmap_zero(mm_cidmask(mm), nr_cpu_ids);
}
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 14/19] sched/mmcid: Introduce per task/CPU ownership infrastructure
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (12 preceding siblings ...)
2025-10-15 17:29 ` [patch 13/19] sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 15/19] sched/mmcid: Provide new scheduler CID mechanism Thomas Gleixner
` (6 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
The MM CID management has two fundamental requirements:
1) It has to guarantee that at no given point in time the same CID is
used by concurrent tasks in userspace.
2) The CID space must not exceed the number of possible CPUs in a
system. While most allocators (glibc, tcmalloc, jemalloc) do not
care about that, there seems to be at least some LTTng library
depending on it.
The CID space compaction itself is not a functional correctness
requirement, it is only a useful optimization mechanism to reduce the
memory foot print in unused user space pools.
The optimal CID space is:
min(nr_tasks, nr_cpus_allowed);
Where @nr_tasks is the number of actual user space threads associated to
the mm and @nr_cpus_allowed is the superset of all task affinities. It is
growth only as it would be insane to take a racy snapshot of all task
affinities when the affinity of one task changes just to redo it 2
milliseconds later when the next task changes its affinity.
That means that as long as the number of tasks is lower or equal than the
number of CPUs allowed, each task owns a CID. If the number of tasks
exceeds the number of CPUs allowed it switches to per CPU mode, where the
CPUs own the CIDs and the tasks borrow them as long as they are scheduled
in.
For transition periods CIDs can go beyond the optimal space as long as they
don't go beyond the number of possible CPUs.
The current upstream implementation tries to keep the CID with the task
even in overcommit situations, which complicates task migration. It also
has to do the CID space consolidation work from a task work in the exit to
user space path. As that work is assigned to a random task related to a MM
this can inflict unwanted exit latencies.
This can be done differently by implementing a strict CID ownership
mechanism. Either the CIDs are owned by the tasks or by the CPUs. The
latter provides less locality when tasks are heavily migrating, but there
is no justification to optimize for overcommit scenarios and thereby
penalizing everyone else.
Provide the basic infrastructure to implement this:
- Change the UNSET marker to BIT(31) from ~0U
- Add the ONCPU marker as BIT(30)
That allows checking for ownership trivially and provides a simple check for
UNSET as well.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq_types.h | 3 ++-
include/linux/sched.h | 6 +++---
kernel/sched/core.c | 7 +++++++
kernel/sched/sched.h | 44 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 56 insertions(+), 4 deletions(-)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -92,7 +92,8 @@ struct rseq_data { };
#ifdef CONFIG_SCHED_MM_CID
-#define MM_CID_UNSET (~0U)
+#define MM_CID_UNSET BIT(31)
+#define MM_CID_ONCPU BIT(30)
/**
* struct sched_mm_cid - Storage for per task MM CID data
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2291,16 +2291,16 @@ void sched_mm_cid_before_execve(struct t
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit(struct task_struct *t);
-static inline int task_mm_cid(struct task_struct *t)
+static __always_inline int task_mm_cid(struct task_struct *t)
{
- return t->mm_cid.cid;
+ return t->mm_cid.cid & ~MM_CID_ONCPU;
}
#else
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit(struct task_struct *t) { }
-static inline int task_mm_cid(struct task_struct *t)
+static __always_inline int task_mm_cid(struct task_struct *t)
{
/*
* Use the processor id as a fall-back when the mm cid feature is
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10418,6 +10418,13 @@ void call_trace_sched_update_nr_running(
*
* The mm::mm_cid::pcpu per CPU storage is protected by the CPU's runqueue
* lock.
+ *
+ * CID ownership:
+ *
+ * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
+ * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
+ * MM_CID_ONCPU bit set. This bit is filtered out by task_mm_cid() when it
+ * is actually handed over to user space in the RSEQ memory.
*/
/*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3534,6 +3534,50 @@ extern void sched_dynamic_update(int mod
extern const char *preempt_modes[];
#ifdef CONFIG_SCHED_MM_CID
+
+static __always_inline bool cid_on_cpu(unsigned int cid)
+{
+ return cid & MM_CID_ONCPU;
+}
+
+static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_ONCPU;
+}
+
+static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid)
+{
+ return cid | MM_CID_ONCPU;
+}
+
+static __always_inline bool cid_on_task(unsigned int cid)
+{
+ /* True if neither MM_CID_ONCPU nor MM_CID_UNSET set */
+ return cid < MM_CID_ONCPU;
+}
+
+static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid)
+{
+ clear_bit(cid, mm_cidmask(mm));
+}
+
+static __always_inline void mm_unset_cid_on_task(struct task_struct *t)
+{
+ unsigned int cid = t->mm_cid.cid;
+
+ t->mm_cid.cid = MM_CID_UNSET;
+ if (cid_on_task(cid))
+ mm_drop_cid(t->mm, cid);
+}
+
+static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp)
+{
+ /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */
+ pcp->cid = cpu_cid_to_cid(pcp->cid);
+ mm_drop_cid(mm, pcp->cid);
+}
+
+/* Active implementation */
static inline void init_sched_mm_cid(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 15/19] sched/mmcid: Provide new scheduler CID mechanism
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (13 preceding siblings ...)
2025-10-15 17:29 ` [patch 14/19] sched/mmcid: Introduce per task/CPU ownership infrastructure Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions Thomas Gleixner
` (5 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
The MM CID management has two fundamental requirements:
1) It has to guarantee that at no given point in time the same CID is
used by concurrent tasks in userspace.
2) The CID space must not exceed the number of possible CPUs in a
system. While most allocators (glibc, tcmalloc, jemalloc) do not
care about that, there seems to be at least some LTTng library
depending on it.
The CID space compaction itself is not a functional correctness
requirement, it is only a useful optimization mechanism to reduce the
memory foot print in unused user space pools.
The optimal CID space is:
min(nr_tasks, nr_cpus_allowed);
Where @nr_tasks is the number of actual user space threads associated to
the mm and @nr_cpus_allowed is the superset of all task affinities. It is
growth only as it would be insane to take a racy snapshot of all task
affinities when the affinity of one task changes just to redo it 2
milliseconds later when the next task changes its affinity.
That means that as long as the number of tasks is lower or equal than the
number of CPUs allowed, each task owns a CID. If the number of tasks
exceeds the number of CPUs allowed it switches to per CPU mode, where the
CPUs own the CIDs and the tasks borrow them as long as they are scheduled
in.
For transition periods CIDs can go beyond the optimal space as long as they
don't go beyond the number of possible CPUs.
The current upstream implementation tries to keep the CID with the task
even in overcommit situations, which complicates task migration. It also
has to do the CID space consolidation work from a task work in the exit to
user space path. As that work is assigned to a random task related to a MM
this can inflict unwanted exit latencies.
Implement the context switch parts of a strict ownership mechanism to
address this.
This removes all work from a task which schedules out. That's a benefit as
tasks which schedule out have the related shared mm:mm_cid data and the per
CPU storage cache cold when the task has a big enough cache foot print
while doing work in user space as perf top clearly shows.
The task which schedules in has to check whether:
1) The ownership mode changed
2) The CID is within the optimal CID space
In stable situations this results in zero work. The only short disruption
is when ownership mode changes or when the associated CID is not in the
optimal CID space. The latter only happens when tasks exit and therefore
the optimal CID space shrinks.
That mechanism is strictly optimized for the common case where no change
happens. The only case where it actually causes a temporary one time spike
is on mode changes when and only when a lot of tasks related to a MM
schedule exactly at the same time and have eventually to compete on
allocating a CID from the bitmap.
In the sysbench test case which triggered the spinlock contention in the
initial CID code, __schedule() drops significantly in perf top on a 128
Core (256 threads) machine when running sysbench with 255 threads, which
fits into the task mode limit of 256 together with the parent thread:
Upstream rseq/perf branch +CID rework
0.42% 0.37% 0.32% [k] __schedule
Increasing the number of threads to 256, which puts the test process into
per CPU mode looks about the same.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq.h | 8 +-
include/linux/rseq_types.h | 3 +
kernel/sched/core.c | 1
kernel/sched/sched.h | 130 ++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 137 insertions(+), 5 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -71,13 +71,13 @@ static __always_inline void rseq_sched_s
}
/*
- * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
- * update.
+ * Invoked from __set_task_cpu() when a task migrates or from
+ * mm_cid_schedin() when the CID changes to enforce an IDs update.
*
* This does not raise TIF_NOTIFY_RESUME as that happens in
* rseq_sched_switch_event().
*/
-static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
+static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
t->rseq.event.ids_changed = true;
}
@@ -176,7 +176,7 @@ static inline void rseq_fork(struct task
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
-static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
+static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -118,6 +118,7 @@ struct mm_cid_pcpu {
/**
* struct mm_mm_cid - Storage for per MM CID data
* @pcpu: Per CPU storage for CIDs associated to a CPU
+ * @percpu: Set, when CIDs are in per CPU mode
* @max_cids: The exclusive maximum CID value for allocation and convergence
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
* is growth only.
@@ -129,7 +130,9 @@ struct mm_cid_pcpu {
* @mutex: Mutex to serialize forks and exits related to this mm
*/
struct mm_mm_cid {
+ /* Hotpath read mostly members */
struct mm_cid_pcpu __percpu *pcpu;
+ unsigned int percpu;
unsigned int max_cids;
unsigned int nr_cpus_allowed;
unsigned int users;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10532,6 +10532,7 @@ void mm_init_cid(struct mm_struct *mm, s
per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
mm->mm_cid.max_cids = 0;
+ mm->mm_cid.percpu = 0;
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
mm->mm_cid.users = 0;
raw_spin_lock_init(&mm->mm_cid.lock);
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2206,7 +2206,7 @@ static inline void __set_task_cpu(struct
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
- rseq_sched_set_task_cpu(p, cpu);
+ rseq_sched_set_ids_changed(p);
#endif /* CONFIG_SMP */
}
@@ -3577,6 +3577,134 @@ static __always_inline void mm_drop_cid_
mm_drop_cid(mm, pcp->cid);
}
+static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
+{
+ unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids);
+
+ if (cid >= max_cids)
+ return MM_CID_UNSET;
+ if (test_and_set_bit(cid, mm_cidmask(mm)))
+ return MM_CID_UNSET;
+ return cid;
+}
+
+static inline unsigned int mm_get_cid(struct mm_struct *mm)
+{
+ unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
+
+ for (; cid == MM_CID_UNSET; cpu_relax())
+ cid = __mm_get_cid(mm, nr_cpu_ids);
+
+ return cid;
+}
+
+static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid,
+ unsigned int max_cids)
+{
+ unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid);
+
+ /* Is it in the optimal CID space? */
+ if (likely(cid < max_cids))
+ return orig_cid;
+
+ /* Try to find one in the optimal space. Otherwise keep the provided. */
+ new_cid = __mm_get_cid(mm, max_cids);
+ if (new_cid != MM_CID_UNSET) {
+ mm_drop_cid(mm, cid);
+ /* Preserve the ONCPU mode of the original CID */
+ return new_cid | (orig_cid & MM_CID_ONCPU);
+ }
+ return orig_cid;
+}
+
+static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid)
+{
+ if (t->mm_cid.cid != cid) {
+ t->mm_cid.cid = cid;
+ rseq_sched_set_ids_changed(t);
+ }
+}
+
+static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+{
+ __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
+}
+
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+{
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case where both have the ONCPU bit set */
+ if (likely(cid_on_cpu(cpu_cid & tcid))) {
+ if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) {
+ mm_cid_update_task_cid(t, cpu_cid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids);
+ } else {
+ /* Hand over or drop the task owned CID */
+ if (cid_on_task(tcid)) {
+ if (cid_on_cpu(cpu_cid))
+ mm_unset_cid_on_task(t);
+ else
+ cpu_cid = cid_to_cpu_cid(tcid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_cpu(cpu_cid))
+ cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+ }
+ mm_cid_update_pcpu_cid(mm, cpu_cid);
+ mm_cid_update_task_cid(t, cpu_cid);
+}
+
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+{
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case, where both have the ONCPU bit clear */
+ if (likely(cid_on_task(tcid | cpu_cid))) {
+ if (likely(tcid < max_cids)) {
+ mm_cid_update_pcpu_cid(mm, tcid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ tcid = mm_cid_converge(mm, tcid, max_cids);
+ } else {
+ /* Hand over or drop the CPU owned CID */
+ if (cid_on_cpu(cpu_cid)) {
+ if (cid_on_task(tcid))
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+ else
+ tcid = cpu_cid_to_cid(cpu_cid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_task(tcid))
+ tcid = mm_get_cid(mm);
+ }
+ mm_cid_update_pcpu_cid(mm, tcid);
+ mm_cid_update_task_cid(t, tcid);
+}
+
+static __always_inline void mm_cid_schedin(struct task_struct *next)
+{
+ struct mm_struct *mm = next->mm;
+ unsigned int cpu_cid;
+
+ if (!next->mm_cid.active)
+ return;
+
+ cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+ if (likely(!READ_ONCE(mm->mm_cid.percpu)))
+ mm_cid_from_task(next, cpu_cid);
+ else
+ mm_cid_from_cpu(next, cpu_cid);
+}
+
/* Active implementation */
static inline void init_sched_mm_cid(struct task_struct *t)
{
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (14 preceding siblings ...)
2025-10-15 17:29 ` [patch 15/19] sched/mmcid: Provide new scheduler CID mechanism Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-20 6:34 ` Thomas Gleixner
` (3 more replies)
2025-10-15 17:29 ` [patch 17/19] irqwork: Move data struct to a types header Thomas Gleixner
` (4 subsequent siblings)
20 siblings, 4 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
CIDs are either owned by tasks or by CPUs. The ownership mode depends on
the number of tasks related to a MM and the number of CPUs on which these
tasks are theoretically allowed to run on. Theoretically because that
number is the superset of CPU affinities of all tasks which only grows and
never shrinks.
Switching to per CPU mode happens when the user count becomes greater than
the maximum number of CIDs, which is calculated by:
opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
max_cids = min(1.25 * opt_cids, nr_cpu_ids);
The +25% allowance is useful for tight CPU masks in scenarios where only a
few threads are created and destroyed to avoid frequent mode
switches. Though this allowance shrinks, the closer opt_cids becomes to
nr_cpu_ids, which is the (unfortunate) hard ABI limit.
At the point of switching to per CPU mode the new user is not yet visible
in the system, so the task which initiated the fork() runs the fixup
function: mm_cid_fixup_tasks_to_cpus() walks the thread list and either
transfers each task's owned CID to the CPU the task runs on or drops it into
the CID pool if a task is not on a CPU at that point in time. Tasks which
schedule in before the task walk reaches them do the handover in
mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes it's
guaranteed that no task related to that MM owns a CID anymore.
Switching back to task mode happens when the user count goes below the
threshold which was recorded on the per CPU mode switch:
pcpu_thrs = min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2);
This threshold is updated when an affinity change increases the number of
allowed CPUs for the MM, which might cause a switch back to per task mode.
If the switch back was initiated by an exiting task, then that task runs the
fixup function. If it was initiated by an affinity change, then it's run
either in the deferred update function in context of a workqueue or by a
task which forks a new one or by a task which exits. Whatever happens
first. mm_cid_fixup_cpus_to_tasks() walks through the possible CPUs and
either transfers the CPU owned CIDs to a related task which runs on the CPU
or drops it into the pool. Tasks which schedule in on a CPU which the walk
did not cover yet do the handover themselves.
As the goal is to avoid serialization of the scheduler hotpath, this
requires that the switch back threshold is maximally nr_cpu_ids / 2.
Otherwise the CID space might become exhausted when tasks are scheduled in
on CPUs which already transferred ownership before the fixup function was
able to free or transfer enough CIDs. That would result in a live lock
because the task loops in mm_get_cid() with runqueue lock held and the
fixup function is stuck on that runqueue lock.
When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
related to that MM is owned by a CPU anymore.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq_types.h | 10 +
kernel/sched/core.c | 251 ++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 234 insertions(+), 27 deletions(-)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -125,8 +125,9 @@ struct mm_cid_pcpu {
* @users: The number of tasks sharing this MM. Separate from mm::mm_users
* as that is modified by mmget()/mm_put() by other entities which
* do not actually share the MM.
- * @lock: Spinlock to protect all fields except @pcpu. It also protects
- * the MM cid cpumask and the MM cidmask bitmap.
+ * @pcpu_thrs: Threshold for switching back from per CPU mode
+ * @update_deferred: A deferred switch back to per task mode is pending.
+ * @lock: Spinlock to protect against affinity setting which can't take @mutex
* @mutex: Mutex to serialize forks and exits related to this mm
*/
struct mm_mm_cid {
@@ -134,8 +135,13 @@ struct mm_mm_cid {
struct mm_cid_pcpu __percpu *pcpu;
unsigned int percpu;
unsigned int max_cids;
+
+ /* Low frequency modified */
unsigned int nr_cpus_allowed;
unsigned int users;
+ unsigned int pcpu_thrs;
+ unsigned int update_deferred;
+
raw_spinlock_t lock;
struct mutex mutex;
}____cacheline_aligned_in_smp;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10425,27 +10425,116 @@ void call_trace_sched_update_nr_running(
* by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
* MM_CID_ONCPU bit set. This bit is filtered out by task_mm_cid() when it
* is actually handed over to user space in the RSEQ memory.
+ *
+ * Mode switching:
+ *
+ * Switching to per CPU mode happens when the user count becomes greater
+ * than the maximum number of CIDs, which is calculated by:
+ *
+ * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
+ * max_cids = min(1.25 * opt_cids, nr_cpu_ids);
+ *
+ * The +25% allowance is useful for tight CPU masks in scenarios where only
+ * a few threads are created and destroyed to avoid frequent mode
+ * switches. Though this allowance shrinks, the closer opt_cids becomes to
+ * nr_cpu_ids, which is the (unfortunate) hard ABI limit.
+ *
+ * At the point of switching to per CPU mode the new user is not yet
+ * visible in the system, so the task which initiated the fork() runs the
+ * fixup function: mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either transfers each task's owned CID to the CPU the task runs on or
+ * drops it into the CID pool if a task is not on a CPU at that point in
+ * time. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
+ * it's guaranteed that no task related to that MM owns a CID anymore.
+ *
+ * Switching back to task mode happens when the user count goes below the
+ * threshold which was recorded on the per CPU mode switch:
+ *
+ * pcpu_thrs = min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2);
+ *
+ * This threshold is updated when an affinity change increases the number of
+ * allowed CPUs for the MM, which might cause a switch back to per task
+ * mode.
+ *
+ * If the switch back was initiated by an exiting task, then that task runs
+ * the fixup function. If it was initiated by an affinity change, then it's
+ * run either in the deferred update function in context of a workqueue or
+ * by a task which forks a new one or by a task which exits. Whatever
+ * happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible
+ * CPUs and either transfers the CPU owned CIDs to a related task which
+ * runs on the CPU or drops it into the pool. Tasks which schedule in on a
+ * CPU which the walk did not cover yet do the handover themselves.
+ *
+ * As the goal is to avoid serialization of the scheduler hotpath, this
+ * requires that the switch back threshold is maximally nr_cpu_ids / 2.
+ * Otherwise the CID space might become exhausted when tasks are scheduled
+ * in on CPUs which already transferred ownership before the fixup function
+ * was able to free or transfer enough CIDs. That would result in a live
+ * lock because the task loops in mm_get_cid() with runqueue lock held and
+ * the fixup function is stuck on that runqueue lock.
+ *
+ * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
+ * related to that MM is owned by a CPU anymore.
*/
/*
* Update the CID range properties when the constraints change. Invoked via
* fork(), exit() and affinity changes
*/
-static void mm_update_max_cids(struct mm_struct *mm)
+static void __mm_update_max_cids(struct mm_mm_cid *mc)
+{
+ unsigned int opt_cids, max_cids;
+
+ /* Calculate the new optimal constraint */
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
+
+ /* Adjust the maximum CIDs to +25% limited by nr_cpu_ids */
+ max_cids = min(opt_cids + (opt_cids / 4), nr_cpu_ids);
+ WRITE_ONCE(mc->max_cids, max_cids);
+}
+
+static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+{
+ unsigned int opt_cids;
+
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
+ /* See mode switch documentation above! */
+ return min(opt_cids - opt_cids / 4, nr_cpu_ids / 2);
+}
+
+static bool mm_update_max_cids(struct mm_struct *mm)
{
struct mm_mm_cid *mc = &mm->mm_cid;
- unsigned int max_cids;
lockdep_assert_held(&mm->mm_cid.lock);
- /* Calculate the new maximum constraint */
- max_cids = min(mc->nr_cpus_allowed, mc->users);
- WRITE_ONCE(mc->max_cids, max_cids);
+ /* Clear deferred mode switch flag. A change is handled by the caller */
+ mc->update_deferred = false;
+ __mm_update_max_cids(mc);
+
+ /* Check whether owner mode must be changed */
+ if (!mc->percpu) {
+ /* Enable per CPU mode when the number of users is above max_cids */
+ if (mc->users > mc->max_cids)
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ } else {
+ /* Switch back to per task if user count under threshold */
+ if (mc->users < mc->pcpu_thrs)
+ mc->pcpu_thrs = 0;
+ }
+
+ /* Mode change required? */
+ if (!!mc->percpu == !!mc->pcpu_thrs)
+ return false;
+ WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+ return true;
}
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
struct cpumask *mm_allowed;
+ struct mm_mm_cid *mc;
unsigned int weight;
/*
@@ -10455,21 +10544,130 @@ static inline void mm_update_cpus_allowe
if (!mm || READ_ONCE(mm->mm_cid.nr_cpus_allowed) == nr_cpu_ids ||
!READ_ONCE(mm->mm_cid.users))
return;
-
/*
* mm::mm_cid::mm_cpus_allowed is the superset of each threads
* allowed CPUs mask which means it can only grow.
*/
- guard(raw_spinlock)(&mm->mm_cid.lock);
+ mc = &mm->mm_cid;
+ guard(raw_spinlock)(&mc->lock);
/* Check again under the lock */
- if (mm->mm_cid.nr_cpus_allowed == nr_cpu_ids || !mm->mm_cid.users)
+ if (mc->nr_cpus_allowed == nr_cpu_ids || !mc->users)
return;
+
mm_allowed = mm_cpus_allowed(mm);
weight = cpumask_or_weight(mm_allowed, mm_allowed, affmsk);
- if (weight == mm->mm_cid.nr_cpus_allowed)
+ if (weight == mc->nr_cpus_allowed)
+ return;
+
+ WRITE_ONCE(mc->nr_cpus_allowed, weight);
+ __mm_update_max_cids(mc);
+ if (!mc->percpu)
return;
- WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, weight);
- mm_update_max_cids(mm);
+
+ /* Adjust the threshold to the wider set */
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+
+ /* Scheduling of deferred mode switch goes here */
+}
+
+static inline void mm_cid_transfer_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+ if (cid_on_cpu(t->mm_cid.cid)) {
+ t->mm_cid.cid = cpu_cid_to_cid(t->mm_cid.cid);
+ pcp->cid = t->mm_cid.cid;
+ }
+}
+
+static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+{
+ unsigned int cpu;
+
+ /* Walk the CPUs and fixup all stale CIDs */
+ for_each_possible_cpu(cpu) {
+ struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+ struct rq *rq = cpu_rq(cpu);
+
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(rq_lock_irq)(rq);
+ if (cid_on_cpu(pcp->cid)) {
+ /* If rq->curr has @mm, fix it up right here */
+ if (rq->curr->mm == mm && rq->curr->mm_cid.active)
+ mm_cid_transfer_to_task(rq->curr, pcp);
+ else
+ mm_drop_cid_on_cpu(mm, pcp);
+ }
+ }
+}
+
+static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+ if (cid_on_task(t->mm_cid.cid)) {
+ t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ pcp->cid = t->mm_cid.cid;
+ }
+}
+
+static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+{
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(task_rq_lock)(t);
+ if (t->mm != mm)
+ return false;
+ if (cid_on_task(t->mm_cid.cid)) {
+ /* If running on the CPU, transfer the CID, otherwise drop it */
+ if (task_rq(t)->curr == t)
+ mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ else
+ mm_unset_cid_on_task(t);
+ }
+ return true;
+}
+
+static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct task_struct *p, *t;
+ unsigned int users;
+
+ /*
+ * This can obviously race with a concurrent affinity change, which
+ * increases the number of allowed CPUs for this mm, but that does
+ * not affect the mode and only changes the CID constraints. A
+ * possible switch back to per task mode happens either in the
+ * deferred handler function or in the next fork()/exit().
+ *
+ * The caller has already transferred. The newly incoming task is
+ * already accounted for, but not yet visible.
+ */
+ users = mm->mm_cid.users - 2;
+ if (!users)
+ return;
+
+ guard(rcu)();
+ for_other_threads(current, t) {
+ mm_cid_fixup_task_to_cpu(t, mm);
+ users--;
+ }
+
+ if (!users)
+ return;
+
+ /* Happens only for CLONE_VM processes. */
+ for_each_process_thread(p, t) {
+ if (t == current || t->mm != mm)
+ continue;
+ if (mm_cid_fixup_task_to_cpu(t, mm)) {
+ if (--users == 0)
+ return;
+ }
+ }
+}
+
+static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
+{
+ t->mm_cid.active = 1;
+ mm->mm_cid.users++;
+ return mm_update_max_cids(mm);
}
void sched_mm_cid_fork(struct task_struct *t)
@@ -10479,12 +10677,19 @@ void sched_mm_cid_fork(struct task_struc
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
- guard(raw_spinlock)(&mm->mm_cid.lock);
- t->mm_cid.active = 1;
- mm->mm_cid.users++;
- /* Preset last_cid for mm_cid_select() */
- t->mm_cid.last_cid = READ_ONCE(mm->mm_cid.max_cids) - 1;
- mm_update_max_cids(mm);
+ scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
+ sched_mm_cid_add_user(t, mm);
+ /* Preset last_cid for mm_cid_select() */
+ t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
+ }
+}
+
+static bool sched_mm_cid_remove_user(struct task_struct *t)
+{
+ t->mm_cid.active = 0;
+ mm_unset_cid_on_task(t);
+ t->mm->mm_cid.users--;
+ return mm_update_max_cids(t->mm);
}
/*
@@ -10499,14 +10704,8 @@ void sched_mm_cid_exit(struct task_struc
return;
guard(mutex)(&mm->mm_cid.mutex);
- guard(raw_spinlock)(&mm->mm_cid.lock);
- t->mm_cid.active = 0;
- mm->mm_cid.users--;
- if (t->mm_cid.cid != MM_CID_UNSET) {
- clear_bit(t->mm_cid.cid, mm_cidmask(mm));
- t->mm_cid.cid = MM_CID_UNSET;
- }
- mm_update_max_cids(mm);
+ scoped_guard(raw_spinlock, &mm->mm_cid.lock)
+ sched_mm_cid_remove_user(t);
}
/* Deactivate MM CID allocation across execve() */
@@ -10535,6 +10734,8 @@ void mm_init_cid(struct mm_struct *mm, s
mm->mm_cid.percpu = 0;
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
mm->mm_cid.users = 0;
+ mm->mm_cid.pcpu_thrs = 0;
+ mm->mm_cid.update_deferred = 0;
raw_spin_lock_init(&mm->mm_cid.lock);
mutex_init(&mm->mm_cid.mutex);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions
2025-10-15 17:29 ` [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions Thomas Gleixner
@ 2025-10-20 6:34 ` Thomas Gleixner
2025-10-20 9:13 ` Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-20 6:34 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen
On Wed, Oct 15 2025 at 19:29, Thomas Gleixner wrote:
> As the goal is to avoid serialization of the scheduler hotpath, this
> requires that the switch back threshold is maximally nr_cpu_ids / 2.
> Otherwise the CID space might become exhausted when tasks are scheduled in
> on CPUs which already transferred ownership before the fixup function was
> able to free or transfer enough CIDs. That would result in a live lock
> because the task loops in mm_get_cid() with runqueue lock held and the
> fixup function is stuck on that runqueue lock.
I've found a way how to lift that limit and to address a problem I've
overlooked in the initial implementation.
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions
2025-10-15 17:29 ` [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions Thomas Gleixner
2025-10-20 6:34 ` Thomas Gleixner
@ 2025-10-20 9:13 ` Peter Zijlstra
2025-10-20 9:16 ` Peter Zijlstra
2025-10-20 9:27 ` Peter Zijlstra
3 siblings, 0 replies; 39+ messages in thread
From: Peter Zijlstra @ 2025-10-20 9:13 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Wed, Oct 15, 2025 at 07:29:54PM +0200, Thomas Gleixner wrote:
> +static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
> +{
> + /* Remote access to mm::mm_cid::pcpu requires rq_lock */
> + guard(task_rq_lock)(t);
FWIW, an alternative to using task_rq(t) in combination with the guard()
above would be something like:
CLASS(task_rq_lock, rq_guard)(t);
struct rq *rq = rq_guard.rq;
> + if (t->mm != mm)
> + return false;
> + if (cid_on_task(t->mm_cid.cid)) {
> + /* If running on the CPU, transfer the CID, otherwise drop it */
> + if (task_rq(t)->curr == t)
> + mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
> + else
> + mm_unset_cid_on_task(t);
> + }
> + return true;
> +}
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions
2025-10-15 17:29 ` [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions Thomas Gleixner
2025-10-20 6:34 ` Thomas Gleixner
2025-10-20 9:13 ` Peter Zijlstra
@ 2025-10-20 9:16 ` Peter Zijlstra
2025-10-20 9:27 ` Peter Zijlstra
3 siblings, 0 replies; 39+ messages in thread
From: Peter Zijlstra @ 2025-10-20 9:16 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Wed, Oct 15, 2025 at 07:29:54PM +0200, Thomas Gleixner wrote:
> +static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
> +{
> + unsigned int cpu;
Alternatively you could take cpus_read_lock() and iterate online_cpus().
No offline CPU should be running a userspace task and all that.
Doesn't really matter much, this also works fine.
> + /* Walk the CPUs and fixup all stale CIDs */
> + for_each_possible_cpu(cpu) {
> + struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
> + struct rq *rq = cpu_rq(cpu);
> +
> + /* Remote access to mm::mm_cid::pcpu requires rq_lock */
> + guard(rq_lock_irq)(rq);
> + if (cid_on_cpu(pcp->cid)) {
> + /* If rq->curr has @mm, fix it up right here */
> + if (rq->curr->mm == mm && rq->curr->mm_cid.active)
> + mm_cid_transfer_to_task(rq->curr, pcp);
> + else
> + mm_drop_cid_on_cpu(mm, pcp);
> + }
> + }
> +}
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions
2025-10-15 17:29 ` [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions Thomas Gleixner
` (2 preceding siblings ...)
2025-10-20 9:16 ` Peter Zijlstra
@ 2025-10-20 9:27 ` Peter Zijlstra
2025-10-21 18:27 ` Thomas Gleixner
3 siblings, 1 reply; 39+ messages in thread
From: Peter Zijlstra @ 2025-10-20 9:27 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Wed, Oct 15, 2025 at 07:29:54PM +0200, Thomas Gleixner wrote:
> +static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
> +{
> + /* Remote access to mm::mm_cid::pcpu requires rq_lock */
> + guard(task_rq_lock)(t);
> + if (t->mm != mm)
> + return false;
I'm slightly confused about this one; I'm assuming it is a double check
of mm for the VM_CLONE case below, once before lock once after. Except,
none of the scheduler locks actually serialize t->mm ...
IIRC the only way to actually change ->mm is exec(), and that is under
task_lock().
> + if (cid_on_task(t->mm_cid.cid)) {
> + /* If running on the CPU, transfer the CID, otherwise drop it */
> + if (task_rq(t)->curr == t)
> + mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
> + else
> + mm_unset_cid_on_task(t);
> + }
> + return true;
> +}
> +
> +static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
> +{
> + struct mm_struct *mm = current->mm;
> + struct task_struct *p, *t;
> + unsigned int users;
> +
> + /*
> + * This can obviously race with a concurrent affinity change, which
> + * increases the number of allowed CPUs for this mm, but that does
> + * not affect the mode and only changes the CID constraints. A
> + * possible switch back to per task mode happens either in the
> + * deferred handler function or in the next fork()/exit().
> + *
> + * The caller has already transferred. The newly incoming task is
> + * already accounted for, but not yet visible.
> + */
> + users = mm->mm_cid.users - 2;
> + if (!users)
> + return;
> +
> + guard(rcu)();
> + for_other_threads(current, t) {
> + mm_cid_fixup_task_to_cpu(t, mm);
> + users--;
> + }
> +
> + if (!users)
> + return;
> +
> + /* Happens only for CLONE_VM processes. */
Right, sorry for reminding you about this :-)
> + for_each_process_thread(p, t) {
> + if (t == current || t->mm != mm)
> + continue;
> + if (mm_cid_fixup_task_to_cpu(t, mm)) {
> + if (--users == 0)
> + return;
> + }
> + }
> +}
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions
2025-10-20 9:27 ` Peter Zijlstra
@ 2025-10-21 18:27 ` Thomas Gleixner
0 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-21 18:27 UTC (permalink / raw)
To: Peter Zijlstra
Cc: LKML, Gabriele Monaco, Mathieu Desnoyers, Michael Jeanson,
Jens Axboe, Paul E. McKenney, Gautham R. Shenoy, Florian Weimer,
Tim Chen, TCMalloc Team
On Mon, Oct 20 2025 at 11:27, Peter Zijlstra wrote:
> On Wed, Oct 15, 2025 at 07:29:54PM +0200, Thomas Gleixner wrote:
>
>> +static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
>> +{
>> + /* Remote access to mm::mm_cid::pcpu requires rq_lock */
>> + guard(task_rq_lock)(t);
>> + if (t->mm != mm)
>> + return false;
>
> I'm slightly confused about this one; I'm assuming it is a double check
> of mm for the VM_CLONE case below, once before lock once after. Except,
> none of the scheduler locks actually serialize t->mm ...
>
> IIRC the only way to actually change ->mm is exec(), and that is under
> task_lock().
Right. That's also where the task removed itself from CID
management. Let me look at that again.
^ permalink raw reply [flat|nested] 39+ messages in thread
* [patch 17/19] irqwork: Move data struct to a types header
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (15 preceding siblings ...)
2025-10-15 17:29 ` [patch 16/19] sched/mmcid: Provide CID ownership mode fixup functions Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:29 ` [patch 18/19] sched/mmcid: Implement deferred mode change Thomas Gleixner
` (3 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
... to avoid header recursion hell.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/irq_work.h | 9 ++-------
include/linux/irq_work_types.h | 14 ++++++++++++++
2 files changed, 16 insertions(+), 7 deletions(-)
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -2,8 +2,9 @@
#ifndef _LINUX_IRQ_WORK_H
#define _LINUX_IRQ_WORK_H
-#include <linux/smp_types.h>
+#include <linux/irq_work_types.h>
#include <linux/rcuwait.h>
+#include <linux/smp_types.h>
/*
* An entry can be in one of four states:
@@ -14,12 +15,6 @@
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
*/
-struct irq_work {
- struct __call_single_node node;
- void (*func)(struct irq_work *);
- struct rcuwait irqwait;
-};
-
#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \
.node = { .u_flags = (_flags), }, \
.func = (_func), \
--- /dev/null
+++ b/include/linux/irq_work_types.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IRQ_WORK_TYPES_H
+#define _LINUX_IRQ_WORK_TYPES_H
+
+#include <linux/smp_types.h>
+#include <linux/types.h>
+
+struct irq_work {
+ struct __call_single_node node;
+ void (*func)(struct irq_work *);
+ struct rcuwait irqwait;
+};
+
+#endif
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 18/19] sched/mmcid: Implement deferred mode change
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (16 preceding siblings ...)
2025-10-15 17:29 ` [patch 17/19] irqwork: Move data struct to a types header Thomas Gleixner
@ 2025-10-15 17:29 ` Thomas Gleixner
2025-10-15 17:30 ` [patch 19/19] sched/mmcid: Switch over to the new mechanism Thomas Gleixner
` (2 subsequent siblings)
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:29 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
When affinity changes cause an increase of the number of CPUs allowed for
tasks which are related to an MM, that might result in a situation where
the ownership mode can go back from per CPU mode to per task mode.
As affinity changes happen with runqueue lock held there is no way to do
the actual mode change and required fixup right there.
Add the infrastructure to defer it to a workqueue. The scheduled work can
race with a fork() or exit(). Whatever happens first takes care of it.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq_types.h | 8 ++++++
kernel/sched/core.c | 58 +++++++++++++++++++++++++++++++++++++++------
2 files changed, 59 insertions(+), 7 deletions(-)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -2,7 +2,9 @@
#ifndef _LINUX_RSEQ_TYPES_H
#define _LINUX_RSEQ_TYPES_H
+#include <linux/irq_work_types.h>
#include <linux/types.h>
+#include <linux/workqueue_types.h>
#ifdef CONFIG_RSEQ
struct rseq;
@@ -127,6 +129,8 @@ struct mm_cid_pcpu {
* do not actually share the MM.
* @pcpu_thrs: Threshold for switching back from per CPU mode
* @update_deferred: A deferred switch back to per task mode is pending.
+ * @irq_work: irq_work to handle the affinity mode change case
+ * @work: Regular work to handle the affinity mode change case
* @lock: Spinlock to protect against affinity setting which can't take @mutex
* @mutex: Mutex to serialize forks and exits related to this mm
*/
@@ -142,6 +146,10 @@ struct mm_mm_cid {
unsigned int pcpu_thrs;
unsigned int update_deferred;
+ /* Rarely used. Moves @lock and @mutex into the second cacheline */
+ struct irq_work irq_work;
+ struct work_struct work;
+
raw_spinlock_t lock;
struct mutex mutex;
}____cacheline_aligned_in_smp;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10566,8 +10566,17 @@ static inline void mm_update_cpus_allowe
/* Adjust the threshold to the wider set */
mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ /* Switch back to per task mode? */
+ if (mc->users >= mc->pcpu_thrs)
+ return;
+
+ /* Don't queue twice */
+ if (mc->update_deferred)
+ return;
- /* Scheduling of deferred mode switch goes here */
+ /* Queue the irq work, which schedules the real work */
+ mc->update_deferred = true;
+ irq_work_queue(&mc->irq_work);
}
static inline void mm_cid_transfer_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10578,7 +10587,7 @@ static inline void mm_cid_transfer_to_ta
}
}
-static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
{
unsigned int cpu;
@@ -10722,14 +10731,47 @@ void sched_mm_cid_after_execve(struct ta
mm_cid_select(t);
}
-void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+static void mm_cid_work_fn(struct work_struct *work)
{
- struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
- int cpu;
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
+
+ /* Make it compile, but not functional yet */
+ if (!IS_ENABLED(CONFIG_NEW_MM_CID))
+ return;
- for_each_possible_cpu(cpu)
- per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
+ guard(mutex)(&mm->mm_cid.mutex);
+ /* Did the last user task exit already? */
+ if (!mm->mm_cid.users)
+ return;
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Have fork() or exit() handled it already? */
+ if (!mm->mm_cid.update_deferred)
+ return;
+ /* This clears mm_cid::update_deferred */
+ if (!mm_update_max_cids(mm))
+ return;
+ /* Affinity changes can only switch back to task mode */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return;
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+}
+
+static void mm_cid_irq_work(struct irq_work *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
+
+ /*
+ * Needs to be unconditional because mm_cid::lock cannot be held
+ * when scheduling work as mm_update_cpus_allowed() nests inside
+ * rq::lock and schedule_work() might end up in wakeup...
+ */
+ schedule_work(&mm->mm_cid.work);
+}
+
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
mm->mm_cid.max_cids = 0;
mm->mm_cid.percpu = 0;
mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
@@ -10738,6 +10780,8 @@ void mm_init_cid(struct mm_struct *mm, s
mm->mm_cid.update_deferred = 0;
raw_spin_lock_init(&mm->mm_cid.lock);
mutex_init(&mm->mm_cid.mutex);
+ mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+ INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
bitmap_zero(mm_cidmask(mm), nr_cpu_ids);
}
^ permalink raw reply [flat|nested] 39+ messages in thread* [patch 19/19] sched/mmcid: Switch over to the new mechanism
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (17 preceding siblings ...)
2025-10-15 17:29 ` [patch 18/19] sched/mmcid: Implement deferred mode change Thomas Gleixner
@ 2025-10-15 17:30 ` Thomas Gleixner
2025-10-17 7:09 ` [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
2025-10-17 11:31 ` Florian Weimer
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-15 17:30 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
Now that all pieces are in place, change the implementations of
sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict
ownership scheme and switch context_switch() over to use the new
mm_cid_schedin() functionality.
The common case is that there is no mode change required, which makes
fork() and exit() just update the user count and the constraints.
If a new user would exceed the CID space limit, the fork() context
handles the transition to per CPU mode with mm::mm_cid::mutex held. exit()
handles the transition back to per task mode when the user count drops
below the switch back threshold. fork() might also be forced to handle a
deferred switch back to per task mode, when an affinity change increased the
number of allowed CPUs enough.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
include/linux/rseq.h | 19 -------
include/linux/rseq_types.h | 8 +--
kernel/fork.c | 1
kernel/sched/core.c | 109 ++++++++++++++++++++++++++++++++++++++-------
kernel/sched/sched.h | 78 --------------------------------
5 files changed, 99 insertions(+), 116 deletions(-)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -82,24 +82,6 @@ static __always_inline void rseq_sched_s
t->rseq.event.ids_changed = true;
}
-/*
- * Invoked from switch_mm_cid() in context switch when the task gets a MM
- * CID assigned.
- *
- * This does not raise TIF_NOTIFY_RESUME as that happens in
- * rseq_sched_switch_event().
- */
-static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
-{
- /*
- * Requires a comparison as the switch_mm_cid() code does not
- * provide a conditional for it readily. So avoid excessive updates
- * when nothing changes.
- */
- if (t->rseq.ids.mm_cid != cid)
- t->rseq.event.ids_changed = true;
-}
-
/* Enforce a full update after RSEQ registration and when execve() failed */
static inline void rseq_force_update(void)
{
@@ -177,7 +159,6 @@ static inline void rseq_handle_slowpath(
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
-static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { }
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -100,18 +100,18 @@ struct rseq_data { };
/**
* struct sched_mm_cid - Storage for per task MM CID data
* @active: MM CID is active for the task
- * @cid: The CID associated to the task
- * @last_cid: The last CID associated to the task
+ * @cid: The CID associated to the task either permanently or
+ * borrowed from the CPU
*/
struct sched_mm_cid {
unsigned int active;
unsigned int cid;
- unsigned int last_cid;
};
/**
* struct mm_cid_pcpu - Storage for per CPU MM_CID data
- * @cid: The CID associated to the CPU
+ * @cid: The CID associated to the CPU either permanently or
+ * while a task with a CID is running
*/
struct mm_cid_pcpu {
unsigned int cid;
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -956,7 +956,6 @@ static struct task_struct *dup_task_stru
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid.cid = MM_CID_UNSET;
- tsk->mm_cid.last_cid = MM_CID_UNSET;
tsk->mm_cid.active = 0;
#endif
return tsk;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5339,7 +5339,7 @@ context_switch(struct rq *rq, struct tas
}
}
- switch_mm_cid(prev, next);
+ mm_cid_schedin(next);
/*
* Tell rseq that the task was scheduled in. Must be after
@@ -10632,7 +10632,7 @@ static bool mm_cid_fixup_task_to_cpu(str
return true;
}
-static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_fixup_tasks_to_cpus(void)
{
struct mm_struct *mm = current->mm;
struct task_struct *p, *t;
@@ -10682,14 +10682,42 @@ static bool sched_mm_cid_add_user(struct
void sched_mm_cid_fork(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
+ bool percpu;
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
- scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
- sched_mm_cid_add_user(t, mm);
- /* Preset last_cid for mm_cid_select() */
- t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+ /* First user ? */
+ if (!mm->mm_cid.users) {
+ sched_mm_cid_add_user(t, mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ /* Required for execve() */
+ pcp->cid = t->mm_cid.cid;
+ return;
+ }
+
+ if (!sched_mm_cid_add_user(t, mm)) {
+ if (!mm->mm_cid.percpu)
+ t->mm_cid.cid = mm_get_cid(mm);
+ return;
+ }
+
+ /* Handle the mode change and transfer current's CID */
+ percpu = !!mm->mm_cid.percpu;
+ if (!percpu)
+ mm_cid_transfer_to_task(current, pcp);
+ else
+ mm_cid_transfer_to_cpu(current, pcp);
+ }
+
+ if (percpu) {
+ mm_cid_fixup_tasks_to_cpus();
+ } else {
+ mm_cid_fixup_cpus_to_tasks(mm);
+ t->mm_cid.cid = mm_get_cid(mm);
}
}
@@ -10701,6 +10729,30 @@ static bool sched_mm_cid_remove_user(str
return mm_update_max_cids(t->mm);
}
+static bool __sched_mm_cid_exit(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+
+ if (!sched_mm_cid_remove_user(t))
+ return false;
+ /*
+ * Contrary to fork() this only deals with a switch back to per
+ * task mode either because the above decreased users or an
+ * affinity change increased the number of allowed CPUs and the
+ * deferred fixup did not run yet.
+ */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return false;
+ /*
+ * A failed fork(2) cleanup never gets here, so @current must have
+ * the same MM as @t. That's true for exit() and the failed
+ * pthread_create() cleanup case.
+ */
+ if (WARN_ON_ONCE(current->mm != mm))
+ return false;
+ return true;
+}
+
/*
* When a task exits, the MM CID held by the task is no longer required as
* the task cannot return to user space.
@@ -10711,10 +10763,43 @@ void sched_mm_cid_exit(struct task_struc
if (!mm || !t->mm_cid.active)
return;
+ /*
+ * Ensure that only one instance is doing MM CID operations within
+ * a MM. The common case is uncontended. The rare fixup case adds
+ * some overhead.
+ */
+ scoped_guard(mutex, &mm->mm_cid.mutex) {
+ /* mm_cid::mutex is sufficient to protect mm_cid::users */
+ if (likely(mm->mm_cid.users > 1)) {
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (!__sched_mm_cid_exit(t))
+ return;
+ /* Mode change required. Transfer current's CID */
+ mm_cid_transfer_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+ return;
+ }
+ /* Last user */
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Required across execve() */
+ if (t == current)
+ mm_cid_transfer_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ /* Ignore mode change. There is nothing to do. */
+ sched_mm_cid_remove_user(t);
+ }
+ }
- guard(mutex)(&mm->mm_cid.mutex);
- scoped_guard(raw_spinlock, &mm->mm_cid.lock)
- sched_mm_cid_remove_user(t);
+ /*
+ * As this is the last user (execve(), process exit or failed
+ * fork(2)) there is no concurrency anymore.
+ *
+ * Synchronize possibly pending work to ensure that there are no
+ * dangling references left. @t->mm_cid.users is zero so nothing
+ * can queue this work anymore.
+ */
+ irq_work_sync(&mm->mm_cid.irq_work);
+ cancel_work_sync(&mm->mm_cid.work);
}
/* Deactivate MM CID allocation across execve() */
@@ -10727,18 +10812,12 @@ void sched_mm_cid_before_execve(struct t
void sched_mm_cid_after_execve(struct task_struct *t)
{
sched_mm_cid_fork(t);
- guard(preempt)();
- mm_cid_select(t);
}
static void mm_cid_work_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
- /* Make it compile, but not functional yet */
- if (!IS_ENABLED(CONFIG_NEW_MM_CID))
- return;
-
guard(mutex)(&mm->mm_cid.mutex);
/* Did the last user task exit already? */
if (!mm->mm_cid.users)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3704,84 +3704,8 @@ static __always_inline void mm_cid_sched
else
mm_cid_from_cpu(next, cpu_cid);
}
-
-/* Active implementation */
-static inline void init_sched_mm_cid(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- unsigned int max_cid;
-
- if (!mm)
- return;
-
- /* Preset last_mm_cid */
- max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
- t->mm_cid.last_cid = max_cid - 1;
-}
-
-static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
-{
- struct mm_struct *mm = t->mm;
-
- if (cid >= max_cids)
- return false;
- if (test_and_set_bit(cid, mm_cidmask(mm)))
- return false;
- t->mm_cid.cid = t->mm_cid.last_cid = cid;
- __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
- return true;
-}
-
-static inline bool mm_cid_get(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- unsigned int max_cids;
-
- max_cids = READ_ONCE(mm->mm_cid.max_cids);
-
- /* Try to reuse the last CID of this task */
- if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
- return true;
-
- /* Try to reuse the last CID of this mm on this CPU */
- if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
- return true;
-
- /* Try the first zero bit in the cidmask. */
- return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), nr_cpu_ids), max_cids);
-}
-
-static inline void mm_cid_select(struct task_struct *t)
-{
- /*
- * mm_cid_get() can fail when the maximum CID, which is determined
- * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
- * That's a transient failure as there cannot be more tasks
- * concurrently on a CPU (or about to be scheduled in) than that.
- */
- for (;;) {
- if (mm_cid_get(t))
- break;
- }
-}
-
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
-{
- if (prev->mm_cid.active) {
- if (prev->mm_cid.cid != MM_CID_UNSET)
- clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
- prev->mm_cid.cid = MM_CID_UNSET;
- }
-
- if (next->mm_cid.active) {
- mm_cid_select(next);
- rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
- }
-}
-
#else /* !CONFIG_SCHED_MM_CID: */
-static inline void mm_cid_select(struct task_struct *t) { }
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void mm_cid_schedin(struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 00/19] sched: Rewrite MM CID management
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (18 preceding siblings ...)
2025-10-15 17:30 ` [patch 19/19] sched/mmcid: Switch over to the new mechanism Thomas Gleixner
@ 2025-10-17 7:09 ` Thomas Gleixner
2025-10-17 11:31 ` Florian Weimer
20 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-17 7:09 UTC (permalink / raw)
To: LKML
Cc: Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Florian Weimer, Tim Chen, TCMalloc Team
On Wed, Oct 15 2025 at 19:29, Thomas Gleixner wrote:
> Thread create teardown
>
> I wrote a micro benchmark, which spawns pools which each create
> threads and let the threads die after creation. The 32 pools/ 32
> threads case triggers the ownership mode change case in both
> directions. The source is appended at the end of this mail.
>
> I initially wrote it to stress the mode change mechanics, but then I
> noticed the massive difference when I ran it on upstream:
>
> 8 pools / 8 threads 32 pools / 32 threads
>
> v6.17 23666 thr/sec 16161 thr/sec
> +rseq/perf 23656 thr/sec 0% 16196 thr/sec 0%
> +cid rework 32025 thr/sec +35% 21004 thr/sec +30%
>
> Both v6.17 and v6.17 + rseq/perf show this in perf top:
>
> 14.62% [kernel] [k] update_sd_lb_stats.constprop.0
> 13.08% [kernel] [k] native_queued_spin_lock_slowpath
> 4.66% [kernel] [k] osq_lock
> 3.06% [kernel] [k] _find_next_and_bit
> 2.21% [kernel] [k] __schedule
> 2.16% [kernel] [k] sched_balance_rq
>
> with the CID rewrite this becomes:
>
> 13.48% [kernel] [k] native_queued_spin_lock_slowpath
> 8.98% [kernel] [k] update_sd_lb_stats.constprop.0
> 5.16% [kernel] [k] osq_lock
> 2.28% [kernel] [k] _find_next_and_bit
> 2.11% [kernel] [k] __schedule
> 1.75% [kernel] [k] psi_group_change
> ...
> 1.32% [kernel] [k] sched_balance_rq
>
> I haven't been able to understand that massive difference yet.
Looked deeper into it and it turns out that the problem is caused by the
upstream MM_CID implementation. The extra work in the task migration
code increases rq lock hold time enough to cause that.
When I make CONFIG_SCHED_MM_CID a real knob and disable it on top of
rseq/perf then it becomes on par with the rewritten CID code. Toggling
it on top of the CID rewrite series does not really make a difference.
Thanks,
tglx
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 00/19] sched: Rewrite MM CID management
2025-10-15 17:29 [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
` (19 preceding siblings ...)
2025-10-17 7:09 ` [patch 00/19] sched: Rewrite MM CID management Thomas Gleixner
@ 2025-10-17 11:31 ` Florian Weimer
2025-10-17 12:56 ` Thomas Gleixner
20 siblings, 1 reply; 39+ messages in thread
From: Florian Weimer @ 2025-10-17 11:31 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Tim Chen, TCMalloc Team
* Thomas Gleixner:
> The CID space compaction itself is not a functional correctness
> requirement, it is only a useful optimization mechanism to reduce the
> memory foot print in unused user space pools.
>
> The optimal CID space is:
>
> min(nr_tasks, nr_cpus_allowed);
>
> Where @nr_tasks is the number of actual user space threads associated to
> the mm.
>
> @nr_cpus_allowed is the superset of all task affinities. It is growth
> only as it would be insane to take a racy snapshot of all task
> affinities when the affinity of one task changes just to redo it 2
> milliseconds later when the next task changes its affinity.
How can userspace obtain the maximum possible nr_cpus_allowed value?
Thanks,
Florian
^ permalink raw reply [flat|nested] 39+ messages in thread* Re: [patch 00/19] sched: Rewrite MM CID management
2025-10-17 11:31 ` Florian Weimer
@ 2025-10-17 12:56 ` Thomas Gleixner
0 siblings, 0 replies; 39+ messages in thread
From: Thomas Gleixner @ 2025-10-17 12:56 UTC (permalink / raw)
To: Florian Weimer
Cc: LKML, Peter Zijlstra, Gabriele Monaco, Mathieu Desnoyers,
Michael Jeanson, Jens Axboe, Paul E. McKenney, Gautham R. Shenoy,
Tim Chen, TCMalloc Team
On Fri, Oct 17 2025 at 13:31, Florian Weimer wrote:
> * Thomas Gleixner:
>
>> The CID space compaction itself is not a functional correctness
>> requirement, it is only a useful optimization mechanism to reduce the
>> memory foot print in unused user space pools.
>>
>> The optimal CID space is:
>>
>> min(nr_tasks, nr_cpus_allowed);
>>
>> Where @nr_tasks is the number of actual user space threads associated to
>> the mm.
>>
>> @nr_cpus_allowed is the superset of all task affinities. It is growth
>> only as it would be insane to take a racy snapshot of all task
>> affinities when the affinity of one task changes just to redo it 2
>> milliseconds later when the next task changes its affinity.
>
> How can userspace obtain the maximum possible nr_cpus_allowed value?
get_nprocs_conf(3), which reads /sys/devices/system/cpu/possible
^ permalink raw reply [flat|nested] 39+ messages in thread