[PATCH v2 2/4] rseq: Run the mm_cid_compaction from rseq_handle_notify

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v2 2/4] rseq: Run the mm_cid_compaction from rseq_handle_notify_resume()
       [not found] <20250716160603.138385-6-gmonaco@redhat.com>
@ 2025-07-16 16:06 ` Gabriele Monaco
  2025-08-26 18:01   ` Mathieu Desnoyers
  2025-07-16 16:06 ` [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches Gabriele Monaco
  1 sibling, 1 reply; 6+ messages in thread
From: Gabriele Monaco @ 2025-07-16 16:06 UTC (permalink / raw)
  To: linux-kernel, Andrew Morton, David Hildenbrand, Ingo Molnar,
	Peter Zijlstra, Mathieu Desnoyers, Paul E. McKenney, linux-mm
  Cc: Gabriele Monaco, Ingo Molnar

Currently the mm_cid_compaction is triggered by the scheduler tick and
runs in a task_work. This makes the behaviour more unpredictable for
periodic tasks with short runtime, which may rarely run during a tick.

Run the mm_cid_compaction from the rseq_handle_notify_resume() call,
which runs from resume_user_mode_work. Since the context is the same
where the task_work would run, skip this step and call the compaction
function directly.
The compaction function still returns early in case the scan is not
required, that is, when the pseudo-period of 100ms has not elapsed.

Keep a tick handler used for long running tasks that are never preempted
(i.e. that never call rseq_handle_notify_resume), which triggers a
compaction and mm_cid update only in that case.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/mm.h       |  2 ++
 include/linux/mm_types.h | 11 ++++++++
 include/linux/sched.h    |  2 +-
 kernel/rseq.c            |  2 ++
 kernel/sched/core.c      | 55 +++++++++++++++++++++++++---------------
 kernel/sched/sched.h     |  2 ++
 6 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa538feaa8d95..cc8c1c9ae26c1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2294,6 +2294,7 @@ void sched_mm_cid_before_execve(struct task_struct *t);
 void sched_mm_cid_after_execve(struct task_struct *t);
 void sched_mm_cid_fork(struct task_struct *t);
 void sched_mm_cid_exit_signals(struct task_struct *t);
+void task_mm_cid_work(struct task_struct *t);
 static inline int task_mm_cid(struct task_struct *t)
 {
 	return t->mm_cid;
@@ -2303,6 +2304,7 @@ static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
 static inline void sched_mm_cid_fork(struct task_struct *t) { }
 static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
+static inline void task_mm_cid_work(struct task_struct *t) { }
 static inline int task_mm_cid(struct task_struct *t)
 {
 	/*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d6b91e8a66d6d..e6d6e468e64b4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1420,6 +1420,13 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
 	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
 	raw_spin_unlock(&mm->cpus_allowed_lock);
 }
+
+static inline bool mm_cid_needs_scan(struct mm_struct *mm)
+{
+	if (!mm)
+		return false;
+	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
+}
 #else /* CONFIG_SCHED_MM_CID */
 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
 static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
@@ -1430,6 +1437,10 @@ static inline unsigned int mm_cid_size(void)
 	return 0;
 }
 static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
+static inline bool mm_cid_needs_scan(struct mm_struct *mm)
+{
+	return false;
+}
 #endif /* CONFIG_SCHED_MM_CID */
 
 struct mmu_gather;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa9c5be7a6325..a75f61cea2271 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1428,7 +1428,7 @@ struct task_struct {
 	int				last_mm_cid;	/* Most recent cid in mm */
 	int				migrate_from_cpu;
 	int				mm_cid_active;	/* Whether cid bitmap is active */
-	struct callback_head		cid_work;
+	unsigned long			last_cid_reset;	/* Time of last reset in jiffies */
 #endif
 
 	struct tlbflush_unmap_batch	tlb_ubc;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e811..100f81e330dc6 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -441,6 +441,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	}
 	if (unlikely(rseq_update_cpu_node_id(t)))
 		goto error;
+	/* The mm_cid compaction returns prematurely if scan is not needed. */
+	task_mm_cid_work(t);
 	return;
 
 error:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 81c6df746df17..27b856a1cb0a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10589,22 +10589,13 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
 	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
 }
 
-static void task_mm_cid_work(struct callback_head *work)
+void task_mm_cid_work(struct task_struct *t)
 {
 	unsigned long now = jiffies, old_scan, next_scan;
-	struct task_struct *t = current;
 	struct cpumask *cidmask;
-	struct mm_struct *mm;
 	int weight, cpu;
+	struct mm_struct *mm = t->mm;
 
-	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
-
-	work->next = work;	/* Prevent double-add */
-	if (t->flags & PF_EXITING)
-		return;
-	mm = t->mm;
-	if (!mm)
-		return;
 	old_scan = READ_ONCE(mm->mm_cid_next_scan);
 	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
 	if (!old_scan) {
@@ -10643,23 +10634,47 @@ void init_sched_mm_cid(struct task_struct *t)
 		if (mm_users == 1)
 			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
 	}
-	t->cid_work.next = &t->cid_work;	/* Protect against double add */
-	init_task_work(&t->cid_work, task_mm_cid_work);
 }
 
 void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
 {
-	struct callback_head *work = &curr->cid_work;
-	unsigned long now = jiffies;
+	u64 rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
 
+	/*
+	 * If a task is running unpreempted for a long time, it won't get its
+	 * mm_cid compacted and won't update its mm_cid value after a
+	 * compaction occurs.
+	 * For such a task, this function does two things:
+	 * A) trigger the mm_cid recompaction,
+	 * B) trigger an update of the task's rseq->mm_cid field at some point
+	 * after recompaction, so it can get a mm_cid value closer to 0.
+	 * A change in the mm_cid triggers an rseq_preempt.
+	 *
+	 * B occurs once after the compaction work completes, neither A nor B
+	 * run as long as the compaction work is pending, the task is exiting
+	 * or is not a userspace task.
+	 */
 	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
-	    work->next != work)
+	    test_tsk_thread_flag(curr, TIF_NOTIFY_RESUME))
 		return;
-	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
+	if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
 		return;
-
-	/* No page allocation under rq lock */
-	task_work_add(curr, work, TWA_RESUME);
+	if (mm_cid_needs_scan(curr->mm)) {
+		/* Trigger mm_cid recompaction */
+		rseq_set_notify_resume(curr);
+	} else if (time_after(jiffies, curr->last_cid_reset +
+			      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
+		/* Update mm_cid field */
+		int old_cid = curr->mm_cid;
+
+		if (!curr->mm_cid_active)
+			return;
+		mm_cid_snapshot_time(rq, curr->mm);
+		mm_cid_put_lazy(curr);
+		curr->last_mm_cid = curr->mm_cid = mm_cid_get(rq, curr, curr->mm);
+		if (old_cid != curr->mm_cid)
+			rseq_preempt(curr);
+	}
 }
 
 void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 475bb5998295e..90a5b58188232 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3606,6 +3606,7 @@ extern const char *preempt_modes[];
 
 #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
 #define MM_CID_SCAN_DELAY	100			/* 100ms */
+#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
 
 extern raw_spinlock_t cid_lock;
 extern int use_cid_lock;
@@ -3809,6 +3810,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
 	int cid;
 
 	lockdep_assert_rq_held(rq);
+	t->last_cid_reset = jiffies;
 	cpumask = mm_cidmask(mm);
 	cid = __this_cpu_read(pcpu_cid->cid);
 	if (mm_cid_is_valid(cid)) {
-- 
2.50.1



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches
       [not found] <20250716160603.138385-6-gmonaco@redhat.com>
  2025-07-16 16:06 ` [PATCH v2 2/4] rseq: Run the mm_cid_compaction from rseq_handle_notify_resume() Gabriele Monaco
@ 2025-07-16 16:06 ` Gabriele Monaco
  2025-08-26 18:10   ` Mathieu Desnoyers
  1 sibling, 1 reply; 6+ messages in thread
From: Gabriele Monaco @ 2025-07-16 16:06 UTC (permalink / raw)
  To: linux-kernel, Andrew Morton, David Hildenbrand, Ingo Molnar,
	Peter Zijlstra, Mathieu Desnoyers, linux-mm
  Cc: Gabriele Monaco, Ingo Molnar

Currently, task_mm_cid_work() is called from resume_user_mode_work().
This can delay the execution of the corresponding thread for the entire
duration of the function, negatively affecting the response in case of
real time tasks.
In practice, we observe task_mm_cid_work increasing the latency by
30-35us on a 128-core system; this order of magnitude is meaningful
under PREEMPT_RT.

Run the task_mm_cid_work in batches of up to CONFIG_RSEQ_CID_SCAN_BATCH
CPUs; this reduces the duration of the delay for each scan.

The task_mm_cid_work contains a mechanism to avoid running more
frequently than every 100ms. Keep this pseudo-periodicity only on
complete scans.
This means each call to task_mm_cid_work returns prematurely if the
period did not elapse and a scan is not ongoing (i.e. the next batch to
scan is not the first).
This way full scans are not excessively delayed while still keeping each
run, and introduced latency, short.

Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/linux/mm_types.h | 15 +++++++++++++++
 init/Kconfig             | 12 ++++++++++++
 kernel/sched/core.c      | 37 ++++++++++++++++++++++++++++++++++---
 3 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e6d6e468e64b4..a822966a584f3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -995,6 +995,13 @@ struct mm_struct {
 		 * When the next mm_cid scan is due (in jiffies).
 		 */
 		unsigned long mm_cid_next_scan;
+		/*
+		 * @mm_cid_scan_batch: Counter for batch used in the next scan.
+		 *
+		 * Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH. This field
+		 * increments at each scan and reset when all batches are done.
+		 */
+		unsigned int mm_cid_scan_batch;
 		/**
 		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
 		 *
@@ -1385,6 +1392,7 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
 	raw_spin_lock_init(&mm->cpus_allowed_lock);
 	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
 	cpumask_clear(mm_cidmask(mm));
+	mm->mm_cid_scan_batch = 0;
 }
 
 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
@@ -1423,8 +1431,15 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
 
 static inline bool mm_cid_needs_scan(struct mm_struct *mm)
 {
+	unsigned int next_batch;
+
 	if (!mm)
 		return false;
+	next_batch = READ_ONCE(mm->mm_cid_scan_batch);
+	/* Always needs scan unless it's the first batch. */
+	if (CONFIG_RSEQ_CID_SCAN_BATCH * next_batch < num_possible_cpus() &&
+	    next_batch)
+		return true;
 	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
 }
 #else /* CONFIG_SCHED_MM_CID */
diff --git a/init/Kconfig b/init/Kconfig
index 666783eb50abd..98d7f078cd6df 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1860,6 +1860,18 @@ config DEBUG_RSEQ
 
 	  If unsure, say N.
 
+config RSEQ_CID_SCAN_BATCH
+	int "Number of CPUs to scan at every mm_cid compaction attempt"
+	range 1 NR_CPUS
+	default 8
+	depends on SCHED_MM_CID
+	help
+	  CPUs are scanned pseudo-periodically to compact the CID of each task.
+	  This operation can take a longer amount of time on systems with many
+	  CPUs, resulting in higher scheduling latency for the current task.
+	  A higher value means the CID is compacted faster, but results in
+	  higher scheduling latency.
+
 config CACHESTAT_SYSCALL
 	bool "Enable cachestat() system call" if EXPERT
 	default y
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 27b856a1cb0a9..eae4c8faf980b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10591,11 +10591,26 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
 
 void task_mm_cid_work(struct task_struct *t)
 {
+	int weight, cpu, from_cpu, this_batch, next_batch, idx;
 	unsigned long now = jiffies, old_scan, next_scan;
 	struct cpumask *cidmask;
-	int weight, cpu;
 	struct mm_struct *mm = t->mm;
 
+	/*
+	 * This function is called from __rseq_handle_notify_resume, which
+	 * makes sure t is a user thread and is not exiting.
+	 */
+	this_batch = READ_ONCE(mm->mm_cid_scan_batch);
+	next_batch = this_batch + 1;
+	from_cpu = cpumask_nth(this_batch * CONFIG_RSEQ_CID_SCAN_BATCH,
+			       cpu_possible_mask);
+	if (from_cpu >= nr_cpu_ids) {
+		from_cpu = 0;
+		next_batch = 1;
+	}
+	/* Delay scan only if we are done with all cpus. */
+	if (from_cpu != 0)
+		goto cid_compact;
 	old_scan = READ_ONCE(mm->mm_cid_next_scan);
 	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
 	if (!old_scan) {
@@ -10611,17 +10626,33 @@ void task_mm_cid_work(struct task_struct *t)
 		return;
 	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
 		return;
+
+cid_compact:
+	if (!try_cmpxchg(&mm->mm_cid_scan_batch, &this_batch, next_batch))
+		return;
 	cidmask = mm_cidmask(mm);
 	/* Clear cids that were not recently used. */
-	for_each_possible_cpu(cpu)
+	idx = 0;
+	cpu = from_cpu;
+	for_each_cpu_from(cpu, cpu_possible_mask) {
+		if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
+			break;
 		sched_mm_cid_remote_clear_old(mm, cpu);
+		++idx;
+	}
 	weight = cpumask_weight(cidmask);
 	/*
 	 * Clear cids that are greater or equal to the cidmask weight to
 	 * recompact it.
 	 */
-	for_each_possible_cpu(cpu)
+	idx = 0;
+	cpu = from_cpu;
+	for_each_cpu_from(cpu, cpu_possible_mask) {
+		if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
+			break;
 		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+		++idx;
+	}
 }
 
 void init_sched_mm_cid(struct task_struct *t)
-- 
2.50.1



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2 2/4] rseq: Run the mm_cid_compaction from rseq_handle_notify_resume()
  2025-07-16 16:06 ` [PATCH v2 2/4] rseq: Run the mm_cid_compaction from rseq_handle_notify_resume() Gabriele Monaco
@ 2025-08-26 18:01   ` Mathieu Desnoyers
  2025-08-27  6:55     ` Gabriele Monaco
  0 siblings, 1 reply; 6+ messages in thread
From: Mathieu Desnoyers @ 2025-08-26 18:01 UTC (permalink / raw)
  To: Gabriele Monaco, linux-kernel, Andrew Morton, David Hildenbrand,
	Ingo Molnar, Peter Zijlstra, Paul E. McKenney, linux-mm,
	Thomas Gleixner
  Cc: Ingo Molnar

On 2025-07-16 12:06, Gabriele Monaco wrote:
> Currently the mm_cid_compaction is triggered by the scheduler tick and
> runs in a task_work, behaviour is more unpredictable with periodic tasks
> with short runtime, which may rarely run during a tick.
> 
> Run the mm_cid_compaction from the rseq_handle_notify_resume() call,
> which runs from resume_user_mode_work. Since the context is the same
> where the task_work would run, skip this step and call the compaction
> function directly.
> The compaction function still exits prematurely in case the scan is not
> required, that is when the pseudo-period of 100ms did not elapse.
> 
> Keep a tick handler used for long running tasks that are never preempted
> (i.e. that never call rseq_handle_notify_resume), which triggers a
> compaction and mm_cid update only in that case.

Your approach looks good, but please note that this will probably
need to be rebased on top of the rseq rework from Thomas Gleixner.

Latest version can be found here:

https://lore.kernel.org/lkml/20250823161326.635281786@linutronix.de/

Thanks,

Mathieu

> 
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> ---
>   include/linux/mm.h       |  2 ++
>   include/linux/mm_types.h | 11 ++++++++
>   include/linux/sched.h    |  2 +-
>   kernel/rseq.c            |  2 ++
>   kernel/sched/core.c      | 55 +++++++++++++++++++++++++---------------
>   kernel/sched/sched.h     |  2 ++
>   6 files changed, 53 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa538feaa8d95..cc8c1c9ae26c1 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2294,6 +2294,7 @@ void sched_mm_cid_before_execve(struct task_struct *t);
>   void sched_mm_cid_after_execve(struct task_struct *t);
>   void sched_mm_cid_fork(struct task_struct *t);
>   void sched_mm_cid_exit_signals(struct task_struct *t);
> +void task_mm_cid_work(struct task_struct *t);
>   static inline int task_mm_cid(struct task_struct *t)
>   {
>   	return t->mm_cid;
> @@ -2303,6 +2304,7 @@ static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
>   static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
>   static inline void sched_mm_cid_fork(struct task_struct *t) { }
>   static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
> +static inline void task_mm_cid_work(struct task_struct *t) { }
>   static inline int task_mm_cid(struct task_struct *t)
>   {
>   	/*
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index d6b91e8a66d6d..e6d6e468e64b4 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1420,6 +1420,13 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
>   	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
>   	raw_spin_unlock(&mm->cpus_allowed_lock);
>   }
> +
> +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> +{
> +	if (!mm)
> +		return false;
> +	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
> +}
>   #else /* CONFIG_SCHED_MM_CID */
>   static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
>   static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
> @@ -1430,6 +1437,10 @@ static inline unsigned int mm_cid_size(void)
>   	return 0;
>   }
>   static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
> +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> +{
> +	return false;
> +}
>   #endif /* CONFIG_SCHED_MM_CID */
>   
>   struct mmu_gather;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index aa9c5be7a6325..a75f61cea2271 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1428,7 +1428,7 @@ struct task_struct {
>   	int				last_mm_cid;	/* Most recent cid in mm */
>   	int				migrate_from_cpu;
>   	int				mm_cid_active;	/* Whether cid bitmap is active */
> -	struct callback_head		cid_work;
> +	unsigned long			last_cid_reset;	/* Time of last reset in jiffies */
>   #endif
>   
>   	struct tlbflush_unmap_batch	tlb_ubc;
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index b7a1ec327e811..100f81e330dc6 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -441,6 +441,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
>   	}
>   	if (unlikely(rseq_update_cpu_node_id(t)))
>   		goto error;
> +	/* The mm_cid compaction returns prematurely if scan is not needed. */
> +	task_mm_cid_work(t);
>   	return;
>   
>   error:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 81c6df746df17..27b856a1cb0a9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10589,22 +10589,13 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>   	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
>   }
>   
> -static void task_mm_cid_work(struct callback_head *work)
> +void task_mm_cid_work(struct task_struct *t)
>   {
>   	unsigned long now = jiffies, old_scan, next_scan;
> -	struct task_struct *t = current;
>   	struct cpumask *cidmask;
> -	struct mm_struct *mm;
>   	int weight, cpu;
> +	struct mm_struct *mm = t->mm;
>   
> -	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
> -
> -	work->next = work;	/* Prevent double-add */
> -	if (t->flags & PF_EXITING)
> -		return;
> -	mm = t->mm;
> -	if (!mm)
> -		return;
>   	old_scan = READ_ONCE(mm->mm_cid_next_scan);
>   	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>   	if (!old_scan) {
> @@ -10643,23 +10634,47 @@ void init_sched_mm_cid(struct task_struct *t)
>   		if (mm_users == 1)
>   			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>   	}
> -	t->cid_work.next = &t->cid_work;	/* Protect against double add */
> -	init_task_work(&t->cid_work, task_mm_cid_work);
>   }
>   
>   void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
>   {
> -	struct callback_head *work = &curr->cid_work;
> -	unsigned long now = jiffies;
> +	u64 rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
>   
> +	/*
> +	 * If a task is running unpreempted for a long time, it won't get its
> +	 * mm_cid compacted and won't update its mm_cid value after a
> +	 * compaction occurs.
> +	 * For such a task, this function does two things:
> +	 * A) trigger the mm_cid recompaction,
> +	 * B) trigger an update of the task's rseq->mm_cid field at some point
> +	 * after recompaction, so it can get a mm_cid value closer to 0.
> +	 * A change in the mm_cid triggers an rseq_preempt.
> +	 *
> +	 * B occurs once after the compaction work completes, neither A nor B
> +	 * run as long as the compaction work is pending, the task is exiting
> +	 * or is not a userspace task.
> +	 */
>   	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
> -	    work->next != work)
> +	    test_tsk_thread_flag(curr, TIF_NOTIFY_RESUME))
>   		return;
> -	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
> +	if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
>   		return;
> -
> -	/* No page allocation under rq lock */
> -	task_work_add(curr, work, TWA_RESUME);
> +	if (mm_cid_needs_scan(curr->mm)) {
> +		/* Trigger mm_cid recompaction */
> +		rseq_set_notify_resume(curr);
> +	} else if (time_after(jiffies, curr->last_cid_reset +
> +			      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> +		/* Update mm_cid field */
> +		int old_cid = curr->mm_cid;
> +
> +		if (!curr->mm_cid_active)
> +			return;
> +		mm_cid_snapshot_time(rq, curr->mm);
> +		mm_cid_put_lazy(curr);
> +		curr->last_mm_cid = curr->mm_cid = mm_cid_get(rq, curr, curr->mm);
> +		if (old_cid != curr->mm_cid)
> +			rseq_preempt(curr);
> +	}
>   }
>   
>   void sched_mm_cid_exit_signals(struct task_struct *t)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 475bb5998295e..90a5b58188232 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3606,6 +3606,7 @@ extern const char *preempt_modes[];
>   
>   #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
>   #define MM_CID_SCAN_DELAY	100			/* 100ms */
> +#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
>   
>   extern raw_spinlock_t cid_lock;
>   extern int use_cid_lock;
> @@ -3809,6 +3810,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
>   	int cid;
>   
>   	lockdep_assert_rq_held(rq);
> +	t->last_cid_reset = jiffies;
>   	cpumask = mm_cidmask(mm);
>   	cid = __this_cpu_read(pcpu_cid->cid);
>   	if (mm_cid_is_valid(cid)) {


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches
  2025-07-16 16:06 ` [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches Gabriele Monaco
@ 2025-08-26 18:10   ` Mathieu Desnoyers
  2025-08-28  8:36     ` Gabriele Monaco
  0 siblings, 1 reply; 6+ messages in thread
From: Mathieu Desnoyers @ 2025-08-26 18:10 UTC (permalink / raw)
  To: Gabriele Monaco, linux-kernel, Andrew Morton, David Hildenbrand,
	Ingo Molnar, Peter Zijlstra, linux-mm, Thomas Gleixner
  Cc: Ingo Molnar

On 2025-07-16 12:06, Gabriele Monaco wrote:
> Currently, task_mm_cid_work() is called from resume_user_mode_work().
> This can delay the execution of the corresponding thread for the entire
> duration of the function, negatively affecting the response in case of
> real time tasks.
> In practice, we observe task_mm_cid_work increasing the latency of
> 30-35us on a 128 cores system, this order of magnitude is meaningful
> under PREEMPT_RT.
> 
> Run the task_mm_cid_work in batches of up to CONFIG_RSEQ_CID_SCAN_BATCH
> CPUs, this reduces the duration of the delay for each scan.
> 
> The task_mm_cid_work contains a mechanism to avoid running more
> frequently than every 100ms. Keep this pseudo-periodicity only on
> complete scans.
> This means each call to task_mm_cid_work returns prematurely if the
> period did not elapse and a scan is not ongoing (i.e. the next batch to
> scan is not the first).
> This way full scans are not excessively delayed while still keeping each
> run, and introduced latency, short.

With your test hardware/workload as reference, do you have an idea of
how many CPUs would be needed to require more than 100ms to iterate on
all CPUs with the default scan batch size (8) ?

> 
> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> ---
>   include/linux/mm_types.h | 15 +++++++++++++++
>   init/Kconfig             | 12 ++++++++++++
>   kernel/sched/core.c      | 37 ++++++++++++++++++++++++++++++++++---
>   3 files changed, 61 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index e6d6e468e64b4..a822966a584f3 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -995,6 +995,13 @@ struct mm_struct {
>   		 * When the next mm_cid scan is due (in jiffies).
>   		 */
>   		unsigned long mm_cid_next_scan;
> +		/*
> +		 * @mm_cid_scan_batch: Counter for batch used in the next scan.
> +		 *
> +		 * Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH. This field
> +		 * increments at each scan and reset when all batches are done.
> +		 */
> +		unsigned int mm_cid_scan_batch;
>   		/**
>   		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
>   		 *
> @@ -1385,6 +1392,7 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
>   	raw_spin_lock_init(&mm->cpus_allowed_lock);
>   	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
>   	cpumask_clear(mm_cidmask(mm));
> +	mm->mm_cid_scan_batch = 0;
>   }
>   
>   static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
> @@ -1423,8 +1431,15 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
>   
>   static inline bool mm_cid_needs_scan(struct mm_struct *mm)
>   {
> +	unsigned int next_batch;
> +
>   	if (!mm)
>   		return false;
> +	next_batch = READ_ONCE(mm->mm_cid_scan_batch);
> +	/* Always needs scan unless it's the first batch. */
> +	if (CONFIG_RSEQ_CID_SCAN_BATCH * next_batch < num_possible_cpus() &&
> +	    next_batch)
> +		return true;
>   	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
>   }
>   #else /* CONFIG_SCHED_MM_CID */
> diff --git a/init/Kconfig b/init/Kconfig
> index 666783eb50abd..98d7f078cd6df 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1860,6 +1860,18 @@ config DEBUG_RSEQ
>   
>   	  If unsure, say N.
>   
> +config RSEQ_CID_SCAN_BATCH
> +	int "Number of CPUs to scan at every mm_cid compaction attempt"
> +	range 1 NR_CPUS
> +	default 8
> +	depends on SCHED_MM_CID
> +	help
> +	  CPUs are scanned pseudo-periodically to compact the CID of each task,
> +	  this operation can take a longer amount of time on systems with many
> +	  CPUs, resulting in higher scheduling latency for the current task.
> +	  A higher value means the CID is compacted faster, but results in
> +	  higher scheduling latency.
> +
>   config CACHESTAT_SYSCALL
>   	bool "Enable cachestat() system call" if EXPERT
>   	default y
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 27b856a1cb0a9..eae4c8faf980b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10591,11 +10591,26 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>   
>   void task_mm_cid_work(struct task_struct *t)
>   {
> +	int weight, cpu, from_cpu, this_batch, next_batch, idx;
>   	unsigned long now = jiffies, old_scan, next_scan;
>   	struct cpumask *cidmask;
> -	int weight, cpu;
>   	struct mm_struct *mm = t->mm;
>   
> +	/*
> +	 * This function is called from __rseq_handle_notify_resume, which
> +	 * makes sure t is a user thread and is not exiting.
> +	 */
> +	this_batch = READ_ONCE(mm->mm_cid_scan_batch);
> +	next_batch = this_batch + 1;
> +	from_cpu = cpumask_nth(this_batch * CONFIG_RSEQ_CID_SCAN_BATCH,
> +			       cpu_possible_mask);
> +	if (from_cpu >= nr_cpu_ids) {
> +		from_cpu = 0;
> +		next_batch = 1;
> +	}
> +	/* Delay scan only if we are done with all cpus. */
> +	if (from_cpu != 0)
> +		goto cid_compact;
>   	old_scan = READ_ONCE(mm->mm_cid_next_scan);
>   	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>   	if (!old_scan) {
> @@ -10611,17 +10626,33 @@ void task_mm_cid_work(struct task_struct *t)
>   		return;
>   	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
>   		return;
> +
> +cid_compact:
> +	if (!try_cmpxchg(&mm->mm_cid_scan_batch, &this_batch, next_batch))
> +		return;
>   	cidmask = mm_cidmask(mm);
>   	/* Clear cids that were not recently used. */
> -	for_each_possible_cpu(cpu)
> +	idx = 0;
> +	cpu = from_cpu;
> +	for_each_cpu_from(cpu, cpu_possible_mask) {
> +		if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)

could do "if (idx++ == CONFIG_RSEQ_CID_SCAN_BATCH)"

> +			break;
>   		sched_mm_cid_remote_clear_old(mm, cpu);
> +		++idx;

and remove this ^

> +	}
>   	weight = cpumask_weight(cidmask);
>   	/*
>   	 * Clear cids that are greater or equal to the cidmask weight to
>   	 * recompact it.
>   	 */
> -	for_each_possible_cpu(cpu)
> +	idx = 0;
> +	cpu = from_cpu;
> +	for_each_cpu_from(cpu, cpu_possible_mask) {
> +		if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)

Likewise.

> +			break;
>   		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> +		++idx;

Likewise.

Thanks,

Mathieu

> +	}
>   }
>   
>   void init_sched_mm_cid(struct task_struct *t)


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2 2/4] rseq: Run the mm_cid_compaction from rseq_handle_notify_resume()
  2025-08-26 18:01   ` Mathieu Desnoyers
@ 2025-08-27  6:55     ` Gabriele Monaco
  0 siblings, 0 replies; 6+ messages in thread
From: Gabriele Monaco @ 2025-08-27  6:55 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-kernel, Andrew Morton, David Hildenbrand, Ingo Molnar,
	Peter Zijlstra, Paul E. McKenney, linux-mm, Thomas Gleixner

On Tue, 2025-08-26 at 14:01 -0400, Mathieu Desnoyers wrote:
> On 2025-07-16 12:06, Gabriele Monaco wrote:
> > Currently the mm_cid_compaction is triggered by the scheduler tick
> > and
> > runs in a task_work, behaviour is more unpredictable with periodic
> > tasks
> > with short runtime, which may rarely run during a tick.
> > 
> > Run the mm_cid_compaction from the rseq_handle_notify_resume()
> > call,
> > which runs from resume_user_mode_work. Since the context is the
> > same
> > where the task_work would run, skip this step and call the
> > compaction
> > function directly.
> > The compaction function still exits prematurely in case the scan is
> > not
> > required, that is when the pseudo-period of 100ms did not elapse.
> > 
> > Keep a tick handler used for long running tasks that are never
> > preempted
> > (i.e. that never call rseq_handle_notify_resume), which triggers a
> > compaction and mm_cid update only in that case.
> 
> Your approach looks good, but please note that this will probably
> need to be rebased on top of the rseq rework from Thomas Gleixner.
> 
> Latest version can be found here:
> 
> https://lore.kernel.org/lkml/20250823161326.635281786@linutronix.de/
> 

Mmh that's quite a large one, thanks for sharing!
I'm going to have a look but it might make sense to wait until that's
included, I guess.

Thanks,
Gabriele


> Thanks,
> 
> Mathieu
> 
> > 
> > Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> > ---
> >   include/linux/mm.h       |  2 ++
> >   include/linux/mm_types.h | 11 ++++++++
> >   include/linux/sched.h    |  2 +-
> >   kernel/rseq.c            |  2 ++
> >   kernel/sched/core.c      | 55 +++++++++++++++++++++++++----------
> > -----
> >   kernel/sched/sched.h     |  2 ++
> >   6 files changed, 53 insertions(+), 21 deletions(-)
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index fa538feaa8d95..cc8c1c9ae26c1 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -2294,6 +2294,7 @@ void sched_mm_cid_before_execve(struct
> > task_struct *t);
> >   void sched_mm_cid_after_execve(struct task_struct *t);
> >   void sched_mm_cid_fork(struct task_struct *t);
> >   void sched_mm_cid_exit_signals(struct task_struct *t);
> > +void task_mm_cid_work(struct task_struct *t);
> >   static inline int task_mm_cid(struct task_struct *t)
> >   {
> >   	return t->mm_cid;
> > @@ -2303,6 +2304,7 @@ static inline void
> > sched_mm_cid_before_execve(struct task_struct *t) { }
> >   static inline void sched_mm_cid_after_execve(struct task_struct
> > *t) { }
> >   static inline void sched_mm_cid_fork(struct task_struct *t) { }
> >   static inline void sched_mm_cid_exit_signals(struct task_struct
> > *t) { }
> > +static inline void task_mm_cid_work(struct task_struct *t) { }
> >   static inline int task_mm_cid(struct task_struct *t)
> >   {
> >   	/*
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index d6b91e8a66d6d..e6d6e468e64b4 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -1420,6 +1420,13 @@ static inline void
> > mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
> >   	WRITE_ONCE(mm->nr_cpus_allowed,
> > cpumask_weight(mm_allowed));
> >   	raw_spin_unlock(&mm->cpus_allowed_lock);
> >   }
> > +
> > +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> > +{
> > +	if (!mm)
> > +		return false;
> > +	return time_after(jiffies, READ_ONCE(mm-
> > >mm_cid_next_scan));
> > +}
> >   #else /* CONFIG_SCHED_MM_CID */
> >   static inline void mm_init_cid(struct mm_struct *mm, struct
> > task_struct *p) { }
> >   static inline int mm_alloc_cid(struct mm_struct *mm, struct
> > task_struct *p) { return 0; }
> > @@ -1430,6 +1437,10 @@ static inline unsigned int mm_cid_size(void)
> >   	return 0;
> >   }
> >   static inline void mm_set_cpus_allowed(struct mm_struct *mm,
> > const struct cpumask *cpumask) { }
> > +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> > +{
> > +	return false;
> > +}
> >   #endif /* CONFIG_SCHED_MM_CID */
> >   
> >   struct mmu_gather;
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index aa9c5be7a6325..a75f61cea2271 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1428,7 +1428,7 @@ struct task_struct {
> >   	int				last_mm_cid;	/* Most
> > recent cid in mm */
> >   	int				migrate_from_cpu;
> >   	int				mm_cid_active;	/* Whether
> > cid bitmap is active */
> > -	struct callback_head		cid_work;
> > +	unsigned long			last_cid_reset;	/*
> > Time of last reset in jiffies */
> >   #endif
> >   
> >   	struct tlbflush_unmap_batch	tlb_ubc;
> > diff --git a/kernel/rseq.c b/kernel/rseq.c
> > index b7a1ec327e811..100f81e330dc6 100644
> > --- a/kernel/rseq.c
> > +++ b/kernel/rseq.c
> > @@ -441,6 +441,8 @@ void __rseq_handle_notify_resume(struct ksignal
> > *ksig, struct pt_regs *regs)
> >   	}
> >   	if (unlikely(rseq_update_cpu_node_id(t)))
> >   		goto error;
> > +	/* The mm_cid compaction returns prematurely if scan is
> > not needed. */
> > +	task_mm_cid_work(t);
> >   	return;
> >   
> >   error:
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 81c6df746df17..27b856a1cb0a9 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -10589,22 +10589,13 @@ static void
> > sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
> >   	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
> >   }
> >   
> > -static void task_mm_cid_work(struct callback_head *work)
> > +void task_mm_cid_work(struct task_struct *t)
> >   {
> >   	unsigned long now = jiffies, old_scan, next_scan;
> > -	struct task_struct *t = current;
> >   	struct cpumask *cidmask;
> > -	struct mm_struct *mm;
> >   	int weight, cpu;
> > +	struct mm_struct *mm = t->mm;
> >   
> > -	WARN_ON_ONCE(t != container_of(work, struct task_struct,
> > cid_work));
> > -
> > -	work->next = work;	/* Prevent double-add */
> > -	if (t->flags & PF_EXITING)
> > -		return;
> > -	mm = t->mm;
> > -	if (!mm)
> > -		return;
> >   	old_scan = READ_ONCE(mm->mm_cid_next_scan);
> >   	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> >   	if (!old_scan) {
> > @@ -10643,23 +10634,47 @@ void init_sched_mm_cid(struct task_struct
> > *t)
> >   		if (mm_users == 1)
> >   			mm->mm_cid_next_scan = jiffies +
> > msecs_to_jiffies(MM_CID_SCAN_DELAY);
> >   	}
> > -	t->cid_work.next = &t->cid_work;	/* Protect against
> > double add */
> > -	init_task_work(&t->cid_work, task_mm_cid_work);
> >   }
> >   
> >   void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
> >   {
> > -	struct callback_head *work = &curr->cid_work;
> > -	unsigned long now = jiffies;
> > +	u64 rtime = curr->se.sum_exec_runtime - curr-
> > >se.prev_sum_exec_runtime;
> >   
> > +	/*
> > +	 * If a task is running unpreempted for a long time, it
> > won't get its
> > +	 * mm_cid compacted and won't update its mm_cid value
> > after a
> > +	 * compaction occurs.
> > +	 * For such a task, this function does two things:
> > +	 * A) trigger the mm_cid recompaction,
> > +	 * B) trigger an update of the task's rseq->mm_cid field
> > at some point
> > +	 * after recompaction, so it can get a mm_cid value closer
> > to 0.
> > +	 * A change in the mm_cid triggers an rseq_preempt.
> > +	 *
> > +	 * B occurs once after the compaction work completes,
> > neither A nor B
> > +	 * run as long as the compaction work is pending, the task
> > is exiting
> > +	 * or is not a userspace task.
> > +	 */
> >   	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD))
> > ||
> > -	    work->next != work)
> > +	    test_tsk_thread_flag(curr, TIF_NOTIFY_RESUME))
> >   		return;
> > -	if (time_before(now, READ_ONCE(curr->mm-
> > >mm_cid_next_scan)))
> > +	if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
> >   		return;
> > -
> > -	/* No page allocation under rq lock */
> > -	task_work_add(curr, work, TWA_RESUME);
> > +	if (mm_cid_needs_scan(curr->mm)) {
> > +		/* Trigger mm_cid recompaction */
> > +		rseq_set_notify_resume(curr);
> > +	} else if (time_after(jiffies, curr->last_cid_reset +
> > +			     
> > msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> > +		/* Update mm_cid field */
> > +		int old_cid = curr->mm_cid;
> > +
> > +		if (!curr->mm_cid_active)
> > +			return;
> > +		mm_cid_snapshot_time(rq, curr->mm);
> > +		mm_cid_put_lazy(curr);
> > +		curr->last_mm_cid = curr->mm_cid = mm_cid_get(rq,
> > curr, curr->mm);
> > +		if (old_cid != curr->mm_cid)
> > +			rseq_preempt(curr);
> > +	}
> >   }
> >   
> >   void sched_mm_cid_exit_signals(struct task_struct *t)
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 475bb5998295e..90a5b58188232 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -3606,6 +3606,7 @@ extern const char *preempt_modes[];
> >   
> >   #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/*
> > 100ms */
> >   #define MM_CID_SCAN_DELAY	100			/* 100ms
> > */
> > +#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
> >   
> >   extern raw_spinlock_t cid_lock;
> >   extern int use_cid_lock;
> > @@ -3809,6 +3810,7 @@ static inline int mm_cid_get(struct rq *rq,
> > struct task_struct *t,
> >   	int cid;
> >   
> >   	lockdep_assert_rq_held(rq);
> > +	t->last_cid_reset = jiffies;
> >   	cpumask = mm_cidmask(mm);
> >   	cid = __this_cpu_read(pcpu_cid->cid);
> >   	if (mm_cid_is_valid(cid)) {
> 



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches
  2025-08-26 18:10   ` Mathieu Desnoyers
@ 2025-08-28  8:36     ` Gabriele Monaco
  0 siblings, 0 replies; 6+ messages in thread
From: Gabriele Monaco @ 2025-08-28  8:36 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Ingo Molnar, linux-kernel, Andrew Morton, David Hildenbrand,
	Ingo Molnar, Peter Zijlstra, linux-mm, Thomas Gleixner

On Tue, 2025-08-26 at 14:10 -0400, Mathieu Desnoyers wrote:
> On 2025-07-16 12:06, Gabriele Monaco wrote:
> > Currently, task_mm_cid_work() is called from
> > resume_user_mode_work().
> > This can delay the execution of the corresponding thread for the
> > entire duration of the function, negatively affecting the response
> > in case of real time tasks.
> > In practice, we observe task_mm_cid_work increasing the latency of
> > 30-35us on a 128 cores system, this order of magnitude is
> > meaningful under PREEMPT_RT.
> > 
> > Run the task_mm_cid_work in batches of up to
> > CONFIG_RSEQ_CID_SCAN_BATCH CPUs, this reduces the duration of the
> > delay for each scan.
> > 
> > The task_mm_cid_work contains a mechanism to avoid running more
> > frequently than every 100ms. Keep this pseudo-periodicity only on
> > complete scans.
> > This means each call to task_mm_cid_work returns prematurely if the
> > period did not elapse and a scan is not ongoing (i.e. the next
> > batch to scan is not the first).
> > This way full scans are not excessively delayed while still keeping
> > each run, and introduced latency, short.
> 
> With your test hardware/workload as reference, do you have an idea of
> how many CPUs would be needed to require more than 100ms to iterate
> on all CPUs with the default scan batch size (8) ?

As you guessed, this strongly depends on the workload: workloads with
fewer threads are more likely to take longer.
I used cyclictest (threads with a 100us period) and hackbench (processes)
on a 128-CPU machine and measured the time to complete the scan (16
iterations) as well as the time between non-complete scans (not delayed
by 100ms):

cyclictest: delay 0-400 us, complete scan 1.5-2 ms
hackbench:  delay 5 us - 3 ms, complete scan 1.5-15 ms

So to answer your question, in the observed worst case for hackbench,
it would take more than 800 CPUs to reach the 100ms limit.

That said, the problematic latency was observed on a full scan (128
CPUs), so perhaps the default of 8 is a bit too conservative and could
easily be doubled.

Measurements showed these durations for each call to task_mm_cid_work:

batch size  8:  1-11 us (majority below 10)
batch size 16:  3-16 us (majority below 10)
batch size 32: 10-21 us (majority above 15)

20 us is considered a relevant latency on this machine, so 16 seems a
good tradeoff for a batch size to me.


I'm going to include those numbers in the next iteration of the series.

...
> > +cid_compact:
> > +	if (!try_cmpxchg(&mm->mm_cid_scan_batch, &this_batch,
> > next_batch))
> > +		return;
> >   	cidmask = mm_cidmask(mm);
> >   	/* Clear cids that were not recently used. */
> > -	for_each_possible_cpu(cpu)
> > +	idx = 0;
> > +	cpu = from_cpu;
> > +	for_each_cpu_from(cpu, cpu_possible_mask) {
> > +		if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
> 
> could do "if (idx++ == CONFIG_RSEQ_CID_SCAN_BATCH)"
> 
> > +			break;
> >   		sched_mm_cid_remote_clear_old(mm, cpu);
> > +		++idx;
> 
> and remove this ^
> 
> > +	}
> >   	weight = cpumask_weight(cidmask);
> >   	/*
> >   	 * Clear cids that are greater or equal to the cidmask
> > weight to
> >   	 * recompact it.
> >   	 */
> > -	for_each_possible_cpu(cpu)
> > +	idx = 0;
> > +	cpu = from_cpu;
> > +	for_each_cpu_from(cpu, cpu_possible_mask) {
> > +		if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
> 
> Likewise.
> 
> > +			break;
> >   		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> > +		++idx;
> 
> Likewise.

Sure, will do.

Thanks,
Gabriele



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-08-28  8:36 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20250716160603.138385-6-gmonaco@redhat.com>
2025-07-16 16:06 ` [PATCH v2 2/4] rseq: Run the mm_cid_compaction from rseq_handle_notify_resume() Gabriele Monaco
2025-08-26 18:01   ` Mathieu Desnoyers
2025-08-27  6:55     ` Gabriele Monaco
2025-07-16 16:06 ` [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches Gabriele Monaco
2025-08-26 18:10   ` Mathieu Desnoyers
2025-08-28  8:36     ` Gabriele Monaco

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).