Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH v1 08/11] rcu: Wake NOCB rcuog kthreads on expedited grace period completion
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

When an expedited grace period completes, rcu_exp_wait_wake() wakes
waiters on rnp->exp_wq[] but does not notify the NOCB rcuog kthreads.  An
rcuog kthread that is waiting for a grace period sleeps on the leaf
rcu_node's ->nocb_gp_wq[] with a wait condition based on the grace-period
state, so without a wakeup, callbacks on offloaded CPUs that could
benefit from the expedited GP must wait until the rcuog kthread wakes for
some other reason (e.g. the next normal GP or a timer).

Make the rcuog grace-period wait honour expedited GPs and wake it when
one completes:

 - nocb_gp_wait() now records the grace period to wait for as a struct
   rcu_gp_seq (both normal and expedited), tracks the earliest pending
   normal and expedited sequence across the group, and releases the wait
   via poll_state_synchronize_rcu_full() so it wakes for whichever
   completes first.  ->nocb_gp_seq is widened to struct rcu_gp_seq
   accordingly.

 - rcu_exp_wait_wake() calls the new rcu_nocb_exp_cleanup() on leaf
   nodes, which wakes both ->nocb_gp_wq[0] and ->nocb_gp_wq[1] (the
   expedited sequence does not share parity with the normal ->gp_seq the
   waiter indexed with).  Both this path and rcu_nocb_gp_cleanup() use
   the shared rcu_nocb_cleanup_wake() helper, which checks swait_active()
   first; the smp_mb() in rcu_gp_cleanup()/rcu_exp_wait_wake() orders the
   grace-period state update before that check.

A stub rcu_nocb_exp_cleanup() is provided for CONFIG_RCU_NOCB_CPU=n.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/tree.c      | 11 ++++-
 kernel/rcu/tree.h      |  3 +-
 kernel/rcu/tree_exp.h  |  2 +
 kernel/rcu/tree_nocb.h | 95 +++++++++++++++++++++++++++++++++++-------
 4 files changed, 94 insertions(+), 17 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d7e47dfcf702e..169d98ed52bbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2224,8 +2224,15 @@ static noinline void rcu_gp_cleanup(void)
 			dump_blkd_tasks(rnp, 10);
 		WARN_ON_ONCE(rnp->qsmask);
 		WRITE_ONCE(rnp->gp_seq, new_gp_seq);
-		if (!rnp->parent)
-			smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
+		if (!rnp->parent) {
+			/*
+			 * Order against failing poll_state_synchronize_rcu_full(),
+			 * and also against rcu_nocb_gp_cleanup() -> swait_active(),
+			 * which relies on this barrier to observe a waiter that
+			 * enqueued before re-checking the grace-period state.
+			 */
+			smp_mb();
+		}
 		rdp = this_cpu_ptr(&rcu_data);
 		if (rnp == rdp->mynode)
 			needgp = __note_gp_changes(rnp, rdp) || needgp;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 36330739d937c..79d3a656e5f73 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -268,7 +268,7 @@ struct rcu_data {
 	u8 nocb_gp_sleep;		/* Is the nocb GP thread asleep? */
 	u8 nocb_gp_bypass;		/* Found a bypass on last scan? */
 	u8 nocb_gp_gp;			/* GP to wait for on last scan? */
-	unsigned long nocb_gp_seq;	/*  If so, ->gp_seq to wait for. */
+	struct rcu_gp_seq nocb_gp_seq; /* If so, GP state to wait for. */
 	unsigned long nocb_gp_loops;	/* # passes through wait code. */
 	struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
 	bool nocb_cb_sleep;		/* Is the nocb CB thread asleep? */
@@ -511,6 +511,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
 static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
+static void rcu_nocb_exp_cleanup(struct rcu_node *rnp);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool wake_nocb_gp(struct rcu_data *rdp);
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 0569d8e40e86d..5c35e28708640 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -708,6 +708,8 @@ static void rcu_exp_wait_wake(unsigned long s)
 		}
 		smp_mb(); /* All above changes before wakeup. */
 		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
+		if (rcu_is_leaf_node(rnp))
+			rcu_nocb_exp_cleanup(rnp);
 	}
 	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
 	mutex_unlock(&rcu_state.exp_wake_mutex);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 263bb8a65a988..6da1b8f524768 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -170,13 +170,35 @@ static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
 		lockdep_assert_held(&rdp->nocb_lock);
 }
 
+static void rcu_nocb_cleanup_wake(struct swait_queue_head *sq)
+{
+	if (swait_active(sq))
+		swake_up_all(sq);
+}
+
 /*
  * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
  * grace period.
  */
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
-	swake_up_all(sq);
+	/*
+	 * swait_active() can be checked first because of the following
+	 * ordering, which pairs the smp_mb() in rcu_gp_cleanup() against
+	 * the implicit barrier in prepare_to_swait()/set_current_state()
+	 * on the nocb_gp_wait() side:
+	 *
+	 * rcu_gp_cleanup()                          nocb_gp_wait()
+	 * ---------------                           --------------
+	 * WRITE_ONCE(root->gp_seq, new_gp_seq);     swait_event_interruptible_exclusive(sq)
+	 * smp_mb()                                     prepare_to_swait()
+	 * if swait_active(sq)                             list_add_tail(...)
+	 *    swake_up_all(sq)                            set_current_state()
+	 *                                                  smp_mb()
+	 *                                             if (poll_state_synchronize_rcu_full())
+	 *                                                ...
+	 */
+	rcu_nocb_cleanup_wake(sq);
 }
 
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
@@ -190,6 +212,38 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 	init_swait_queue_head(&rnp->nocb_gp_wq[1]);
 }
 
+/*
+ * Wake NOCB rcuog kthreads on a leaf node so that they can advance
+ * callbacks that were waiting for the just-completed expedited GP.
+ *
+ * The rcuog kthread waiting for a grace period sleeps on the per-leaf-node
+ * ->nocb_gp_wq[] (not on its rdp_gp's ->nocb_gp_wq, which only signals that
+ * new callbacks have shown up), so this is the queue that must be woken.
+ * Both the even and odd waitqueues are woken because the expedited sequence
+ * does not share parity with the normal ->gp_seq the waiter indexed with.
+ */
+static void rcu_nocb_exp_cleanup(struct rcu_node *rnp)
+{
+	/*
+	 * swait_active() can be checked first because of the following
+	 * ordering, which pairs the smp_mb() in rcu_exp_wait_wake() against
+	 * the implicit barrier in prepare_to_swait()/set_current_state()
+	 * on the nocb_gp_wait() side:
+	 *
+	 * rcu_exp_wait_wake()                          nocb_gp_wait()
+	 * ---------------                              --------------
+	 * rcu_seq_end(&rcu_state.expedited_sequence);  swait_event_interruptible_exclusive(sq)
+	 * smp_mb()                                         prepare_to_swait()
+	 * if swait_active(sq)                                 list_add_tail(...)
+	 *    swake_up_all(sq)                                set_current_state()
+	 *                                                      smp_mb()
+	 *                                                 if (poll_state_synchronize_rcu_full())
+	 *                                                    ...
+	 */
+	rcu_nocb_cleanup_wake(&rnp->nocb_gp_wq[0]);
+	rcu_nocb_cleanup_wake(&rnp->nocb_gp_wq[1]);
+}
+
 /* Clear any pending deferred wakeup timer (nocb_gp_lock must be held). */
 static void nocb_defer_wakeup_cancel(struct rcu_data *rdp_gp)
 {
@@ -659,7 +713,6 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 {
 	bool bypass = false;
 	int __maybe_unused cpu = my_rdp->cpu;
-	struct rcu_gp_seq cur_gp_seq;
 	unsigned long flags;
 	bool gotcbs = false;
 	unsigned long j = jiffies;
@@ -669,7 +722,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 	bool needwake_gp;
 	struct rcu_data *rdp, *rdp_toggling = NULL;
 	struct rcu_node *rnp;
-	unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+	struct rcu_gp_seq wait_gp_seq = {0}; // Suppress "use uninitialized" warning.
 	bool wasempty = false;
 
 	/*
@@ -693,6 +746,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 	 * won't be ignored for long.
 	 */
 	list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+		struct rcu_gp_seq cur_gp_seq;
 		long bypass_ncbs;
 		bool flush_bypass = false;
 		long lazy_ncbs;
@@ -754,9 +808,15 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		 */
 		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
 		    !poll_state_synchronize_rcu_full(&cur_gp_seq)) {
-			if (!needwait_gp ||
-			    ULONG_CMP_LT(cur_gp_seq.norm, wait_gp_seq))
-				wait_gp_seq = cur_gp_seq.norm;
+			/*
+			 * Track the earliest pending normal and expedited GP
+			 * across the group so the wait below can be released by
+			 * whichever completes first.
+			 */
+			if (!needwait_gp || ULONG_CMP_LT(cur_gp_seq.norm, wait_gp_seq.norm))
+				wait_gp_seq.norm = cur_gp_seq.norm;
+			if (!needwait_gp || ULONG_CMP_LT(cur_gp_seq.exp, wait_gp_seq.exp))
+				wait_gp_seq.exp = cur_gp_seq.exp;
 			needwait_gp = true;
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("NeedWaitGP"));
@@ -778,7 +838,8 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 
 	my_rdp->nocb_gp_bypass = bypass;
 	my_rdp->nocb_gp_gp = needwait_gp;
-	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+	if (needwait_gp)
+		my_rdp->nocb_gp_seq = wait_gp_seq;
 
 	// At least one child with non-empty ->nocb_bypass, so set
 	// timer in order to avoid stranding its callbacks.
@@ -813,12 +874,12 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		nocb_gp_sleep(my_rdp, cpu);
 	} else {
 		rnp = my_rdp->mynode;
-		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq.norm, TPS("StartWait"));
 		swait_event_interruptible_exclusive(
-			rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
-			rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+			rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq.norm) & 0x1],
+			poll_state_synchronize_rcu_full(&wait_gp_seq) ||
 			!READ_ONCE(my_rdp->nocb_gp_sleep));
-		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq.norm, TPS("EndWait"));
 	}
 
 	if (!rcu_nocb_poll) {
@@ -852,7 +913,8 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		swake_up_one(&rdp_toggling->nocb_state_wq);
 	}
 
-	my_rdp->nocb_gp_seq = -1;
+	my_rdp->nocb_gp_seq.norm = -1;
+	my_rdp->nocb_gp_seq.exp = -1;
 	WARN_ON(signal_pending(current));
 }
 
@@ -1536,7 +1598,7 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
 {
 	struct rcu_node *rnp = rdp->mynode;
 
-	pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+	pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld/%ld rnp %d:%d %lu %c CPU %d%s\n",
 		rdp->cpu,
 		"kK"[!!rdp->nocb_gp_kthread],
 		"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
@@ -1548,7 +1610,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
 		".W"[swait_active(&rnp->nocb_gp_wq[1])],
 		".B"[!!rdp->nocb_gp_bypass],
 		".G"[!!rdp->nocb_gp_gp],
-		(long)rdp->nocb_gp_seq,
+		(long)rdp->nocb_gp_seq.norm,
+		(long)rdp->nocb_gp_seq.exp,
 		rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
 		rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
 		rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
@@ -1668,6 +1731,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
 }
 
+static void rcu_nocb_exp_cleanup(struct rcu_node *rnp)
+{
+}
+
 static bool wake_nocb_gp(struct rcu_data *rdp)
 {
 	return false;
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 07/11] rcu: Update comments for gp_seq and expedited GP tracking
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Update documentation comments throughout the RCU callback infrastructure
to reflect the transition from a single grace-period sequence number to
the full struct rcu_gp_seq that tracks both normal and expedited grace
periods.

The ->gp_seq[] array documentation in rcu_segcblist.h is updated to
describe dual (normal and expedited) GP tracking.  The
rcu_segcblist_advance(), rcu_segcblist_accelerate(), and
rcu_advance_cbs() comments are updated to refer to the struct rcu_gp_seq
state (gsp) instead of the old bare grace-period sequence number (seq).

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 include/linux/rcu_segcblist.h | 14 +++++++-----
 kernel/rcu/rcu_segcblist.c    | 43 +++++++++++++++++++++++------------
 kernel/rcu/tree.c             |  6 ++---
 3 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 137cc23b024c5..08b63ecf719b2 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -50,12 +50,14 @@ struct rcu_cblist {
  * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also
  * empty.
  *
- * The ->gp_seq[] array contains the grace-period number at which the
- * corresponding segment of callbacks will be ready to invoke.  A given
- * element of this array is meaningful only when the corresponding segment
- * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks
- * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have
- * not yet been assigned a grace-period number).
+ * The ->gp_seq[] array contains the grace-period state at which the
+ * corresponding segment of callbacks will be ready to invoke.  This tracks
+ * both normal and expedited grace periods, allowing callbacks to complete
+ * when either type of GP finishes.  A given element of this array is
+ * meaningful only when the corresponding segment is non-empty, and it is
+ * never valid for RCU_DONE_TAIL (whose callbacks are already ready to
+ * invoke) or for RCU_NEXT_TAIL (whose callbacks have not yet been assigned
+ * a grace-period state).
  */
 #define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
 #define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index cf8951d33e767..dd770006e7f8b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -495,7 +495,8 @@ static void rcu_segcblist_advance_compact(struct rcu_segcblist *rsclp, int i)
 
 /*
  * Advance the callbacks in the specified rcu_segcblist structure based
- * on the current value of the grace-period counter.
+ * on the current grace-period state.  Checks both normal and expedited
+ * grace periods, advancing callbacks when either GP type completes.
  */
 void rcu_segcblist_advance(struct rcu_segcblist *rsclp)
 {
@@ -506,8 +507,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp)
 		return;
 
 	/*
-	 * Find all callbacks whose ->gp_seq numbers indicate that they
-	 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
+	 * Find all callbacks whose grace periods have completed (either
+	 * normal or expedited) and put them into the RCU_DONE_TAIL segment.
+	 * We check against the current global GP state, which includes
+	 * proper memory barriers and handles special completion values.
 	 */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
 		if (!poll_state_synchronize_rcu_full(&rsclp->gp_seq[i]))
@@ -534,9 +537,9 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp)
  * them to complete at the end of the earlier grace period.
  *
  * This function operates on an rcu_segcblist structure, and also the
- * grace-period sequence number seq at which new callbacks would become
+ * grace-period state gsp at which new callbacks would become
  * ready to invoke.  Returns true if there are callbacks that won't be
- * ready to invoke until seq, false otherwise.
+ * ready to invoke until the grace period represented by gsp, false otherwise.
  */
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 {
@@ -548,11 +551,11 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 
 	/*
 	 * Find the segment preceding the oldest segment of callbacks
-	 * whose ->gp_seq[] completion is at or after that passed in via
-	 * "seq", skipping any empty segments.  This oldest segment, along
+	 * whose grace period completion is at or after that passed in via
+	 * "gsp", skipping any empty segments.  This oldest segment, along
 	 * with any later segments, can be merged in with any newly arrived
-	 * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
-	 * as their ->gp_seq[] grace-period completion sequence number.
+	 * callbacks in the RCU_NEXT_TAIL segment, and assigned "gsp"
+	 * as their grace-period completion state.
 	 */
 	for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
 		if (!rcu_segcblist_segempty(rsclp, i) &&
@@ -561,7 +564,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 
 	/*
 	 * If all the segments contain callbacks that correspond to
-	 * earlier grace-period sequence numbers than "seq", leave.
+	 * earlier grace-period sequence numbers than "gsp", leave.
 	 * Assuming that the rcu_segcblist structure has enough
 	 * segments in its arrays, this can only happen if some of
 	 * the non-done segments contain callbacks that really are
@@ -569,15 +572,15 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 	 * out by the next call to rcu_segcblist_advance().
 	 *
 	 * Also advance to the oldest segment of callbacks whose
-	 * ->gp_seq[] completion is at or after that passed in via "seq",
+	 * ->gp_seq[] completion is at or after that passed in via "gsp",
 	 * skipping any empty segments.
 	 *
 	 * Note that segment "i" (and any lower-numbered segments
 	 * containing older callbacks) will be unaffected, and their
-	 * grace-period numbers remain unchanged.  For example, if i ==
+	 * grace-period states remain unchanged.  For example, if i ==
 	 * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched.
 	 * Instead, the CBs in NEXT_TAIL will be merged with those in
-	 * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL
+	 * NEXT_READY_TAIL and the grace-period state of NEXT_READY_TAIL
 	 * would be updated.  NEXT_TAIL would then be empty.
 	 */
 	if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL)
@@ -589,8 +592,8 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 
 	/*
 	 * Merge all later callbacks, including newly arrived callbacks,
-	 * into the segment located by the for-loop above.  Assign "seq"
-	 * as the ->gp_seq[] value in order to correctly handle the case
+	 * into the segment located by the for-loop above.  Assign "gsp"
+	 * as the grace-period state in order to correctly handle the case
 	 * where there were no pending callbacks in the rcu_segcblist
 	 * structure other than in the RCU_NEXT_TAIL segment.
 	 */
@@ -644,6 +647,10 @@ void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 	if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
 		return;
 
+	/*
+	 * Find all callbacks whose normal GP sequence numbers indicate
+	 * that they are ready to invoke.  For SRCU, we only check norm.
+	 */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
 		if (ULONG_CMP_LT(seq, rsclp->gp_seq[i].norm))
 			break;
@@ -658,6 +665,12 @@ void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 	rcu_segcblist_advance_compact(rsclp, i);
 }
 
+/*
+ * SRCU wrapper for rcu_segcblist_accelerate() - converts SRCU's unsigned
+ * long GP sequence to rcu_gp_seq format with exp set to
+ * RCU_GET_STATE_NOT_TRACKED (since SRCU does not track a separate
+ * expedited GP sequence) and calls the core rcu_segcblist_accelerate().
+ */
 bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 {
 	struct rcu_gp_seq gs = { .norm = seq, .exp = RCU_GET_STATE_NOT_TRACKED };
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91c03887a1228..d7e47dfcf702e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1209,7 +1209,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
 /*
  * Move any callbacks whose grace period has completed to the
  * RCU_DONE_TAIL sublist, then compact the remaining sublists and
- * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
+ * assign ->gp_seq[] state to any callbacks in the RCU_NEXT_TAIL
  * sublist.  This function is idempotent, so it does not hurt to
  * invoke it repeatedly.  As long as it is not invoked -too- often...
  * Returns true if the RCU grace-period kthread needs to be awakened.
@@ -1226,8 +1226,8 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 		return false;
 
 	/*
-	 * Find all callbacks whose ->gp_seq numbers indicate that they
-	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+	 * Find all callbacks whose grace periods have completed (either
+	 * normal or expedited) and put them into the RCU_DONE_TAIL sublist.
 	 */
 	rcu_segcblist_advance(&rdp->cblist);
 
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 06/11] rcu: Enable RCU callbacks to benefit from expedited grace periods
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Currently, RCU callbacks only track normal grace-period sequence
numbers.  This means callbacks must wait for normal grace periods to
complete even when expedited grace periods have already elapsed.

Use the full struct rcu_gp_seq (which tracks both the normal and
expedited grace-period sequences) throughout the callback
infrastructure.

rcu_segcblist_advance() now checks both normal and expedited GP
completion via poll_state_synchronize_rcu_full(), and becomes
parameterless since it reads the grace-period state internally.
rcu_segcblist_accelerate() stores the full state (both sequences)
instead of just the normal one.  rcu_accelerate_cbs() and
rcu_accelerate_cbs_unlocked() use get_state_synchronize_rcu_full() to
capture both sequences, and the NOCB advance checks use
poll_state_synchronize_rcu_full() instead of comparing only the normal
sequence.

srcu_segcblist_advance() becomes a standalone implementation because it
compares SRCU sequences directly and cannot use
poll_state_synchronize_rcu_full(), which reads RCU-specific globals.
srcu_segcblist_accelerate() sets the ->exp field to
RCU_GET_STATE_NOT_TRACKED so that poll_state_synchronize_rcu_full()
compares only ->norm and ignores ->exp.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcu_segcblist.c | 30 +++++++++++++++++++++++-------
 kernel/rcu/rcu_segcblist.h |  2 +-
 kernel/rcu/tree.c          |  9 +++------
 kernel/rcu/tree_nocb.h     | 33 +++++++++++++++++++++++----------
 4 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 4e3dfe42bc097..cf8951d33e767 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+#include "rcu.h"
 #include "rcu_segcblist.h"
 
 /* Initialize simple callback list. */
@@ -494,9 +495,9 @@ static void rcu_segcblist_advance_compact(struct rcu_segcblist *rsclp, int i)
 
 /*
  * Advance the callbacks in the specified rcu_segcblist structure based
- * on the current value passed in for the grace-period counter.
+ * on the current value of the grace-period counter.
  */
-void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp)
 {
 	int i;
 
@@ -509,7 +510,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 	 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
 	 */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
-		if (ULONG_CMP_LT(gsp->norm, rsclp->gp_seq[i].norm))
+		if (!poll_state_synchronize_rcu_full(&rsclp->gp_seq[i]))
 			break;
 		WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
 		rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
@@ -595,7 +596,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 	 */
 	for (; i < RCU_NEXT_TAIL; i++) {
 		WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
-		rsclp->gp_seq[i].norm = gsp->norm;
+		rsclp->gp_seq[i] = *gsp;
 	}
 	return true;
 }
@@ -637,14 +638,29 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 
 void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	struct rcu_gp_seq gs = { .norm = seq };
+	int i;
+
+	WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
+	if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
+		return;
+
+	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+		if (ULONG_CMP_LT(seq, rsclp->gp_seq[i].norm))
+			break;
+		WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
+		rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
+	}
+
+	/* If no callbacks moved, nothing more need be done. */
+	if (i == RCU_WAIT_TAIL)
+		return;
 
-	rcu_segcblist_advance(rsclp, &gs);
+	rcu_segcblist_advance_compact(rsclp, i);
 }
 
 bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	struct rcu_gp_seq gs = { .norm = seq };
+	struct rcu_gp_seq gs = { .norm = seq, .exp = RCU_GET_STATE_NOT_TRACKED };
 
 	return rcu_segcblist_accelerate(rsclp, &gs);
 }
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 16b0cb6b32507..431c4466b8898 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -139,7 +139,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
 void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
-void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 095a023b19f1f..91c03887a1228 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1164,7 +1164,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * accelerating callback invocation to an earlier grace-period
 	 * number.
 	 */
-	gs.norm = rcu_seq_snap(&rcu_state.gp_seq);
+	get_state_synchronize_rcu_full(&gs);
 	if (rcu_segcblist_accelerate(&rdp->cblist, &gs))
 		ret = rcu_start_this_gp(rnp, rdp, gs.norm);
 
@@ -1193,7 +1193,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
 	bool needwake;
 
 	rcu_lockdep_assert_cblist_protected(rdp);
-	gs.norm = rcu_seq_snap(&rcu_state.gp_seq);
+	get_state_synchronize_rcu_full(&gs);
 	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, gs.norm)) {
 		/* Old request still live, so mark recent callbacks. */
 		(void)rcu_segcblist_accelerate(&rdp->cblist, &gs);
@@ -1218,8 +1218,6 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
  */
 static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 {
-	struct rcu_gp_seq gs;
-
 	rcu_lockdep_assert_cblist_protected(rdp);
 	raw_lockdep_assert_held_rcu_node(rnp);
 
@@ -1231,8 +1229,7 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * Find all callbacks whose ->gp_seq numbers indicate that they
 	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
 	 */
-	gs.norm = rnp->gp_seq;
-	rcu_segcblist_advance(&rdp->cblist, &gs);
+	rcu_segcblist_advance(&rdp->cblist);
 
 	/* Classify any remaining callbacks. */
 	return rcu_accelerate_cbs(rnp, rdp);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index e0274a2e1c1ae..263bb8a65a988 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -502,7 +502,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 		}
 		if (j != rdp->nocb_gp_adv_time &&
 		    rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
-		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq.norm)) {
+		    poll_state_synchronize_rcu_full(&cur_gp_seq)) {
 			rcu_advance_cbs_nowake(rdp->mynode, rdp);
 			rdp->nocb_gp_adv_time = j;
 		}
@@ -731,7 +731,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		if (!rcu_segcblist_restempty(&rdp->cblist,
 					     RCU_NEXT_READY_TAIL) ||
 		    (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
-		     rcu_seq_done(&rnp->gp_seq, cur_gp_seq.norm))) {
+		     poll_state_synchronize_rcu_full(&cur_gp_seq))) {
 			raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
 			needwake_gp = rcu_advance_cbs(rnp, rdp);
 			wasempty = rcu_segcblist_restempty(&rdp->cblist,
@@ -742,7 +742,18 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		WARN_ON_ONCE(wasempty &&
 			     !rcu_segcblist_restempty(&rdp->cblist,
 						      RCU_NEXT_READY_TAIL));
-		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+		/*
+		 * Only request a GP wait if the next pending callback's
+		 * GP has not already completed (normal or expedited).
+		 * If poll_state_synchronize_rcu_full() says it completed,
+		 * then rcu_advance_cbs() above already moved those
+		 * callbacks to RCU_DONE_TAIL, so there is no GP to wait
+		 * for.  Any remaining callbacks got new (future) GP
+		 * numbers from rcu_accelerate_cbs() inside
+		 * rcu_advance_cbs() and will be handled on the next pass.
+		 */
+		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+		    !poll_state_synchronize_rcu_full(&cur_gp_seq)) {
 			if (!needwait_gp ||
 			    ULONG_CMP_LT(cur_gp_seq.norm, wait_gp_seq))
 				wait_gp_seq = cur_gp_seq.norm;
@@ -919,7 +930,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
 	lockdep_assert_irqs_enabled();
 	rcu_nocb_lock_irqsave(rdp, flags);
 	if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
-	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq.norm) &&
+	    poll_state_synchronize_rcu_full(&cur_gp_seq) &&
 	    raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
 		needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
 		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
@@ -1548,8 +1559,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
 static void show_rcu_nocb_state(struct rcu_data *rdp)
 {
 	char bufd[22];
-	char bufw[45];
-	char bufr[45];
+	char bufw[64];
+	char bufr[64];
 	char bufn[22];
 	char bufb[22];
 	struct rcu_data *nocb_next_rdp;
@@ -1569,10 +1580,12 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
 					      nocb_entry_rdp);
 
 	sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
-	sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL],
-		rsclp->gp_seq[RCU_WAIT_TAIL].norm);
-	sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
-		rsclp->gp_seq[RCU_NEXT_READY_TAIL].norm);
+	sprintf(bufw, "%ld(%ld/%ld)", rsclp->seglen[RCU_WAIT_TAIL],
+		rsclp->gp_seq[RCU_WAIT_TAIL].norm,
+		rsclp->gp_seq[RCU_WAIT_TAIL].exp);
+	sprintf(bufr, "%ld(%ld/%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
+		rsclp->gp_seq[RCU_NEXT_READY_TAIL].norm,
+		rsclp->gp_seq[RCU_NEXT_READY_TAIL].exp);
 	sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
 	sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
 	pr_info("   CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 05/11] rcu: Add RCU_GET_STATE_NOT_TRACKED for subsystems without expedited GPs
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

SRCU and Tasks RCU do not track expedited grace periods.  When their
callback state is checked via poll_state_synchronize_rcu_full(), the
uninitialized or zeroed ->exp field is compared against
rcu_state.expedited_sequence, which could cause false-positive
completion detection.

This commit adds an RCU_GET_STATE_NOT_TRACKED sentinel value (0x2) that
these subsystems can place into ->exp to indicate that expedited GP
tracking is not applicable.  The expedited sequence check in
poll_state_synchronize_rcu_full() is skipped when ->exp holds this
sentinel.

This is needed to allow rcu_segcblist_advance() and rcu_accelerate_cbs()
to work with both normal and expedited grace periods via
get_state_synchronize_rcu_full() and poll_state_synchronize_rcu_full().

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcu.h  | 13 +++++++++++--
 kernel/rcu/tree.c |  3 ++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 14faa11ef23cd..39a9f6fa9a7b2 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -46,16 +46,25 @@
  *					the number of pending readers that will use
  *					this inactive index is bounded).
  *
- * RCU polled GP special control value:
+ * RCU polled GP special control values:
  *
  *	RCU_GET_STATE_COMPLETED :	State value indicating an already-completed
  *					polled GP has completed.  This value covers
  *					both the state and the counter of the
  *					grace-period sequence number.
+ *
+ *	RCU_GET_STATE_NOT_TRACKED :	State value indicating that a GP component
+ *					is not tracked by this subsystem and should
+ *					not be checked.  Used by SRCU and RCU Tasks
+ *					which do not track expedited GPs, to prevent
+ *					false-positive completion when their
+ *					gp_seq entries are checked via
+ *					poll_state_synchronize_rcu_full().
  */
 
-/* Low-order bit definition for polled grace-period APIs. */
+/* Low-order bit definitions for polled grace-period APIs. */
 #define RCU_GET_STATE_COMPLETED	0x1
+#define RCU_GET_STATE_NOT_TRACKED	0x2
 
 /* A complete grace period count */
 #define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1d65505460bc7..095a023b19f1f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3635,7 +3635,8 @@ bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 	if (gsp->norm == RCU_GET_STATE_COMPLETED ||
 	    rcu_seq_done_exact(&rnp->gp_seq, gsp->norm) ||
 	    gsp->exp == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rcu_state.expedited_sequence, gsp->exp)) {
+	    (gsp->exp != RCU_GET_STATE_NOT_TRACKED &&
+	     rcu_seq_done_exact(&rcu_state.expedited_sequence, gsp->exp))) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 04/11] rcu/segcblist: Track segment grace periods with struct rcu_gp_seq
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Change the type of the per-segment ->gp_seq[] array in struct
rcu_segcblist from unsigned long to struct rcu_gp_seq.  This prepares the
callback tracking infrastructure to record both normal and expedited
grace periods per segment.

The rcu_segcblist_nextgp(), rcu_segcblist_advance(), and
rcu_segcblist_accelerate() helpers now take a struct rcu_gp_seq * instead
of an unsigned long, and all callers use the .norm field for comparisons
and assignments.  The SRCU and Tasks RCU wrappers construct a struct
rcu_gp_seq with only .norm set and forward to the core helpers.

No functional change: only the .norm field is used.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 include/linux/rcu_segcblist.h |  2 +-
 include/trace/events/rcu.h    |  5 +++--
 kernel/rcu/rcu_segcblist.c    | 24 ++++++++++++++----------
 kernel/rcu/rcu_segcblist.h    |  6 +++---
 kernel/rcu/tree.c             | 25 ++++++++++++++-----------
 kernel/rcu/tree_nocb.h        | 21 +++++++++++----------
 6 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 2fdc2208f1ca3..137cc23b024c5 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -190,7 +190,7 @@ struct rcu_cblist {
 struct rcu_segcblist {
 	struct rcu_head *head;
 	struct rcu_head **tails[RCU_CBLIST_NSEGS];
-	unsigned long gp_seq[RCU_CBLIST_NSEGS];
+	struct rcu_gp_seq gp_seq[RCU_CBLIST_NSEGS];
 #ifdef CONFIG_RCU_NOCB_CPU
 	atomic_long_t len;
 #else
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 5fbdabe3faead..c84309c388343 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -547,10 +547,11 @@ TRACE_EVENT_RCU(rcu_segcb_stats,
 		),
 
 		TP_fast_assign(
+			int i;
 			__entry->ctx = ctx;
 			memcpy(__entry->seglen, rs->seglen, RCU_CBLIST_NSEGS * sizeof(long));
-			memcpy(__entry->gp_seq, rs->gp_seq, RCU_CBLIST_NSEGS * sizeof(unsigned long));
-
+			for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+				__entry->gp_seq[i] = rs->gp_seq[i].norm;
 		),
 
 		TP_printk("%s seglen: (DONE=%ld, WAIT=%ld, NEXT_READY=%ld, NEXT=%ld) "
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 421f1dadb5e55..4e3dfe42bc097 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -307,13 +307,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
 
 /*
  * Return false if there are no CBs awaiting grace periods, otherwise,
- * return true and store the nearest waited-upon grace period into *lp.
+ * return true and store the nearest waited-upon grace period state into *gsp.
  */
-bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 {
 	if (!rcu_segcblist_pend_cbs(rsclp))
 		return false;
-	*lp = rsclp->gp_seq[RCU_WAIT_TAIL];
+	*gsp = rsclp->gp_seq[RCU_WAIT_TAIL];
 	return true;
 }
 
@@ -496,7 +496,7 @@ static void rcu_segcblist_advance_compact(struct rcu_segcblist *rsclp, int i)
  * Advance the callbacks in the specified rcu_segcblist structure based
  * on the current value passed in for the grace-period counter.
  */
-void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 {
 	int i;
 
@@ -509,7 +509,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 	 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
 	 */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
-		if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
+		if (ULONG_CMP_LT(gsp->norm, rsclp->gp_seq[i].norm))
 			break;
 		WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
 		rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
@@ -537,7 +537,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
  * ready to invoke.  Returns true if there are callbacks that won't be
  * ready to invoke until seq, false otherwise.
  */
-bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 {
 	int i, j;
 
@@ -555,7 +555,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 	 */
 	for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
 		if (!rcu_segcblist_segempty(rsclp, i) &&
-		    ULONG_CMP_LT(rsclp->gp_seq[i], seq))
+		    ULONG_CMP_LT(rsclp->gp_seq[i].norm, gsp->norm))
 			break;
 
 	/*
@@ -595,7 +595,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 	 */
 	for (; i < RCU_NEXT_TAIL; i++) {
 		WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
-		rsclp->gp_seq[i] = seq;
+		rsclp->gp_seq[i].norm = gsp->norm;
 	}
 	return true;
 }
@@ -637,10 +637,14 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 
 void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	rcu_segcblist_advance(rsclp, seq);
+	struct rcu_gp_seq gs = { .norm = seq };
+
+	rcu_segcblist_advance(rsclp, &gs);
 }
 
 bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	return rcu_segcblist_accelerate(rsclp, seq);
+	struct rcu_gp_seq gs = { .norm = seq };
+
+	return rcu_segcblist_accelerate(rsclp, &gs);
 }
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 956f2967d9d29..16b0cb6b32507 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -124,7 +124,7 @@ bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
 bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
 struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
 struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
 void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
 			   struct rcu_head *rhp);
 bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -139,8 +139,8 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
 void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
-void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
-bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
 void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index af4b6daf6a0ff..1d65505460bc7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1142,7 +1142,7 @@ static void rcu_gp_kthread_wake(void)
  */
 static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 {
-	unsigned long gp_seq_req;
+	struct rcu_gp_seq gs;
 	bool ret = false;
 
 	rcu_lockdep_assert_cblist_protected(rdp);
@@ -1164,15 +1164,15 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * accelerating callback invocation to an earlier grace-period
 	 * number.
 	 */
-	gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
-	if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
-		ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);
+	gs.norm = rcu_seq_snap(&rcu_state.gp_seq);
+	if (rcu_segcblist_accelerate(&rdp->cblist, &gs))
+		ret = rcu_start_this_gp(rnp, rdp, gs.norm);
 
 	/* Trace depending on how much we were able to accelerate. */
 	if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
-		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
+		trace_rcu_grace_period(rcu_state.name, gs.norm, TPS("AccWaitCB"));
 	else
-		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+		trace_rcu_grace_period(rcu_state.name, gs.norm, TPS("AccReadyCB"));
 
 	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
 
@@ -1189,14 +1189,14 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
 					struct rcu_data *rdp)
 {
-	unsigned long c;
+	struct rcu_gp_seq gs;
 	bool needwake;
 
 	rcu_lockdep_assert_cblist_protected(rdp);
-	c = rcu_seq_snap(&rcu_state.gp_seq);
-	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
+	gs.norm = rcu_seq_snap(&rcu_state.gp_seq);
+	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, gs.norm)) {
 		/* Old request still live, so mark recent callbacks. */
-		(void)rcu_segcblist_accelerate(&rdp->cblist, c);
+		(void)rcu_segcblist_accelerate(&rdp->cblist, &gs);
 		return;
 	}
 	raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1218,6 +1218,8 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
  */
 static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 {
+	struct rcu_gp_seq gs;
+
 	rcu_lockdep_assert_cblist_protected(rdp);
 	raw_lockdep_assert_held_rcu_node(rnp);
 
@@ -1229,7 +1231,8 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * Find all callbacks whose ->gp_seq numbers indicate that they
 	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
 	 */
-	rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);
+	gs.norm = rnp->gp_seq;
+	rcu_segcblist_advance(&rdp->cblist, &gs);
 
 	/* Classify any remaining callbacks. */
 	return rcu_accelerate_cbs(rnp, rdp);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 373b877cf171d..e0274a2e1c1ae 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -433,7 +433,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				bool lazy)
 {
 	unsigned long c;
-	unsigned long cur_gp_seq;
+	struct rcu_gp_seq cur_gp_seq;
 	unsigned long j = jiffies;
 	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 	long lazy_len = READ_ONCE(rdp->lazy_len);
@@ -502,7 +502,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 		}
 		if (j != rdp->nocb_gp_adv_time &&
 		    rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
-		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq.norm)) {
 			rcu_advance_cbs_nowake(rdp->mynode, rdp);
 			rdp->nocb_gp_adv_time = j;
 		}
@@ -659,7 +659,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 {
 	bool bypass = false;
 	int __maybe_unused cpu = my_rdp->cpu;
-	unsigned long cur_gp_seq;
+	struct rcu_gp_seq cur_gp_seq;
 	unsigned long flags;
 	bool gotcbs = false;
 	unsigned long j = jiffies;
@@ -731,7 +731,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		if (!rcu_segcblist_restempty(&rdp->cblist,
 					     RCU_NEXT_READY_TAIL) ||
 		    (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
-		     rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+		     rcu_seq_done(&rnp->gp_seq, cur_gp_seq.norm))) {
 			raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
 			needwake_gp = rcu_advance_cbs(rnp, rdp);
 			wasempty = rcu_segcblist_restempty(&rdp->cblist,
@@ -744,8 +744,8 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 						      RCU_NEXT_READY_TAIL));
 		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
 			if (!needwait_gp ||
-			    ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
-				wait_gp_seq = cur_gp_seq;
+			    ULONG_CMP_LT(cur_gp_seq.norm, wait_gp_seq))
+				wait_gp_seq = cur_gp_seq.norm;
 			needwait_gp = true;
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("NeedWaitGP"));
@@ -877,7 +877,7 @@ static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
 static void nocb_cb_wait(struct rcu_data *rdp)
 {
 	struct rcu_segcblist *cblist = &rdp->cblist;
-	unsigned long cur_gp_seq;
+	struct rcu_gp_seq cur_gp_seq;
 	unsigned long flags;
 	bool needwake_gp = false;
 	struct rcu_node *rnp = rdp->mynode;
@@ -919,7 +919,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
 	lockdep_assert_irqs_enabled();
 	rcu_nocb_lock_irqsave(rdp, flags);
 	if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
-	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq.norm) &&
 	    raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
 		needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
 		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
@@ -1569,9 +1569,10 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
 					      nocb_entry_rdp);
 
 	sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
-	sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]);
+	sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL],
+		rsclp->gp_seq[RCU_WAIT_TAIL].norm);
 	sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
-		      rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+		rsclp->gp_seq[RCU_NEXT_READY_TAIL].norm);
 	sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
 	sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
 	pr_info("   CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 03/11] rcu/segcblist: Factor out rcu_segcblist_advance_compact() helper
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

This commit extracts the tail-pointer cleanup and segment compaction
logic from rcu_segcblist_advance() into a new static helper function,
rcu_segcblist_advance_compact(). This shared logic will be reused by the
upcoming srcu_segcblist_advance() standalone implementation, which
cannot call the core rcu_segcblist_advance() because that function will
use RCU-specific globals.

No functional change.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcu_segcblist.c | 50 ++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index da39d818b01b1..421f1dadb5e55 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -462,13 +462,43 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 	WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
 }
 
+/*
+ * Clean up and compact the segmented callback list after callbacks have been
+ * advanced to the RCU_DONE_TAIL segment.  The @i parameter is the index of the
+ * first segment that was NOT advanced (i.e., the segment after the last one
+ * moved to RCU_DONE_TAIL). This function fixes up tail pointers and compacts
+ * any gaps left by the moved segments.
+ */
+static void rcu_segcblist_advance_compact(struct rcu_segcblist *rsclp, int i)
+{
+	int j;
+
+	/* Clean up tail pointers that might have been misordered above. */
+	for (j = RCU_WAIT_TAIL; j < i; j++)
+		WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
+
+	/*
+	 * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
+	 * and a non-empty RCU_NEXT_READY_TAIL.  If so, copy the
+	 * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
+	 * created by the now-ready-to-invoke segments.
+	 */
+	for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+		if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
+			break;  /* No more callbacks. */
+		WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
+		rcu_segcblist_move_seglen(rsclp, i, j);
+		rsclp->gp_seq[j] = rsclp->gp_seq[i];
+	}
+}
+
 /*
  * Advance the callbacks in the specified rcu_segcblist structure based
  * on the current value passed in for the grace-period counter.
  */
 void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	int i, j;
+	int i;
 
 	WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
 	if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
@@ -489,23 +519,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 	if (i == RCU_WAIT_TAIL)
 		return;
 
-	/* Clean up tail pointers that might have been misordered above. */
-	for (j = RCU_WAIT_TAIL; j < i; j++)
-		WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
-
-	/*
-	 * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
-	 * and a non-empty RCU_NEXT_READY_TAIL.  If so, copy the
-	 * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
-	 * created by the now-ready-to-invoke segments.
-	 */
-	for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
-		if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
-			break;  /* No more callbacks. */
-		WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
-		rcu_segcblist_move_seglen(rsclp, i, j);
-		rsclp->gp_seq[j] = rsclp->gp_seq[i];
-	}
+	rcu_segcblist_advance_compact(rsclp, i);
 }
 
 /*
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 02/11] rcu/segcblist: Add SRCU and Tasks RCU wrapper functions
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Add srcu_segcblist_advance() and srcu_segcblist_accelerate() wrappers
that forward to the core rcu_segcblist_advance() and
rcu_segcblist_accelerate() functions, and switch all SRCU (srcutree.c)
and Tasks RCU (tasks.h) callers to use these wrappers.

This isolates SRCU and Tasks RCU from upcoming changes to the core
advance/accelerate functions, which will switch to struct
rcu_gp_seq for dual normal/expedited GP tracking. Because SRCU and
Tasks RCU use only normal GP sequences, their wrappers will maintain the
existing unsigned long interface.

No functional change.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcu_segcblist.c | 10 ++++++++++
 kernel/rcu/rcu_segcblist.h |  2 ++
 kernel/rcu/srcutree.c      | 14 +++++++-------
 kernel/rcu/tasks.h         |  8 ++++----
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 298a2c573f02c..da39d818b01b1 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -620,3 +620,13 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 
 	rcu_segcblist_init(src_rsclp);
 }
+
+void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+	rcu_segcblist_advance(rsclp, seq);
+}
+
+bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+	return rcu_segcblist_accelerate(rsclp, seq);
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index fadc08ad4b7b6..956f2967d9d29 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -143,3 +143,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
+void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
+bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 7c2f7cc131f7a..519a35719c896 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1351,7 +1351,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	 *  2) The grace period for RCU_WAIT_TAIL is seen as started but not
 	 *     completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
 	 *
-	 *  3) This value is passed to rcu_segcblist_advance() which can't move
+	 *  3) This value is passed to srcu_segcblist_advance() which can't move
 	 *     any segment forward and fails.
 	 *
 	 *  4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
@@ -1360,15 +1360,15 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	 *     RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
 	 *     so it returns a snapshot of the next grace period, which is X + 12.
 	 *
-	 *  5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
+	 *  5) The value of X + 12 is passed to srcu_segcblist_accelerate() but the
 	 *     freshly enqueued callback in RCU_NEXT_TAIL can't move to
 	 *     RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
 	 *     period (gp_num = X + 8). So acceleration fails.
 	 */
 	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
 	if (rhp) {
-		rcu_segcblist_advance(&sdp->srcu_cblist,
-				      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+		srcu_segcblist_advance(&sdp->srcu_cblist,
+				       rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
 		/*
 		 * Acceleration can never fail because the base current gp_seq
 		 * used for acceleration is <= the value of gp_seq used for
@@ -1376,7 +1376,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 		 * always be able to be emptied by the acceleration into the
 		 * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
 		 */
-		WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
+		WARN_ON_ONCE(!srcu_segcblist_accelerate(&sdp->srcu_cblist, s));
 	}
 	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
 		sdp->srcu_gp_seq_needed = s;
@@ -1891,8 +1891,8 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	rcu_cblist_init(&ready_cbs);
 	raw_spin_lock_irq_rcu_node(sdp);
 	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
-	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+	srcu_segcblist_advance(&sdp->srcu_cblist,
+			       rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
 	/*
 	 * Although this function is theoretically re-entrant, concurrent
 	 * callbacks invocation is disallowed to avoid executing an SRCU barrier
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index f4da5fad70f51..92971499a12c5 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -481,8 +481,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 			if (cpu > 0)
 				ncbsnz += n;
 		}
-		rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
-		(void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+		srcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+		(void)srcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
 		if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) {
 			if (rtp->lazy_jiffies)
 				rtpcp->urgent_gp--;
@@ -565,7 +565,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
 	if (rcu_segcblist_empty(&rtpcp->cblist))
 		return;
 	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
-	rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+	srcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
 	rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl);
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 	len = rcl.len;
@@ -578,7 +578,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
 	}
 	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
 	rcu_segcblist_add_len(&rtpcp->cblist, -len);
-	(void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+	(void)srcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 }
 
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 01/11] rcu: Rename struct rcu_gp_oldstate to rcu_gp_seq
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

The polled grace-period state structure rcu_gp_oldstate holds a snapshot
of the normal (and, on SMP, expedited) grace-period sequence numbers.
Upcoming changes store this structure in the callback segment list, where
the "oldstate" name reads poorly: there it represents the grace period a
segment is waiting on and is also compared against the current
grace-period state.

Rename struct rcu_gp_oldstate to the more neutral struct rcu_gp_seq, and
shorten its members rgos_norm and rgos_exp to norm and exp.  Local
variables and parameters of this type are renamed from rgosp/rgos to
gsp/gs accordingly.

While at it, provide a single definition of the structure in rcupdate.h
rather than separate Tiny-RCU and Tree-RCU definitions, and give it the
->exp field unconditionally.  Tiny RCU does not track expedited grace
periods and leaves ->exp unused, but a single definition that always has
->exp lets the shared callback code in rcu_segcblist.c reference it
without CONFIG_SMP guards, including on !SMP builds.

No functional change.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 include/linux/rcupdate.h      | 13 ++++++--
 include/linux/rcupdate_wait.h |  2 +-
 include/linux/rcutiny.h       | 36 +++++++++-----------
 include/linux/rcutree.h       | 29 +++++++---------
 kernel/rcu/rcutorture.c       | 30 ++++++++---------
 kernel/rcu/tiny.c             |  4 +--
 kernel/rcu/tree.c             | 62 +++++++++++++++++------------------
 kernel/rcu/tree_exp.h         | 18 +++++-----
 mm/slab_common.c              |  6 ++--
 9 files changed, 100 insertions(+), 100 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e95acc33989b..ce00f1726e95e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,9 +52,18 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void rcu_barrier_tasks(void);
 void synchronize_rcu(void);
 
-struct rcu_gp_oldstate;
+/*
+ * Grace-period sequence snapshot for the polled RCU APIs: ->norm for the
+ * normal grace period and ->exp for the expedited one.  ->exp is unused by
+ * Tiny RCU, but is present unconditionally so that a single definition
+ * serves both Tiny RCU and Tree RCU.
+ */
+struct rcu_gp_seq {
+	unsigned long norm;
+	unsigned long exp;
+};
 unsigned long get_completed_synchronize_rcu(void);
-void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+void get_completed_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 
 // Maximum number of unsigned long values corresponding to
 // not-yet-completed RCU grace periods.
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index 4c92d4291cce7..fa884704a3b79 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -18,7 +18,7 @@ struct rcu_synchronize {
 	struct completion completion;
 
 	/* This is for debugging. */
-	struct rcu_gp_oldstate oldstate;
+	struct rcu_gp_seq oldstate;
 };
 void wakeme_after_rcu(struct rcu_head *head);
 
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index f519cd6802286..e56ded733b1b5 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -14,11 +14,7 @@
 
 #include <asm/param.h> /* for HZ */
 
-struct rcu_gp_oldstate {
-	unsigned long rgos_norm;
-};
-
-// Maximum number of rcu_gp_oldstate values corresponding to
+// Maximum number of rcu_gp_seq values corresponding to
 // not-yet-completed RCU grace periods.
 #define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 2
 
@@ -26,31 +22,31 @@ struct rcu_gp_oldstate {
  * Are the two oldstate values the same?  See the Tree RCU version for
  * docbook header.
  */
-static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1,
-						   struct rcu_gp_oldstate *rgosp2)
+static inline bool same_state_synchronize_rcu_full(struct rcu_gp_seq *rgosp1,
+						   struct rcu_gp_seq *rgosp2)
 {
-	return rgosp1->rgos_norm == rgosp2->rgos_norm;
+	return rgosp1->norm == rgosp2->norm;
 }
 
 unsigned long get_state_synchronize_rcu(void);
 
-static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+static inline void get_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = get_state_synchronize_rcu();
+	gsp->norm = get_state_synchronize_rcu();
 }
 
 unsigned long start_poll_synchronize_rcu(void);
 
-static inline void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+static inline void start_poll_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = start_poll_synchronize_rcu();
+	gsp->norm = start_poll_synchronize_rcu();
 }
 
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 
-static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	return poll_state_synchronize_rcu(rgosp->rgos_norm);
+	return poll_state_synchronize_rcu(gsp->norm);
 }
 
 static inline void cond_synchronize_rcu(unsigned long oldstate)
@@ -58,9 +54,9 @@ static inline void cond_synchronize_rcu(unsigned long oldstate)
 	might_sleep();
 }
 
-static inline void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+static inline void cond_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	cond_synchronize_rcu(rgosp->rgos_norm);
+	cond_synchronize_rcu(gsp->norm);
 }
 
 static inline unsigned long start_poll_synchronize_rcu_expedited(void)
@@ -68,9 +64,9 @@ static inline unsigned long start_poll_synchronize_rcu_expedited(void)
 	return start_poll_synchronize_rcu();
 }
 
-static inline void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+static inline void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = start_poll_synchronize_rcu_expedited();
+	gsp->norm = start_poll_synchronize_rcu_expedited();
 }
 
 static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
@@ -78,9 +74,9 @@ static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
 	cond_synchronize_rcu(oldstate);
 }
 
-static inline void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+static inline void cond_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp)
 {
-	cond_synchronize_rcu_expedited(rgosp->rgos_norm);
+	cond_synchronize_rcu_expedited(gsp->norm);
 }
 
 extern void rcu_barrier(void);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 9d2d7bd251d4f..16a04202888b4 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -38,12 +38,7 @@ void synchronize_rcu_expedited(void);
 void rcu_barrier(void);
 void rcu_momentary_eqs(void);
 
-struct rcu_gp_oldstate {
-	unsigned long rgos_norm;
-	unsigned long rgos_exp;
-};
-
-// Maximum number of rcu_gp_oldstate values corresponding to
+// Maximum number of rcu_gp_seq values corresponding to
 // not-yet-completed RCU grace periods.
 #define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4
 
@@ -60,29 +55,29 @@ struct rcu_gp_oldstate {
  * to a list header, allowing those structures to be slightly smaller.
  *
  * Note that equality is judged on a bitwise basis, so that an
- * @rcu_gp_oldstate structure with an already-completed state in one field
+ * @rcu_gp_seq structure with an already-completed state in one field
  * will compare not-equal to a structure with an already-completed state
- * in the other field.  After all, the @rcu_gp_oldstate structure is opaque
+ * in the other field.  After all, the @rcu_gp_seq structure is opaque
  * so how did such a situation come to pass in the first place?
  */
-static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1,
-						   struct rcu_gp_oldstate *rgosp2)
+static inline bool same_state_synchronize_rcu_full(struct rcu_gp_seq *rgosp1,
+						   struct rcu_gp_seq *rgosp2)
 {
-	return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp;
+	return rgosp1->norm == rgosp2->norm && rgosp1->exp == rgosp2->exp;
 }
 
 unsigned long start_poll_synchronize_rcu_expedited(void);
-void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp);
 void cond_synchronize_rcu_expedited(unsigned long oldstate);
-void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp);
 unsigned long get_state_synchronize_rcu(void);
-void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+void get_state_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 unsigned long start_poll_synchronize_rcu(void);
-void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+void start_poll_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
-bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 void cond_synchronize_rcu(unsigned long oldstate);
-void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+void cond_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 
 #ifdef CONFIG_PROVE_RCU
 void rcu_irq_exit_check_preempt(void);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d4ebeeeab440..b09e15746a08c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -393,23 +393,23 @@ struct rcu_torture_ops {
 	void (*exp_current)(void);
 	unsigned long (*get_gp_state_exp)(void);
 	unsigned long (*start_gp_poll_exp)(void);
-	void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
+	void (*start_gp_poll_exp_full)(struct rcu_gp_seq *gsp);
 	bool (*poll_gp_state_exp)(unsigned long oldstate);
 	void (*cond_sync_exp)(unsigned long oldstate);
-	void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
+	void (*cond_sync_exp_full)(struct rcu_gp_seq *gsp);
 	unsigned long (*get_comp_state)(void);
-	void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp);
+	void (*get_comp_state_full)(struct rcu_gp_seq *gsp);
 	bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2);
-	bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
+	bool (*same_gp_state_full)(struct rcu_gp_seq *rgosp1, struct rcu_gp_seq *rgosp2);
 	unsigned long (*get_gp_state)(void);
-	void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+	void (*get_gp_state_full)(struct rcu_gp_seq *gsp);
 	unsigned long (*start_gp_poll)(void);
-	void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
+	void (*start_gp_poll_full)(struct rcu_gp_seq *gsp);
 	bool (*poll_gp_state)(unsigned long oldstate);
-	bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+	bool (*poll_gp_state_full)(struct rcu_gp_seq *gsp);
 	bool (*poll_need_2gp)(bool poll, bool poll_full);
 	void (*cond_sync)(unsigned long oldstate);
-	void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
+	void (*cond_sync_full)(struct rcu_gp_seq *gsp);
 	int poll_active;
 	int poll_active_full;
 	call_rcu_func_t call;
@@ -1608,7 +1608,7 @@ static void rcu_torture_write_types(void)
 static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
 {
 	unsigned long cookie;
-	struct rcu_gp_oldstate cookie_full;
+	struct rcu_gp_seq cookie_full;
 	bool dopoll;
 	bool dopoll_full;
 	unsigned long r = torture_random(trsp);
@@ -1656,18 +1656,18 @@ rcu_torture_writer(void *arg)
 	bool booting_still = false;
 	bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
 	unsigned long cookie;
-	struct rcu_gp_oldstate cookie_full;
+	struct rcu_gp_seq cookie_full;
 	int expediting = 0;
 	unsigned long gp_snap;
 	unsigned long gp_snap1;
-	struct rcu_gp_oldstate gp_snap_full;
-	struct rcu_gp_oldstate gp_snap1_full;
+	struct rcu_gp_seq gp_snap_full;
+	struct rcu_gp_seq gp_snap1_full;
 	int i;
 	int idx;
 	unsigned long j;
 	struct work_struct lazy_work;
 	int oldnice = task_nice(current);
-	struct rcu_gp_oldstate *rgo = NULL;
+	struct rcu_gp_seq *rgo = NULL;
 	int rgo_size = 0;
 	struct rcu_torture *rp;
 	struct rcu_torture *old_rp;
@@ -1966,7 +1966,7 @@ static int
 rcu_torture_fakewriter(void *arg)
 {
 	unsigned long gp_snap;
-	struct rcu_gp_oldstate gp_snap_full;
+	struct rcu_gp_seq gp_snap_full;
 	DEFINE_TORTURE_RANDOM(rand);
 
 	VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
@@ -2404,7 +2404,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct
 struct rcu_torture_one_read_state {
 	bool checkpolling;
 	unsigned long cookie;
-	struct rcu_gp_oldstate cookie_full;
+	struct rcu_gp_seq cookie_full;
 	unsigned long started;
 	struct rcu_torture *p;
 	int readstate;
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 585cade21010e..dccccd6be9411 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -187,9 +187,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * Store a grace-period-counter "cookie".  For more information,
  * see the Tree RCU header comment.
  */
-void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void get_completed_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+	gsp->norm = RCU_GET_STATE_COMPLETED;
 }
 EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e23d57f743912..af4b6daf6a0ff 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3290,7 +3290,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * Later on, this could in theory be the case for kernels built with
  * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
  * is not a common case.  Furthermore, this optimization would cause
- * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * the rcu_gp_seq structure to expand by 50%, so this potential
  * grace-period optimization is ignored once the scheduler is running.
  */
 static int rcu_blocking_is_gp(void)
@@ -3419,16 +3419,16 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
 
 /**
  * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
- * @rgosp: Place to put state cookie
+ * @gsp: Place to put state cookie
  *
- * Stores into @rgosp a value that will always be treated by functions
+ * Stores into @gsp a value that will always be treated by functions
  * like poll_state_synchronize_rcu_full() as a cookie whose grace period
  * has already completed.
  */
-void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void get_completed_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
-	rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
+	gsp->norm = RCU_GET_STATE_COMPLETED;
+	gsp->exp = RCU_GET_STATE_COMPLETED;
 }
 EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
 
@@ -3452,13 +3452,13 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
 /**
  * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
- * @rgosp: location to place combined normal/expedited grace-period state
+ * @gsp: location to place combined normal/expedited grace-period state
  *
- * Places the normal and expedited grace-period states in @rgosp.  This
+ * Places the normal and expedited grace-period states in @gsp.  This
  * state value can be passed to a later call to cond_synchronize_rcu_full()
  * or poll_state_synchronize_rcu_full() to determine whether or not a
  * grace period (whether normal or expedited) has elapsed in the meantime.
- * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * The rcu_gp_seq structure takes up twice the memory of an unsigned
  * long, but is guaranteed to see all grace periods.  In contrast, the
  * combined state occupies less memory, but can sometimes fail to take
  * grace periods into account.
@@ -3466,7 +3466,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
  * This does not guarantee that the needed grace period will actually
  * start.
  */
-void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void get_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
 	/*
 	 * Any prior manipulation of RCU-protected data must happen
@@ -3478,8 +3478,8 @@ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
 	// in poll_state_synchronize_rcu_full() notwithstanding.  Use of
 	// the latter here would result in too-short grace periods due to
 	// interactions with newly onlined CPUs.
-	rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq);
-	rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
+	gsp->norm = rcu_seq_snap(&rcu_state.gp_seq);
+	gsp->exp = rcu_seq_snap(&rcu_state.expedited_sequence);
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
 
@@ -3530,18 +3530,18 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
 
 /**
  * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
- * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ * @gsp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
  *
- * Places the normal and expedited grace-period states in *@rgos.  This
+ * Places the normal and expedited grace-period states in *@gsp.  This
  * state value can be passed to a later call to cond_synchronize_rcu_full()
  * or poll_state_synchronize_rcu_full() to determine whether or not a
  * grace period (whether normal or expedited) has elapsed in the meantime.
  * If the needed grace period is not already slated to start, notifies
  * RCU core of the need for that grace period.
  */
-void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void start_poll_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	get_state_synchronize_rcu_full(rgosp);
+	get_state_synchronize_rcu_full(gsp);
 
 	start_poll_synchronize_rcu_common();
 }
@@ -3593,19 +3593,19 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 
 /**
  * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
- * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ * @gsp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
  *
  * If a full RCU grace period has elapsed since the earlier call from
- * which *rgosp was obtained, return @true, otherwise return @false.
+ * which *gsp was obtained, return @true, otherwise return @false.
  * If @false is returned, it is the caller's responsibility to invoke this
  * function later on until it does return @true.  Alternatively, the caller
- * can explicitly wait for a grace period, for example, by passing @rgosp
+ * can explicitly wait for a grace period, for example, by passing @gsp
  * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
  *
  * Yes, this function does not take counter wrap into account.
  * But counter wrap is harmless.  If the counter wraps, we have waited
  * for more than a billion grace periods (and way more on a 64-bit
- * system!).  Those needing to keep rcu_gp_oldstate values for very
+ * system!).  Those needing to keep rcu_gp_seq values for very
  * long time periods (many hours even on 32-bit systems) should check
  * them occasionally and either refresh them or set a flag indicating
  * that the grace period has completed.  Alternatively, they can use
@@ -3614,7 +3614,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
  *
  * This function provides the same memory-ordering guarantees that would
  * be provided by a synchronize_rcu() that was invoked at the call to
- * the function that provided @rgosp, and that returned at the end of this
+ * the function that provided @gsp, and that returned at the end of this
  * function.  And this guarantee requires that the root rcu_node structure's
  * ->gp_seq field be checked instead of that of the rcu_state structure.
  * The problem is that the just-ending grace-period's callbacks can be
@@ -3624,15 +3624,15 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
  * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
  * then the root rcu_node structure is the one that needs to be polled.
  */
-bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
 	struct rcu_node *rnp = rcu_get_root();
 
 	smp_mb(); // Order against root rcu_node structure grace-period cleanup.
-	if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
-	    rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
+	if (gsp->norm == RCU_GET_STATE_COMPLETED ||
+	    rcu_seq_done_exact(&rnp->gp_seq, gsp->norm) ||
+	    gsp->exp == RCU_GET_STATE_COMPLETED ||
+	    rcu_seq_done_exact(&rcu_state.expedited_sequence, gsp->exp)) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
@@ -3667,11 +3667,11 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
 /**
  * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
- * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ * @gsp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
  *
  * If a full RCU grace period has elapsed since the call to
  * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
- * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * or start_poll_synchronize_rcu_expedited_full() from which @gsp was
  * obtained, just return.  Otherwise, invoke synchronize_rcu() to wait
  * for a full grace period.
  *
@@ -3682,12 +3682,12 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
  *
  * This function provides the same memory-ordering guarantees that
  * would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @rgosp and that returned at the end of
+ * to the function that provided @gsp and that returned at the end of
  * this function.
  */
-void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void cond_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	if (!poll_state_synchronize_rcu_full(rgosp))
+	if (!poll_state_synchronize_rcu_full(gsp))
 		synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a43469da39269..0569d8e40e86d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -1064,18 +1064,18 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
 
 /**
  * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
- * @rgosp: Place to put snapshot of grace-period state
+ * @gsp: Place to put snapshot of grace-period state
  *
- * Places the normal and expedited grace-period states in rgosp.  This
+ * Places the normal and expedited grace-period states in gsp.  This
  * state value can be passed to a later call to cond_synchronize_rcu_full()
  * or poll_state_synchronize_rcu_full() to determine whether or not a
  * grace period (whether normal or expedited) has elapsed in the meantime.
  * If the needed expedited grace period is not already slated to start,
  * initiates that grace period.
  */
-void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp)
 {
-	get_state_synchronize_rcu_full(rgosp);
+	get_state_synchronize_rcu_full(gsp);
 	(void)start_poll_synchronize_rcu_expedited();
 }
 EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
@@ -1109,11 +1109,11 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
 
 /**
  * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
- * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ * @gsp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
  *
  * If a full RCU grace period has elapsed since the call to
  * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
- * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * or start_poll_synchronize_rcu_expedited_full() from which @gsp was
  * obtained, just return.  Otherwise, invoke synchronize_rcu_expedited()
  * to wait for a full grace period.
  *
@@ -1124,12 +1124,12 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
  *
  * This function provides the same memory-ordering guarantees that
  * would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @rgosp and that returned at the end of
+ * to the function that provided @gsp and that returned at the end of
  * this function.
  */
-void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp)
 {
-	if (!poll_state_synchronize_rcu_full(rgosp))
+	if (!poll_state_synchronize_rcu_full(gsp))
 		synchronize_rcu_expedited();
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d5a70a831a2a5..f4ff50527db3a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1322,7 +1322,7 @@ static struct workqueue_struct *rcu_reclaim_wq;
  */
 struct kvfree_rcu_bulk_data {
 	struct list_head list;
-	struct rcu_gp_oldstate gp_snap;
+	struct rcu_gp_seq gp_snap;
 	unsigned long nr_records;
 	void *records[] __counted_by(nr_records);
 };
@@ -1347,7 +1347,7 @@ struct kvfree_rcu_bulk_data {
 struct kfree_rcu_cpu_work {
 	struct rcu_work rcu_work;
 	struct rcu_head *head_free;
-	struct rcu_gp_oldstate head_free_gp_snap;
+	struct rcu_gp_seq head_free_gp_snap;
 	struct list_head bulk_head_free[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu *krcp;
 };
@@ -1555,7 +1555,7 @@ static void kfree_rcu_work(struct work_struct *work)
 	struct rcu_head *head;
 	struct kfree_rcu_cpu *krcp;
 	struct kfree_rcu_cpu_work *krwp;
-	struct rcu_gp_oldstate head_gp_snap;
+	struct rcu_gp_seq head_gp_snap;
 	int i;
 
 	krwp = container_of(to_rcu_work(work),
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 00/11] RCU: Enable callbacks to benefit from expedited grace periods
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao

This series lets call_rcu() callbacks be reclaimed as soon as either a
normal or an expedited grace period that covers them has elapsed, rather
than always waiting for a normal grace period.

Motivation
==========
Today there is an asymmetry: synchronize_rcu_expedited() callers get fast
reclaim, but call_rcu() callers never benefit from those same expedited
grace periods, even though an expedited GP proves exactly the same thing
as a normal one -- all pre-existing readers are done.  When expedited GPs
are running on the system (driven by other subsystems), call_rcu()
callbacks that could already be freed instead sit in RCU_WAIT_TAIL until
the next normal GP.  This series treats a grace period as a grace period
regardless of how it was driven, so memory is reclaimed sooner.

Design
======
Callback segments now record both the normal and expedited grace-period
sequence in struct rcu_gp_seq, and rcu_segcblist_advance() releases a
segment as soon as poll_state_synchronize_rcu_full() reports that either
has completed.  Three notification paths are taught about expedited
completion so the advance actually happens: the NOCB rcuog kthreads,
the rcu_pending() tick gate, and rcu_core().

Changelog:
RFC: https://lore.kernel.org/all/20260417231203.785172-1-puranjay@kernel.org/
Changes in v1:
 - New prep patch 1 renames struct rcu_gp_oldstate to struct rcu_gp_seq
   and its fields rgos_norm/rgos_exp to norm/exp tree-wide (Frederic).
 - The rcu_segcblist segment field stays named gp_seq; only its type
   changes (Frederic).
 - Patch 8 (NOCB wake) is reworked.  The RFC woke the wrong waitqueue
   (rdp_gp->nocb_gp_wq via wake_nocb_gp() rather than the leaf
   rnp->nocb_gp_wq[] that an rcuog kthread waiting for a GP sleeps on),
   and the wait condition only checked the normal ->gp_seq.  The rcuog
   grace-period wait now tracks a struct rcu_gp_seq and is released via
   poll_state_synchronize_rcu_full(); rcu_exp_wait_wake() wakes the leaf
   node through the new rcu_nocb_exp_cleanup() (Frederic).
 - rcu_pending() uses a new memory-ordering-free
   poll_state_synchronize_rcu_full_unordered() to avoid memory barriers
   on every tick, leaving the ordering duty to rcu_core() (Frederic).

Still open: Frederic asked whether the first smp_mb() in
poll_state_synchronize_rcu_full() is needed on the callback-advance path
(patch 6).  That path still uses the fully ordered helper; only
rcu_pending() was switched to the unordered variant.  Happy to revisit.

Puranjay Mohan (11):
  rcu: Rename struct rcu_gp_oldstate to rcu_gp_seq
  rcu/segcblist: Add SRCU and Tasks RCU wrapper functions
  rcu/segcblist: Factor out rcu_segcblist_advance_compact() helper
  rcu/segcblist: Track segment grace periods with struct rcu_gp_seq
  rcu: Add RCU_GET_STATE_NOT_TRACKED for subsystems without expedited
    GPs
  rcu: Enable RCU callbacks to benefit from expedited grace periods
  rcu: Update comments for gp_seq and expedited GP tracking
  rcu: Wake NOCB rcuog kthreads on expedited grace period completion
  rcu: Detect expedited grace period completion in rcu_pending()
  rcu: Advance callbacks for expedited GP completion in rcu_core()
  rcuscale: Add concurrent expedited GP threads for callback scaling
    tests

 include/linux/rcu_segcblist.h |  16 ++--
 include/linux/rcupdate.h      |  13 ++-
 include/linux/rcupdate_wait.h |   2 +-
 include/linux/rcutiny.h       |  36 ++++-----
 include/linux/rcutree.h       |  29 +++----
 include/trace/events/rcu.h    |   5 +-
 kernel/rcu/rcu.h              |  13 ++-
 kernel/rcu/rcu_segcblist.c    | 139 ++++++++++++++++++++++----------
 kernel/rcu/rcu_segcblist.h    |   8 +-
 kernel/rcu/rcuscale.c         |  84 ++++++++++++++++++-
 kernel/rcu/rcutorture.c       |  30 +++----
 kernel/rcu/srcutree.c         |  14 ++--
 kernel/rcu/tasks.h            |   8 +-
 kernel/rcu/tiny.c             |   4 +-
 kernel/rcu/tree.c             | 147 ++++++++++++++++++++++------------
 kernel/rcu/tree.h             |   3 +-
 kernel/rcu/tree_exp.h         |  20 ++---
 kernel/rcu/tree_nocb.h        | 131 ++++++++++++++++++++++++------
 mm/slab_common.c              |   6 +-
 19 files changed, 496 insertions(+), 212 deletions(-)


base-commit: 709d17a22bfac78765f6cbaec42e15bcd4aa4f08
-- 
2.53.0-Meta


^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: Christian König @ 2026-06-24 13:23 UTC (permalink / raw)
  To: Kaitao Cheng, David Laight
  Cc: Andrew Morton, David Hildenbrand, Jens Axboe, Tejun Heo,
	Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt, David Howells,
	Simona Vetter, Randy Dunlap, Luca Ceresoli, Philipp Stanner,
	linux-block, linux-kernel, cgroups, linux-ntfs-dev, linux-fsdevel,
	io-uring, audit, bpf, netdev, dri-devel, linux-perf-users,
	linux-trace-kernel, kexec, live-patching, linux-modules,
	linux-crypto, linux-pm, rcu, sched-ext, linux-mm, virtualization,
	damon, llvm, Kaitao Cheng
In-Reply-To: <351a6b67-b394-4c58-aee2-88b6c8089ad5@linux.dev>

On 6/24/26 15:14, Kaitao Cheng wrote:
> 
> 
> 在 2026/6/22 16:42, David Laight 写道:
>> On Mon, 22 Jun 2026 12:05:31 +0800
>> Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>>
>>> From: Kaitao Cheng <chengkaitao@kylinos.cn>
>>>
>>> The list_for_each*_safe() helpers are used when the loop body may
>>> remove the current entry.  Their API exposes the temporary cursor at
>>> every call site, even though most users only need it for the iterator
>>> implementation and never reference it in the loop body.
>>>
>>> Add *_mutable() variants for list and hlist iteration.  The new helpers
>>> support both forms: callers may keep passing an explicit temporary cursor
>>> when they need to inspect or reset it, or omit it and let the helper use
>>> a unique internal cursor.
>>
>> I'm not really sure 'mutable' means anything either.
>> It is possible to make it valid for the loop body (or even other threads)
>> to delete arbitrary list items - but that needs significant extra overheads.
>>
>> It might be worth doing something that doesn't need the extra variable,
>> but there is little point doing all the churn just to rename things.
>>
>>>
>>> This makes call sites that only mutate the list through the current entry
>>> less noisy, while keeping the existing *_safe() helpers available for
>>> compatibility.
>>>
>>> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
>>> ---
>>>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>>>  1 file changed, 231 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/include/linux/list.h b/include/linux/list.h
>>> index 09d979976b3b..1081def7cea9 100644
>>> --- a/include/linux/list.h
>>> +++ b/include/linux/list.h
>>> @@ -7,6 +7,7 @@
>>>  #include <linux/stddef.h>
>>>  #include <linux/poison.h>
>>>  #include <linux/const.h>
>>> +#include <linux/args.h>
>>>  
>>>  #include <asm/barrier.h>
>>>  
>>> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>>>  #define list_for_each_prev(pos, head) \
>>>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>>>  
>>> -/**
>>> - * list_for_each_safe - iterate over a list safe against removal of list entry
>>> - * @pos:	the &struct list_head to use as a loop cursor.
>>> - * @n:		another &struct list_head to use as temporary storage
>>> - * @head:	the head for your list.
>>> +/*
>>> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>>>   */
>>>  #define list_for_each_safe(pos, n, head) \
>>>  	for (pos = (head)->next, n = pos->next; \
>>>  	     !list_is_head(pos, (head)); \
>>>  	     pos = n, n = pos->next)
>>>  
>>> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
>>> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\
>>
>> Use auto
>>
>>> +	     !list_is_head(pos, (head));				\
>>> +	     pos = tmp, tmp = pos->next)
>>> +
>>> +#define __list_for_each_mutable1(pos, head)				\
>>> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
>>> +
>>> +#define __list_for_each_mutable2(pos, next, head)			\
>>> +	list_for_each_safe(pos, next, head)
>>> +
>>>  /**
>>> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
>>> + * list_for_each_mutable - iterate over a list safe against entry removal
>>>   * @pos:	the &struct list_head to use as a loop cursor.
>>> - * @n:		another &struct list_head to use as temporary storage
>>> - * @head:	the head for your list.
>>> + * @...:	either (head) or (next, head)
>>> + *
>>> + * next:	another &struct list_head to use as optional temporary storage.
>>> + *		The temporary cursor is internal unless explicitly supplied by
>>> + *		the caller.
>>> + * head:	the head for your list.
>>> + */
>>> +#define list_for_each_mutable(pos, ...)					\
>>> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
>>> +		(pos, __VA_ARGS__)
>>
>> The variable argument count logic really just slows down compilation.
>> Maybe there aren't enough copies of this code to make that significant.
>> But just because you can do it doesn't mean it is a good idea.
>> I'm also not sure it really adds anything to the readability.
>>
>> And, if you are going to make the middle argument optional there is
>> no need to change the macro name.
> 
> Christian König and Jani Nikula also disagree with the variadic-argument
> implementation approach. If we abandon that method, it means we will
> inevitably need to add some new macros. If mutable is not a good name,
> suggestions for better alternatives would be welcome; coming up with a
> suitable name is indeed rather tricky.

I don't think you need to add a new macro for the specific use case where people want to modify the next element of the iteration.

If I remember your numbers correctly, that is really a corner case, and continuing to use the existing *_safe() macros for it sounds perfectly fine to me.

Regards,
Christian.

^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: Kaitao Cheng @ 2026-06-24 13:14 UTC (permalink / raw)
  To: David Laight
  Cc: Andrew Morton, David Hildenbrand, Jens Axboe, Tejun Heo,
	Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt,
	Christian König, David Howells, Simona Vetter, Randy Dunlap,
	Luca Ceresoli, Philipp Stanner, linux-block, linux-kernel,
	cgroups, linux-ntfs-dev, linux-fsdevel, io-uring, audit, bpf,
	netdev, dri-devel, linux-perf-users, linux-trace-kernel, kexec,
	live-patching, linux-modules, linux-crypto, linux-pm, rcu,
	sched-ext, linux-mm, virtualization, damon, llvm, Kaitao Cheng
In-Reply-To: <20260622094242.64531b9a@pumpkin>



在 2026/6/22 16:42, David Laight 写道:
> On Mon, 22 Jun 2026 12:05:31 +0800
> Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
> 
>> From: Kaitao Cheng <chengkaitao@kylinos.cn>
>>
>> The list_for_each*_safe() helpers are used when the loop body may
>> remove the current entry.  Their API exposes the temporary cursor at
>> every call site, even though most users only need it for the iterator
>> implementation and never reference it in the loop body.
>>
>> Add *_mutable() variants for list and hlist iteration.  The new helpers
>> support both forms: callers may keep passing an explicit temporary cursor
>> when they need to inspect or reset it, or omit it and let the helper use
>> a unique internal cursor.
> 
> I'm not really sure 'mutable' means anything either.
> It is possible to make it valid for the loop body (or even other threads)
> to delete arbitrary list items - but that needs significant extra overheads.
> 
> It might be worth doing something that doesn't need the extra variable,
> but there is little point doing all the churn just to rename things.
> 
>>
>> This makes call sites that only mutate the list through the current entry
>> less noisy, while keeping the existing *_safe() helpers available for
>> compatibility.
>>
>> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
>> ---
>>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 231 insertions(+), 38 deletions(-)
>>
>> diff --git a/include/linux/list.h b/include/linux/list.h
>> index 09d979976b3b..1081def7cea9 100644
>> --- a/include/linux/list.h
>> +++ b/include/linux/list.h
>> @@ -7,6 +7,7 @@
>>  #include <linux/stddef.h>
>>  #include <linux/poison.h>
>>  #include <linux/const.h>
>> +#include <linux/args.h>
>>  
>>  #include <asm/barrier.h>
>>  
>> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>>  #define list_for_each_prev(pos, head) \
>>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>>  
>> -/**
>> - * list_for_each_safe - iterate over a list safe against removal of list entry
>> - * @pos:	the &struct list_head to use as a loop cursor.
>> - * @n:		another &struct list_head to use as temporary storage
>> - * @head:	the head for your list.
>> +/*
>> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>>   */
>>  #define list_for_each_safe(pos, n, head) \
>>  	for (pos = (head)->next, n = pos->next; \
>>  	     !list_is_head(pos, (head)); \
>>  	     pos = n, n = pos->next)
>>  
>> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
>> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\
> 
> Use auto
> 
>> +	     !list_is_head(pos, (head));				\
>> +	     pos = tmp, tmp = pos->next)
>> +
>> +#define __list_for_each_mutable1(pos, head)				\
>> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
>> +
>> +#define __list_for_each_mutable2(pos, next, head)			\
>> +	list_for_each_safe(pos, next, head)
>> +
>>  /**
>> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
>> + * list_for_each_mutable - iterate over a list safe against entry removal
>>   * @pos:	the &struct list_head to use as a loop cursor.
>> - * @n:		another &struct list_head to use as temporary storage
>> - * @head:	the head for your list.
>> + * @...:	either (head) or (next, head)
>> + *
>> + * next:	another &struct list_head to use as optional temporary storage.
>> + *		The temporary cursor is internal unless explicitly supplied by
>> + *		the caller.
>> + * head:	the head for your list.
>> + */
>> +#define list_for_each_mutable(pos, ...)					\
>> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
>> +		(pos, __VA_ARGS__)
> 
> The variable argument count logic really just slows down compilation.
> Maybe there aren't enough copies of this code to make that significant.
> But just because you can do it doesn't mean it is a good idea.
> I'm also not sure it really adds anything to the readability.
> 
> And, if you are going to make the middle argument optional there is
> no need to change the macro name.

Christian König and Jani Nikula also disagree with the variadic-argument
implementation approach. If we abandon that method, it means we will
inevitably need to add some new macros. If mutable is not a good name,
suggestions for better alternatives would be welcome; coming up with a
suitable name is indeed rather tricky.

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: Kaitao Cheng @ 2026-06-24 13:05 UTC (permalink / raw)
  To: Jani Nikula, Andrew Morton, David Hildenbrand, Jens Axboe,
	Tejun Heo, Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt,
	Christian König
  Cc: David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, linux-kernel, cgroups,
	linux-ntfs-dev, linux-fsdevel, io-uring, audit, bpf, netdev,
	dri-devel, linux-perf-users, linux-trace-kernel, kexec,
	live-patching, linux-modules, linux-crypto, linux-pm, rcu,
	sched-ext, linux-mm, virtualization, damon, llvm, chengkaitao
In-Reply-To: <88f34c7fa5a3d1700cc8005818751d6aa31f09df@intel.com>



在 2026/6/22 16:37, Jani Nikula 写道:
> On Mon, 22 Jun 2026, Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>> Add *_mutable() iterator variants for list, hlist and llist.  The new
>> helpers are variadic and support both forms.  In the common case, the
>> caller omits the temporary cursor and the macro creates a unique internal
>> cursor with typeof(pos) and __UNIQUE_ID().  If a loop really needs an
>> explicit temporary cursor, the caller can still pass it and the helper
>> keeps the existing *_safe() behaviour.
>>
>> For example, a call site may use the shorter form:
>>
>>   list_for_each_entry_mutable(pos, head, member)
>>
>> or keep the explicit temporary cursor form:
>>
>>   list_for_each_entry_mutable(pos, tmp, head, member)
> 
> I'm unconvinced it's a good idea to allow two forms with macro trickery,
> *especially* when it's not the last argument you can omit. I think it's
> a footgun.
> 
> IMO stick with the first form only, and there'll always be the _safe
> variant that can be used when the temp pointer is needed.

Could we go back to the v1 version? What do you think of that
implementation approach?

https://lore.kernel.org/all/20260529082149.76764-1-kaitao.cheng@linux.dev/

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: Kaitao Cheng @ 2026-06-24 12:58 UTC (permalink / raw)
  To: David Hildenbrand (Arm), Alexei Starovoitov
  Cc: Andrew Morton, Jens Axboe, Tejun Heo, Alexander Viro,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Thomas Gleixner,
	Juri Lelli, Vincent Guittot, Paul Moore, Andy Shevchenko,
	Paul E. McKenney, Shakeel Butt, Christian König,
	David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, LKML,
	open list:CONTROL GROUP (CGROUP), linux-ntfs-dev, Linux-Fsdevel,
	io-uring, audit, bpf, Network Development, dri-devel,
	linux-perf-use., linux-trace-kernel, kexec, live-patching,
	linux-modules, Linux Crypto Mailing List, Linux Power Management,
	rcu, sched-ext, linux-mm, virtualization, damon,
	clang-built-linux, chengkaitao
In-Reply-To: <8f98a3a6-f97b-4673-964f-fb09c8879e2e@kernel.org>



在 2026/6/22 19:27, David Hildenbrand (Arm) 写道:
> On 6/22/26 07:28, Alexei Starovoitov wrote:
>> On Sun, Jun 21, 2026 at 9:06 PM Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>>>
>>> From: chengkaitao <chengkaitao@kylinos.cn>
>>>
>>> The list_for_each*_safe() helpers are used when the loop body may remove
>>> the current entry.  Their current interface, however, forces every caller
>>> to define a temporary cursor outside the macro and pass it in, even when
>>> the caller never uses that cursor directly.  For most call sites this
>>> extra cursor is just boilerplate required by the macro implementation.
>>>
>>> This is awkward because the saved next pointer is an internal detail of
>>> the iteration.  Callers that only remove or move the current entry do not
>>> need to spell it out.
>>>
>>> The _safe() suffix has also caused confusion.  Christian Koenig pointed
>>> out that the name is easy to read as a thread-safe variant, especially
>>> for beginners, even though it only means that the iterator keeps enough
>>> state to tolerate removal of the current entry.  He suggested _mutable()
>>> as a clearer description of what the loop permits.
>>>
>>> Add *_mutable() iterator variants for list, hlist and llist.  The new
>>> helpers are variadic and support both forms.  In the common case, the
>>> caller omits the temporary cursor and the macro creates a unique internal
>>> cursor with typeof(pos) and __UNIQUE_ID().  If a loop really needs an
>>> explicit temporary cursor, the caller can still pass it and the helper
>>> keeps the existing *_safe() behaviour.
>>>
>>> For example, a call site may use the shorter form:
>>>
>>>   list_for_each_entry_mutable(pos, head, member)
>>>
>>> or keep the explicit temporary cursor form:
>>>
>>>   list_for_each_entry_mutable(pos, tmp, head, member)
>>>
>>> The existing *_safe() helpers remain available for compatibility.  This
>>> series only converts users in mm, block, kernel, init and io_uring.  If
>>> this approach looks acceptable, the remaining users can be converted in
>>> follow-up series.
>>>
>>> Changes in v3 (Christian König, Andy Shevchenko):
>>> - Convert safe list walks to mutable iterators
>>>
>>> Changes in v2 (Muchun Song, Andy Shevchenko):
>>> - Drop the list_for_each_entry_mutable*() helpers from v1 and make the
>>>   cursor change directly in the existing list_for_each_entry*() helpers.
>>> - Open-code special list walks that rely on updating the loop cursor in
>>>   the body, preserving their existing traversal semantics.
>>>
>>> Link to v2:
>>> https://lore.kernel.org/all/20260609061347.93688-1-kaitao.cheng@linux.dev/
>>>
>>> Link to v1:
>>> https://lore.kernel.org/all/20260529082149.76764-1-kaitao.cheng@linux.dev/
>>>
>>> Kaitao Cheng (7):
>>>   list: Add mutable iterator variants
>>>   llist: Add mutable iterator variants
>>>   mm: Use mutable list iterators
>>>   block: Use mutable list iterators
>>>   kernel: Use mutable list iterators
>>>   initramfs: Use mutable list iterator
>>>   io_uring: Use mutable list iterators
>>>
>>>  block/bfq-iosched.c                 |  17 +-
>>>  block/blk-cgroup.c                  |  12 +-
>>>  block/blk-flush.c                   |   4 +-
>>>  block/blk-iocost.c                  |  18 +-
>>>  block/blk-mq.c                      |   8 +-
>>>  block/blk-throttle.c                |   4 +-
>>>  block/kyber-iosched.c               |   4 +-
>>>  block/partitions/ldm.c              |   8 +-
>>>  block/sed-opal.c                    |   4 +-
>>>  include/linux/list.h                | 269 ++++++++++++++++++++++++----
>>>  include/linux/llist.h               |  81 +++++++--
>>>  init/initramfs.c                    |   5 +-
>>>  io_uring/cancel.c                   |   6 +-
>>>  io_uring/poll.c                     |   3 +-
>>>  io_uring/rw.c                       |   4 +-
>>>  io_uring/timeout.c                  |   8 +-
>>>  io_uring/uring_cmd.c                |   3 +-
>>>  kernel/audit_tree.c                 |   4 +-
>>>  kernel/audit_watch.c                |  16 +-
>>>  kernel/auditfilter.c                |   4 +-
>>>  kernel/auditsc.c                    |   4 +-
>>>  kernel/bpf/arena.c                  |  10 +-
>>>  kernel/bpf/arraymap.c               |   8 +-
>>>  kernel/bpf/bpf_local_storage.c      |   3 +-
>>>  kernel/bpf/bpf_lru_list.c           |  25 ++-
>>>  kernel/bpf/btf.c                    |  18 +-
>>>  kernel/bpf/cgroup.c                 |   7 +-
>>>  kernel/bpf/cpumap.c                 |   4 +-
>>>  kernel/bpf/devmap.c                 |  10 +-
>>>  kernel/bpf/helpers.c                |   8 +-
>>>  kernel/bpf/local_storage.c          |   4 +-
>>>  kernel/bpf/memalloc.c               |  16 +-
>>>  kernel/bpf/offload.c                |   8 +-
>>>  kernel/bpf/states.c                 |   4 +-
>>>  kernel/bpf/stream.c                 |   4 +-
>>>  kernel/bpf/verifier.c               |   6 +-
>>>  kernel/cgroup/cgroup-v1.c           |   4 +-
>>>  kernel/cgroup/cgroup.c              |  54 +++---
>>>  kernel/cgroup/dmem.c                |  12 +-
>>>  kernel/cgroup/rdma.c                |   8 +-
>>>  kernel/events/core.c                |  44 +++--
>>>  kernel/events/uprobes.c             |  12 +-
>>>  kernel/exit.c                       |   8 +-
>>>  kernel/fail_function.c              |   4 +-
>>>  kernel/gcov/clang.c                 |   4 +-
>>>  kernel/irq_work.c                   |   4 +-
>>>  kernel/kexec_core.c                 |   4 +-
>>>  kernel/kprobes.c                    |  16 +-
>>>  kernel/livepatch/core.c             |   4 +-
>>>  kernel/livepatch/core.h             |   4 +-
>>>  kernel/liveupdate/kho_block.c       |   4 +-
>>>  kernel/liveupdate/luo_flb.c         |   4 +-
>>>  kernel/locking/rwsem.c              |   2 +-
>>>  kernel/locking/test-ww_mutex.c      |   2 +-
>>>  kernel/module/main.c                |  11 +-
>>>  kernel/padata.c                     |   4 +-
>>>  kernel/power/snapshot.c             |   8 +-
>>>  kernel/power/wakelock.c             |   4 +-
>>>  kernel/printk/printk.c              |  11 +-
>>>  kernel/ptrace.c                     |   4 +-
>>>  kernel/rcu/rcutorture.c             |   3 +-
>>>  kernel/rcu/tasks.h                  |   9 +-
>>>  kernel/rcu/tree.c                   |   6 +-
>>>  kernel/resource.c                   |   4 +-
>>>  kernel/sched/core.c                 |   4 +-
>>>  kernel/sched/ext.c                  |  22 +--
>>>  kernel/sched/fair.c                 |  28 +--
>>>  kernel/sched/topology.c             |   4 +-
>>>  kernel/sched/wait.c                 |   4 +-
>>>  kernel/seccomp.c                    |   4 +-
>>>  kernel/signal.c                     |  11 +-
>>>  kernel/smp.c                        |   4 +-
>>>  kernel/taskstats.c                  |   8 +-
>>>  kernel/time/clockevents.c           |   6 +-
>>>  kernel/time/clocksource.c           |   4 +-
>>>  kernel/time/posix-cpu-timers.c      |   4 +-
>>>  kernel/time/posix-timers.c          |   3 +-
>>>  kernel/torture.c                    |   3 +-
>>>  kernel/trace/bpf_trace.c            |   4 +-
>>>  kernel/trace/ftrace.c               |  49 +++--
>>>  kernel/trace/ring_buffer.c          |  25 ++-
>>>  kernel/trace/trace.c                |  12 +-
>>>  kernel/trace/trace_dynevent.c       |   6 +-
>>>  kernel/trace/trace_dynevent.h       |   5 +-
>>>  kernel/trace/trace_events.c         |  35 ++--
>>>  kernel/trace/trace_events_filter.c  |   4 +-
>>>  kernel/trace/trace_events_hist.c    |   8 +-
>>>  kernel/trace/trace_events_trigger.c |  17 +-
>>>  kernel/trace/trace_events_user.c    |  16 +-
>>>  kernel/trace/trace_stat.c           |   4 +-
>>>  kernel/user-return-notifier.c       |   3 +-
>>>  kernel/workqueue.c                  |  16 +-
>>>  mm/backing-dev.c                    |   8 +-
>>>  mm/balloon.c                        |   8 +-
>>>  mm/cma.c                            |   4 +-
>>>  mm/compaction.c                     |   4 +-
>>>  mm/damon/core.c                     |   4 +-
>>>  mm/damon/sysfs-schemes.c            |   4 +-
>>>  mm/dmapool.c                        |   4 +-
>>>  mm/huge_memory.c                    |   8 +-
>>>  mm/hugetlb.c                        |  56 +++---
>>>  mm/hugetlb_vmemmap.c                |  16 +-
>>>  mm/khugepaged.c                     |  14 +-
>>>  mm/kmemleak.c                       |   7 +-
>>>  mm/ksm.c                            |  25 +--
>>>  mm/list_lru.c                       |   4 +-
>>>  mm/memcontrol-v1.c                  |   8 +-
>>>  mm/memory-failure.c                 |  12 +-
>>>  mm/memory-tiers.c                   |   4 +-
>>>  mm/migrate.c                        |  23 ++-
>>>  mm/mmu_notifier.c                   |   9 +-
>>>  mm/page_alloc.c                     |   8 +-
>>>  mm/page_reporting.c                 |   2 +-
>>>  mm/percpu.c                         |  11 +-
>>>  mm/pgtable-generic.c                |   4 +-
>>>  mm/rmap.c                           |  10 +-
>>>  mm/shmem.c                          |   9 +-
>>>  mm/slab_common.c                    |  14 +-
>>>  mm/slub.c                           |  33 ++--
>>>  mm/swapfile.c                       |   4 +-
>>>  mm/userfaultfd.c                    |  12 +-
>>>  mm/vmalloc.c                        |  24 +--
>>>  mm/vmscan.c                         |   7 +-
>>>  mm/zsmalloc.c                       |   4 +-
>>>  124 files changed, 875 insertions(+), 681 deletions(-)
>>
>> Not sure what you were thinking, but this diff stat
>> is not landable.
> 
> Agreed. If we decide we want this, I guess we should target per-subsystem
> conversions.
> 
> If this goes through the MM tree, I would even appreciate doing this on a per-MM
> component granularity.
> 
> (unless we have some magic "Linus converts all of them" script, which I doubt we
> will have)

I strongly agree with the point above.

> Is there a way forward to replace list_for_each_*_safe entirely, possibly just
> reusing the old name but simplifying the parameter?

David Laight, Christian König, and Jani Nikula do not agree with using
clever macro syntax to support both calling forms at the same time,
so for now it is not possible to keep the original macro name and only
simplify the parameter. I may revert to the v1 version and ask everyone
for their opinions again.

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: Kaitao Cheng @ 2026-06-24 12:29 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Alexei Starovoitov, Andrew Morton, David Hildenbrand, Jens Axboe,
	Tejun Heo, Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Paul E. McKenney, Shakeel Butt, Christian König,
	David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, LKML,
	open list:CONTROL GROUP (CGROUP), linux-ntfs-dev, Linux-Fsdevel,
	io-uring, audit, bpf, Network Development, dri-devel,
	linux-perf-use., linux-trace-kernel, kexec, live-patching,
	linux-modules, Linux Crypto Mailing List, Linux Power Management,
	rcu, sched-ext, linux-mm, virtualization, damon,
	clang-built-linux, chengkaitao, Muchun Song
In-Reply-To: <ajkSftEbdGoiJXYs@ashevche-desk.local>



在 2026/6/22 18:46, Andy Shevchenko 写道:
> On Mon, Jun 22, 2026 at 02:15:01PM +0800, Kaitao Cheng wrote:
>> 在 2026/6/22 13:28, Alexei Starovoitov 写道:
>>> On Sun, Jun 21, 2026 at 9:06 PM Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
> 
> ...
> 
>>>>  block/bfq-iosched.c                 |  17 +-
>>>>  block/blk-cgroup.c                  |  12 +-
>>>>  block/blk-flush.c                   |   4 +-
>>>>  block/blk-iocost.c                  |  18 +-
>>>>  block/blk-mq.c                      |   8 +-
>>>>  block/blk-throttle.c                |   4 +-
>>>>  block/kyber-iosched.c               |   4 +-
>>>>  block/partitions/ldm.c              |   8 +-
>>>>  block/sed-opal.c                    |   4 +-
>>>>  include/linux/list.h                | 269 ++++++++++++++++++++++++----
>>>>  include/linux/llist.h               |  81 +++++++--
>>>>  init/initramfs.c                    |   5 +-
>>>>  io_uring/cancel.c                   |   6 +-
>>>>  io_uring/poll.c                     |   3 +-
>>>>  io_uring/rw.c                       |   4 +-
>>>>  io_uring/timeout.c                  |   8 +-
>>>>  io_uring/uring_cmd.c                |   3 +-
>>>>  kernel/audit_tree.c                 |   4 +-
>>>>  kernel/audit_watch.c                |  16 +-
>>>>  kernel/auditfilter.c                |   4 +-
>>>>  kernel/auditsc.c                    |   4 +-
>>>>  kernel/bpf/arena.c                  |  10 +-
>>>>  kernel/bpf/arraymap.c               |   8 +-
>>>>  kernel/bpf/bpf_local_storage.c      |   3 +-
>>>>  kernel/bpf/bpf_lru_list.c           |  25 ++-
>>>>  kernel/bpf/btf.c                    |  18 +-
>>>>  kernel/bpf/cgroup.c                 |   7 +-
>>>>  kernel/bpf/cpumap.c                 |   4 +-
>>>>  kernel/bpf/devmap.c                 |  10 +-
>>>>  kernel/bpf/helpers.c                |   8 +-
>>>>  kernel/bpf/local_storage.c          |   4 +-
>>>>  kernel/bpf/memalloc.c               |  16 +-
>>>>  kernel/bpf/offload.c                |   8 +-
>>>>  kernel/bpf/states.c                 |   4 +-
>>>>  kernel/bpf/stream.c                 |   4 +-
>>>>  kernel/bpf/verifier.c               |   6 +-
>>>>  kernel/cgroup/cgroup-v1.c           |   4 +-
>>>>  kernel/cgroup/cgroup.c              |  54 +++---
>>>>  kernel/cgroup/dmem.c                |  12 +-
>>>>  kernel/cgroup/rdma.c                |   8 +-
>>>>  kernel/events/core.c                |  44 +++--
>>>>  kernel/events/uprobes.c             |  12 +-
>>>>  kernel/exit.c                       |   8 +-
>>>>  kernel/fail_function.c              |   4 +-
>>>>  kernel/gcov/clang.c                 |   4 +-
>>>>  kernel/irq_work.c                   |   4 +-
>>>>  kernel/kexec_core.c                 |   4 +-
>>>>  kernel/kprobes.c                    |  16 +-
>>>>  kernel/livepatch/core.c             |   4 +-
>>>>  kernel/livepatch/core.h             |   4 +-
>>>>  kernel/liveupdate/kho_block.c       |   4 +-
>>>>  kernel/liveupdate/luo_flb.c         |   4 +-
>>>>  kernel/locking/rwsem.c              |   2 +-
>>>>  kernel/locking/test-ww_mutex.c      |   2 +-
>>>>  kernel/module/main.c                |  11 +-
>>>>  kernel/padata.c                     |   4 +-
>>>>  kernel/power/snapshot.c             |   8 +-
>>>>  kernel/power/wakelock.c             |   4 +-
>>>>  kernel/printk/printk.c              |  11 +-
>>>>  kernel/ptrace.c                     |   4 +-
>>>>  kernel/rcu/rcutorture.c             |   3 +-
>>>>  kernel/rcu/tasks.h                  |   9 +-
>>>>  kernel/rcu/tree.c                   |   6 +-
>>>>  kernel/resource.c                   |   4 +-
>>>>  kernel/sched/core.c                 |   4 +-
>>>>  kernel/sched/ext.c                  |  22 +--
>>>>  kernel/sched/fair.c                 |  28 +--
>>>>  kernel/sched/topology.c             |   4 +-
>>>>  kernel/sched/wait.c                 |   4 +-
>>>>  kernel/seccomp.c                    |   4 +-
>>>>  kernel/signal.c                     |  11 +-
>>>>  kernel/smp.c                        |   4 +-
>>>>  kernel/taskstats.c                  |   8 +-
>>>>  kernel/time/clockevents.c           |   6 +-
>>>>  kernel/time/clocksource.c           |   4 +-
>>>>  kernel/time/posix-cpu-timers.c      |   4 +-
>>>>  kernel/time/posix-timers.c          |   3 +-
>>>>  kernel/torture.c                    |   3 +-
>>>>  kernel/trace/bpf_trace.c            |   4 +-
>>>>  kernel/trace/ftrace.c               |  49 +++--
>>>>  kernel/trace/ring_buffer.c          |  25 ++-
>>>>  kernel/trace/trace.c                |  12 +-
>>>>  kernel/trace/trace_dynevent.c       |   6 +-
>>>>  kernel/trace/trace_dynevent.h       |   5 +-
>>>>  kernel/trace/trace_events.c         |  35 ++--
>>>>  kernel/trace/trace_events_filter.c  |   4 +-
>>>>  kernel/trace/trace_events_hist.c    |   8 +-
>>>>  kernel/trace/trace_events_trigger.c |  17 +-
>>>>  kernel/trace/trace_events_user.c    |  16 +-
>>>>  kernel/trace/trace_stat.c           |   4 +-
>>>>  kernel/user-return-notifier.c       |   3 +-
>>>>  kernel/workqueue.c                  |  16 +-
>>>>  mm/backing-dev.c                    |   8 +-
>>>>  mm/balloon.c                        |   8 +-
>>>>  mm/cma.c                            |   4 +-
>>>>  mm/compaction.c                     |   4 +-
>>>>  mm/damon/core.c                     |   4 +-
>>>>  mm/damon/sysfs-schemes.c            |   4 +-
>>>>  mm/dmapool.c                        |   4 +-
>>>>  mm/huge_memory.c                    |   8 +-
>>>>  mm/hugetlb.c                        |  56 +++---
>>>>  mm/hugetlb_vmemmap.c                |  16 +-
>>>>  mm/khugepaged.c                     |  14 +-
>>>>  mm/kmemleak.c                       |   7 +-
>>>>  mm/ksm.c                            |  25 +--
>>>>  mm/list_lru.c                       |   4 +-
>>>>  mm/memcontrol-v1.c                  |   8 +-
>>>>  mm/memory-failure.c                 |  12 +-
>>>>  mm/memory-tiers.c                   |   4 +-
>>>>  mm/migrate.c                        |  23 ++-
>>>>  mm/mmu_notifier.c                   |   9 +-
>>>>  mm/page_alloc.c                     |   8 +-
>>>>  mm/page_reporting.c                 |   2 +-
>>>>  mm/percpu.c                         |  11 +-
>>>>  mm/pgtable-generic.c                |   4 +-
>>>>  mm/rmap.c                           |  10 +-
>>>>  mm/shmem.c                          |   9 +-
>>>>  mm/slab_common.c                    |  14 +-
>>>>  mm/slub.c                           |  33 ++--
>>>>  mm/swapfile.c                       |   4 +-
>>>>  mm/userfaultfd.c                    |  12 +-
>>>>  mm/vmalloc.c                        |  24 +--
>>>>  mm/vmscan.c                         |   7 +-
>>>>  mm/zsmalloc.c                       |   4 +-
>>>>  124 files changed, 875 insertions(+), 681 deletions(-)
>>>
>>> Not sure what you were thinking, but this diff stat
>>> is not landable.
>>
>> [PATCH v3 1/7] and [PATCH v3 2/7] contain the main logic and can
>> be merged directly. They are also compatible with the old API.
>> [PATCH v3 3/7] through [PATCH v3 7/7] are just simple interface
>> replacements and do not change any functional logic. They can be
>> left unmerged for now; individual modules can pick them up later
>> if needed.
>>
>> In v2, Andy Shevchenko mentioned: "If it's done by Linus himself
>> during the day when he prepares -rc1, it's fine."
> 
> Yes, but you need to get his blessing first to go with this.
> Have you communicated with him on this?

Not yet, because the overall approach is still not mature. People
have different opinions on the implementation details and on how
to move this forward, so I think we should iterate through a few
versions first before making a final decision.

>> Even so, the
>> changes in this patch series are indeed quite large and touch
>> almost every subsystem. I have only converted part of them for
>> now, so I wanted to send this out first and see what people think.
> 
> That's why it's better to provide a script to convert (e.g., coccinelle)
> instead of tons of patches.

I tried writing conversion scripts with Coccinelle, but there were
always cases that got missed. In contrast, I found that using AI
for focused replacements was actually more efficient.

As David Hildenbrand mentioned, "If we decide we want this, I guess
we should target per-subsystem conversions." I would like to provide
the new interface first; adapting each subsystem on demand later may
be easier to achieve.
-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* [PATCH 5.10.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-24 12:24 UTC (permalink / raw)
  To: stable
  Cc: Bjoern Doebel, Steven Rostedt, Masami Hiramatsu,
	linux-trace-kernel, linux-kernel, Mathieu Desnoyers,
	David Howells

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally would initialize the
iterator and call an RCU synchronization, but this was for each individual
per CPU buffer where this would get called many times on a machine with
many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start into ring_buffer_read_prepare(),
ring_buffer_read_prepare_sync() and then ring_buffer_read_start() to allow
each of the per CPU buffers to be prepared, call
ring_buffer_read_prepare_sync() once, and then ring_buffer_read_start() for
each of the CPUs, which made things much faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and restore the ring buffer iterator interface
to what it was before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Assisted-by: Kiro:claude-opus-4.8
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 67 ++++++-------------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 18 insertions(+), 75 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 7d5a78f49d43..be5c12092246 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -128,9 +128,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 03a7127efc5a..05f2d7f16670 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4858,28 +4858,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -4904,51 +4896,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5bcd4cbeeb4f..ed32d3c4f0e7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4480,21 +4480,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 9da76104f7a2..18d1551db2b0 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* [PATCH 5.15.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-24 12:23 UTC (permalink / raw)
  To: stable
  Cc: Bjoern Doebel, Steven Rostedt, Masami Hiramatsu,
	linux-trace-kernel, linux-kernel, Mathieu Desnoyers,
	David Howells

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally would initialize the
iterator and call an RCU synchronization, but this was done for each
individual per CPU buffer, so it would get called many times on a machine
with many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start() into ring_buffer_read_prepare(),
ring_buffer_read_prepare_sync() and then ring_buffer_read_start(). This
allowed each of the per CPU buffers to be prepared, followed by a single
call to ring_buffer_read_prepare_sync(), and then a call to
ring_buffer_read_start() for each of the CPUs, which made things much
faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and restore the ring buffer iterator interface
to what it was before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Assisted-by: Kiro:claude-opus-4.8
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 67 ++++++-------------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 18 insertions(+), 75 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 3e7bfc0f65ae..b53335ed2d0e 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -130,9 +130,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e44115db0efe..770dc7c60656 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5037,28 +5037,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -5083,51 +5075,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 537360be8e4e..1a29a9d9e868 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4803,21 +4803,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..628c25693cef 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* [PATCH 6.1.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-24 12:23 UTC (permalink / raw)
  To: stable
  Cc: Bjoern Doebel, Steven Rostedt, Masami Hiramatsu,
	linux-trace-kernel, linux-kernel, Mathieu Desnoyers,
	David Howells

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally would initialize the
iterator and call an RCU synchronization, but this was done for each
individual per CPU buffer, so it would get called many times on a machine
with many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start() into ring_buffer_read_prepare(),
ring_buffer_read_prepare_sync() and then ring_buffer_read_start(). This
allowed each of the per CPU buffers to be prepared, followed by a single
call to ring_buffer_read_prepare_sync(), and then a call to
ring_buffer_read_start() for each of the CPUs, which made things much
faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and restore the ring buffer iterator interface
to what it was before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Assisted-by: Kiro:claude-opus-4.8
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 67 ++++++-------------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 18 insertions(+), 75 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 3e7bfc0f65ae..b53335ed2d0e 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -130,9 +130,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d3a31ba7c710..5edc4126d0c6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5082,28 +5082,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -5128,51 +5120,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 25f31d7718c6..5ef1c79dc5c9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4819,21 +4819,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..628c25693cef 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* [PATCH 6.6.y] ring-buffer: Remove ring_buffer_read_prepare_sync()
From: Bjoern Doebel @ 2026-06-24 12:22 UTC (permalink / raw)
  To: stable
  Cc: Bjoern Doebel, Steven Rostedt, Masami Hiramatsu,
	linux-trace-kernel, linux-kernel, Mathieu Desnoyers,
	David Howells

[ Upstream commit 119a5d573622ae90ba730d18acfae9bb75d77b9a ]

When the ring buffer was first introduced, reading the non-consuming
"trace" file required disabling the writing of the ring buffer. To make
sure the writing was fully disabled before iterating the buffer with a
non-consuming read, it would set the disable flag of the buffer and then
call an RCU synchronization to make sure all the buffers were
synchronized.

The function ring_buffer_read_start() originally would initialize the
iterator and call an RCU synchronization, but this was done for each
individual per CPU buffer, so it would get called many times on a machine
with many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf
("ring-buffer: Make non-consuming read less expensive with lots of cpus.")
separated ring_buffer_read_start() into ring_buffer_read_prepare(),
ring_buffer_read_prepare_sync() and then ring_buffer_read_start(). This
allowed each of the per CPU buffers to be prepared, followed by a single
call to ring_buffer_read_prepare_sync(), and then a call to
ring_buffer_read_start() for each of the CPUs, which made things much
faster.

The commit 1039221cc278 ("ring-buffer: Do not disable recording when there
is an iterator") removed the requirement of disabling the recording of the
ring buffer in order to iterate it, but it did not remove the
synchronization that was happening that was required to wait for all the
buffers to have no more writers. It's now OK for the buffers to have
writers and no synchronization is needed.

Remove the synchronization and restore the ring buffer iterator interface
to what it was before commit 72c9ddfd4c5bf was applied.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home
Reported-by: David Howells <dhowells@redhat.com>
Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator")
Tested-by: David Howells <dhowells@redhat.com>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Assisted-by: Kiro:claude-opus-4.8
Signed-off-by: Bjoern Doebel <doebel@amazon.de>
---
 include/linux/ring_buffer.h |  4 +--
 kernel/trace/ring_buffer.c  | 67 ++++++-------------------------------
 kernel/trace/trace.c        | 14 +++-----
 kernel/trace/trace_kdb.c    |  8 ++---
 4 files changed, 18 insertions(+), 75 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index ded528d23f85..382fbaa701f9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -129,9 +129,7 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags);
-void ring_buffer_read_prepare_sync(void);
-void ring_buffer_read_start(struct ring_buffer_iter *iter);
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags);
 void ring_buffer_read_finish(struct ring_buffer_iter *iter);
 
 struct ring_buffer_event *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 508edf1f3f1e..52c7dbccafed 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5084,28 +5084,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  * @flags: gfp flags to use for memory allocation
  *
- * This performs the initial preparations necessary to iterate
- * through the buffer.  Memory is allocated, buffer recording
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * Disabling buffer recording prevents the reading from being
- * corrupted. This is not a consuming read, so a producer is not
- * expected.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
  *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
@@ -5130,51 +5122,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
 
 	atomic_inc(&cpu_buffer->resize_disabled);
 
-	return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized.  Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
-	synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
-	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long flags;
-
-	if (!iter)
-		return;
-
-	cpu_buffer = iter->cpu_buffer;
-
-	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
-	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6b35666a4e0b..f57baf67726d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4792,21 +4792,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
 	if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter->buffer_iter[cpu] =
-				ring_buffer_read_prepare(iter->array_buffer->buffer,
-							 cpu, GFP_KERNEL);
-		}
-		ring_buffer_read_prepare_sync();
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_read_start(iter->buffer_iter[cpu]);
+				ring_buffer_read_start(iter->array_buffer->buffer,
+						       cpu, GFP_KERNEL);
 			tracing_iter_reset(iter, cpu);
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter->array_buffer->buffer,
-						 cpu, GFP_KERNEL);
-		ring_buffer_read_prepare_sync();
-		ring_buffer_read_start(iter->buffer_iter[cpu]);
+			ring_buffer_read_start(iter->array_buffer->buffer,
+					       cpu, GFP_KERNEL);
 		tracing_iter_reset(iter, cpu);
 	}
 
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 59857a1ee44c..628c25693cef 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
 	if (cpu_file == RING_BUFFER_ALL_CPUS) {
 		for_each_tracing_cpu(cpu) {
 			iter.buffer_iter[cpu] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
-						 cpu, GFP_ATOMIC);
-			ring_buffer_read_start(iter.buffer_iter[cpu]);
+			ring_buffer_read_start(iter.array_buffer->buffer,
+					       cpu, GFP_ATOMIC);
 			tracing_iter_reset(&iter, cpu);
 		}
 	} else {
 		iter.cpu_file = cpu_file;
 		iter.buffer_iter[cpu_file] =
-			ring_buffer_read_prepare(iter.array_buffer->buffer,
+			ring_buffer_read_start(iter.array_buffer->buffer,
 						 cpu_file, GFP_ATOMIC);
-		ring_buffer_read_start(iter.buffer_iter[cpu_file]);
 		tracing_iter_reset(&iter, cpu_file);
 	}
 
-- 
2.50.1




Amazon Web Services Development Center Germany GmbH
Tamara-Danz-Str. 13
10243 Berlin
Geschaeftsfuehrung: Christof Hellmis, Andreas Stieger
Eingetragen am Amtsgericht Charlottenburg unter HRB 257764 B
Sitz: Berlin
Ust-ID: DE 365 538 597


^ permalink raw reply related

* Re: [RFC PATCH 1/3] mm/compaction: skip isolate mlocked folios when compact_unevictable_allowed=0
From: Wandun @ 2026-06-24 11:08 UTC (permalink / raw)
  To: Vlastimil Babka (SUSE), linux-mm, linux-kernel,
	linux-trace-kernel, linux-rt-devel
  Cc: akpm, surenb, mhocko, jackmanb, hannes, ziy, rostedt, mhiramat,
	mathieu.desnoyers, david, ljs, liam, rppt, bigeasy, clrkwllms,
	Alexander.Krabler, Hugh Dickins
In-Reply-To: <c8793c0f-7156-4cb7-9e6e-7909397e2fff@kernel.org>



On 6/22/26 17:55, Vlastimil Babka (SUSE) wrote:
> On 6/18/26 13:43, Wandun wrote:
>>
>>
>> On 6/18/26 02:52, Vlastimil Babka (SUSE) wrote:
>>> On 6/4/26 04:38, Wandun Chen wrote:
>>>> From: Wandun Chen <chenwandun@lixiang.com>
>>>>
>>>> compact_unevictable_allowed is default 0 under PREEMPT_RT,
>>>> isolate_migratepages_block() skips folios with PG_unevictable set.
>>>> However, mlock_folio() sets PG_mlocked immediately but defers
>>>> PG_unevictable to mlock_folio_batch(), result in a folio with
>>>> PG_mlocked=1 but PG_unevictable=0. Compaction will isolate such a
>>>> folio.
>>>>
>>>> Fix by checking folio_test_mlocked() together with the existing
>>>> folio_test_unevictable() check.
>>>>
>>>> A similar issue has been reported by Alexander Krabler on a 6.12-rt
>>>> aarch64 system. Vlastimil suggested to check the mlocked flag [1].
>>>>
>>>> Reported-by: Alexander Krabler <Alexander.Krabler@kuka.com>
>>>> Closes: https://lore.kernel.org/all/DU0PR01MB10385345F7153F334100981888259A@DU0PR01MB10385.eurprd01.prod.exchangelabs.com/
>>>> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
>>>> Signed-off-by: Wandun Chen <chenwandun@lixiang.com>
>>>> Link: https://lore.kernel.org/all/33275585-f2db-4779-89f0-3ae24b455a67@suse.cz/ [1]
>>>
>>> Well, in that thread Hugh doubted my suggestion, and then it seems we didn't
>>> conclude anything. Did you actually observe in practice the issue that
>>> Alexander had, and that this patch fixed it, or is that theoretical?
>>>
>> Yes, I wrote a test case that can reproduce it in a few seconds.
>>
>> The test case contains 3 steps:
>> 1. mlockall
>> 2. mmap file(2GB) + trigger file write page fault;
>> 3. during step 1, trigger compact via /proc/sys/vm/compact_memory
>>
>>
>> My reproduction environment is qemu with 4GB ram, 8 core, aarch64,
>> preempt_rt and includes the tracepoint in patch 02.
>> After running the reproduction program for a few seconds, the
>> following output appears.
> 
> Ah, nice.
> 
>> repro-403     [004] ....1   101.270505: mm_compaction_isolate_folio: pfn=0x71e3a mode=0x0 flags=referenced|uptodate|mlocked
>> repro-403     [004] ....1   101.270507: mm_compaction_isolate_folio: pfn=0x71e3b mode=0x0 flags=referenced|uptodate|mlocked
>> repro-403     [004] ....1   101.270513: mm_compaction_isolate_folio: pfn=0x71e3c mode=0x0 flags=referenced|uptodate|mlocked
>> repro-403     [004] ....1   101.270515: mm_compaction_isolate_folio: pfn=0x71e3d mode=0x0 flags=uptodate|mlocked
>> repro-403     [004] ....1   101.270517: mm_compaction_isolate_folio: pfn=0x71e3e mode=0x0 flags=uptodate|mlocked
>> repro-403     [004] ....1   101.270520: mm_compaction_isolate_folio: pfn=0x71e3f mode=0x0 flags=uptodate|mlocked
>>
>>
>> Unfortunately, I recently found that there is still a bug in the
>> fix patch. Setting mlocked in the mlock_folio function could happen
>> even after the page is successfully isolated, so it still cannot
>> prevent migration. Because of this, I need to think more about how
>> to fix it.
>>
>> Perhaps we should double-check whether the page is mlocked during
>> the actual migration phase.
> 
> So IIUC the isolation+migration might be started between the folio is
> allocated, and mlocked? In that case the check during migration could still

Yes, in that case it would still be racy, so checking page flags is not a good idea.

> be racy, and if the page is isolated, it's already bad for the RT process.

IIUC, more accurately, it is the migration entry in the page table that is really
bad for the RT process: isolating a page doesn't modify the page table, so memory
access continues as usual. That leads to a new idea.

S1. In the mlock[all] syscall, if mlock_vma_pages_range hits a migration entry,
    it should wait for the migration to complete.

S2. During the unmap phase of memory migration, prevent a page from being unmapped
    if the page's associated vma is marked with VM_LOCKED, similar to how reclaim is
    disabled for pages in a VM_LOCKED vma (try_to_unmap_one).


For a page handled during the mlock[all] syscall:
  - if migration has already finished, there is nothing to do;
  - if migration is in progress and the migration entry is already filled in, we
    wait (S1);
  - if the page is in flight, about to be isolated/migrated, S2 prevents the unmap.

For a page handled during a page fault: VM_LOCKED is already set on the vma,
so S2 guarantees it will not be unmapped, hence no migration entry.


Thanks a lot for the detailed feedback, Vlastimil.

Best regards,
Wandun


> 
> So this would only be a short-term problem after the mlockall, but we don't
> have a way for the RT process to know the moment it's all settled, right?

Yes, some pages may already have been isolated and will still be migrated.

> Probably the proper solution would be for mlock[all]() itself to wait for an
> isolated page, and only continue once it knows it can't be isolated anymore.
> This might, however, go against some of the folio batching optimizations?
> 
>> What do you think of this best-effort approach?
>>
>>
>> Best regards,
>> Wandun
>>
>>
>>
>>
>>
>> The full reproducer is as below:
>>
>> /* gcc repro.c -o repro -lpthread */
>>
>> #define _GNU_SOURCE
>> #include <fcntl.h>
>> #include <pthread.h>
>> #include <stdio.h>
>> #include <stdlib.h>
>> #include <sys/mman.h>
>> #include <unistd.h>
>>
>> #define PAGE_SIZE       4096
>> #define NR_PAGES        32
>> #define FILE_SIZE       (2ULL * 1024 * 1024 * 1024)
>>
>> static void *worker_fn(void *arg)
>> {
>> 	int fd = (long)arg;
>> 	size_t len = (size_t)FILE_SIZE;
>> 	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>> 	if (p == MAP_FAILED)
>> 		return NULL;
>>
>> 	for (size_t off = 0; off + NR_PAGES * PAGE_SIZE <= len;
>> 	     off += NR_PAGES * PAGE_SIZE) {
>> 		for (int i = 0; i < NR_PAGES; i++)
>> 			p[off + i * PAGE_SIZE] = 1;
>> 		usleep(200);
>> 	}
>>
>> 	munmap(p, len);
>> 	return NULL;
>> }
>>
>> static void *compact_fn(void *arg)
>> {
>> 	(void)arg;
>> 	int fd = open("/proc/sys/vm/compact_memory", O_WRONLY);
>> 	if (fd < 0)
>> 		return NULL;
>>
>> 	while (1) {
>> 		if (write(fd, "1", 1) < 0) {}
>> 		usleep(5000);
>> 	}
>> }
>>
>> int main(void)
>> {
>> 	mlockall(MCL_CURRENT | MCL_FUTURE);
>>
>> 	int fd = open("./repro_largefile.dat", O_RDWR | O_CREAT, 0600);
>> 	if (fd < 0)
>> 		return 1;
>> 	unlink("./repro_largefile.dat");
>> 	if (ftruncate(fd, (off_t)FILE_SIZE) < 0)
>> 		return 1;
>>
>> 	printf("repro_largefile: 1 worker, %d pages/batch, Ctrl-C to stop\n",
>> 	       NR_PAGES);
>>
>> 	pthread_t compact, worker;
>> 	pthread_create(&compact, NULL, compact_fn, NULL);
>> 	pthread_create(&worker, NULL, worker_fn, (void *)(long)fd);
>>
>> 	pthread_join(worker, NULL);
>> 	return 0;
>> }
>>
>>>> ---
>>>>  mm/compaction.c | 3 ++-
>>>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/mm/compaction.c b/mm/compaction.c
>>>> index b776f35ad020..7e07b792bcb5 100644
>>>> --- a/mm/compaction.c
>>>> +++ b/mm/compaction.c
>>>> @@ -1116,7 +1116,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
>>>>  		is_unevictable = folio_test_unevictable(folio);
>>>>  
>>>>  		/* Compaction might skip unevictable pages but CMA takes them */
>>>> -		if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
>>>> +		if (!(mode & ISOLATE_UNEVICTABLE) &&
>>>> +		    (is_unevictable || folio_test_mlocked(folio)))
>>>>  			goto isolate_fail_put;
>>>>  
>>>>  		/*
>>>
>>
> 


^ permalink raw reply

* Re: [PATCH v3 2/2] tracing: Remove trace_printk.h from kernel.h
From: David Laight @ 2026-06-24 10:11 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
	Mathieu Desnoyers, Andrew Morton, Linus Torvalds,
	Sebastian Andrzej Siewior, John Ogness, Thomas Gleixner,
	Peter Zijlstra, Julia Lawall, Yury Norov
In-Reply-To: <20260624081948.301578807@kernel.org>

On Wed, 24 Jun 2026 04:18:08 -0400
Steven Rostedt <rostedt@kernel.org> wrote:

> From: Steven Rostedt <rostedt@goodmis.org>
> 
> There have been complaints about trace_printk.h causing more build time
> for being in kernel.h. Move it out of kernel.h and place it in the headers
> and C files that use it.
> 
> Link: https://lore.kernel.org/all/CAHk-=wikCBeVFjVXiY4o-oepdbjAoir5+TcAgtL12c4u1TpZLQ@mail.gmail.com/

That is all about changes to the file causing everything to be rebuilt,
not the contents of the file slowing down builds.
The two are different.

The part you are moving out of normal builds is just a few #defines.
They won't have a significant effect on build times either.

So there is no point splitting out trace_controls.h.

	David



^ permalink raw reply

* Re: [PATCH v6 6/8] Documentation: bootconfig: document build-time cmdline rendering
From: Masami Hiramatsu @ 2026-06-24  8:47 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Andrew Morton, Nathan Chancellor, paulmck, Nicolas Schier,
	Nick Desaulniers, Bill Wendling, Justin Stitt, Jonathan Corbet,
	Shuah Khan, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, llvm, linux-doc,
	kernel-team
In-Reply-To: <20260623-bootconfig_using_tools-v6-6-640c2f587a3c@debian.org>

On Tue, 23 Jun 2026 09:15:33 -0700
Breno Leitao <leitao@debian.org> wrote:

> Add a section describing CONFIG_CMDLINE_FROM_BOOTCONFIG: what it
> does (renders the embedded "kernel" subtree to a flat cmdline at
> build time so early_param() handlers see the values), what it
> requires (BOOT_CONFIG_EMBED, a non-empty BOOT_CONFIG_EMBED_FILE,
> and ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG -- currently x86 only),
> the bootconfig opt-in semantics, the initrd-vs-embedded precedence,
> and the soft-error overflow behavior.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  Documentation/admin-guide/bootconfig.rst | 81 ++++++++++++++++++++++++++++++++
>  1 file changed, 81 insertions(+)
> 
> diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
> index f712758472d5c..349cefbb2bbcd 100644
> --- a/Documentation/admin-guide/bootconfig.rst
> +++ b/Documentation/admin-guide/bootconfig.rst
> @@ -234,6 +234,87 @@ Kconfig option selected.
>  Note that even if you set this option, you can override the embedded
>  bootconfig by another bootconfig which attached to the initrd.
>  
> +Rendering Embedded kernel.* Keys at Build Time
> +----------------------------------------------
> +
> +By default, the embedded bootconfig (``CONFIG_BOOT_CONFIG_EMBED=y``) is
> +parsed at runtime, after ``parse_early_param()`` has already run. Early
> +parameter handlers (``mem=``, ``earlycon=``, ``loglevel=``, ...) therefore
> +cannot see values supplied via the embedded ``kernel`` subtree.
> +
> +``CONFIG_CMDLINE_FROM_BOOTCONFIG`` resolves this by rendering the
> +``kernel`` subtree of ``CONFIG_BOOT_CONFIG_EMBED_FILE`` into a flat cmdline
> +string at kernel build time (via ``tools/bootconfig -C``) and prepending
> +it to ``boot_command_line`` during early architecture setup, so the keys
> +are visible to ``parse_early_param()``.
> +
> +The option requires ``CONFIG_BOOT_CONFIG_EMBED=y``, a non-empty
> +``CONFIG_BOOT_CONFIG_EMBED_FILE``, and an architecture that selects
> +``CONFIG_ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG``. Currently only x86
> +selects it; on other architectures the embedded bootconfig still works,
> +but only through the late runtime parser.

As commented by Sashiko, we need to mention here that this option requires
CONFIG_CMDLINE to be empty. This means the user can NOT set both options
at once. (It also means the user doesn't have to worry about configuration
conflicts.)

Thanks,



-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v6 8/8] x86/setup: prepend embedded bootconfig cmdline before parse_early_param
From: Masami Hiramatsu @ 2026-06-24  8:47 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Andrew Morton, Nathan Chancellor, paulmck, Nicolas Schier,
	Nick Desaulniers, Bill Wendling, Justin Stitt, Jonathan Corbet,
	Shuah Khan, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, llvm, linux-doc,
	kernel-team
In-Reply-To: <20260623-bootconfig_using_tools-v6-8-640c2f587a3c@debian.org>

On Tue, 23 Jun 2026 09:15:35 -0700
Breno Leitao <leitao@debian.org> wrote:

> Call xbc_prepend_embedded_cmdline() in setup_arch() right after the
> CONFIG_CMDLINE merge and before strscpy(command_line, ...) so the
> build-time-rendered embedded bootconfig "kernel" subtree is part of
> boot_command_line by the time parse_early_param() runs. early_param()
> handlers (mem=, earlycon=, loglevel=, ...) now see values supplied via
> CONFIG_BOOT_CONFIG_EMBED_FILE without parsing bootconfig at runtime.
> 
> Gate the prepend on the same opt-in the runtime parser uses: prepend
> when "bootconfig" is present on the command line, or when
> CONFIG_BOOT_CONFIG_FORCE is set. Detect it with parse_args(), exactly
> as setup_boot_config() does, so both agree on what counts as opt-in:
> any "bootconfig" key regardless of value (bare, =0, =1, ...), and only
> before the "--" that separates init arguments. Sharing the parser keeps
> the early and late paths from diverging -- e.g. "bootconfig=0" or a
> "-- bootconfig" meant for init must not apply the embedded keys early
> while the runtime parser skips them.
> 
> The prepend necessarily runs before setup_boot_config() detects an
> initrd bootconfig, so an initrd cannot override the embedded "kernel"
> keys for early_param(). This is intentional: the embedded cmdline acts
> like a build-time CONFIG_CMDLINE. An initrd bootconfig's "kernel" keys
> never reached early_param() anyway (they apply late via
> extra_command_line), so nothing is lost -- the initrd keys still apply
> late, with last-wins keeping the embedded values in effect.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  arch/x86/Kconfig        |  1 +
>  arch/x86/kernel/setup.c | 43 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 44 insertions(+)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 0de23e6471973..8ab11199c16d5 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -127,6 +127,7 @@ config X86
>  	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
>  	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096
>  	select ARCH_SUPPORTS_CFI		if X86_64
> +	select ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG
>  	select ARCH_USES_CFI_TRAPS		if X86_64 && CFI
>  	select ARCH_SUPPORTS_LTO_CLANG
>  	select ARCH_SUPPORTS_LTO_CLANG_THIN
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 46882ce79c3a4..c973a2cebcd04 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -6,6 +6,7 @@
>   * parts of early kernel initialization.
>   */
>  #include <linux/acpi.h>
> +#include <linux/bootconfig.h>
>  #include <linux/console.h>
>  #include <linux/cpu.h>
>  #include <linux/crash_dump.h>
> @@ -881,6 +882,37 @@ static void __init x86_report_nx(void)
>   * Note: On x86_64, fixmaps are ready for use even before this is called.
>   */
>  
> +#ifdef CONFIG_CMDLINE_FROM_BOOTCONFIG
> +static int __init bootconfig_optin(char *param, char *val,
> +				   const char *unused, void *arg)
> +{
> +	if (!strcmp(param, "bootconfig"))
> +		*(bool *)arg = true;
> +	return 0;
> +}
> +
> +/*
> + * Did the user opt in to bootconfig on the kernel command line? Use
> + * parse_args() so this matches setup_boot_config() exactly, including
> + * stopping at the "--" that separates init arguments.
> + */
> +static bool __init bootconfig_cmdline_requested(void)
> +{
> +	static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
> +	bool found = false;
> +
> +	if (IS_ENABLED(CONFIG_BOOT_CONFIG_FORCE))
> +		return true;
> +
> +	strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
> +	if (IS_ERR(parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0,
> +			      &found, bootconfig_optin)))
> +		return false;
> +
> +	return found;
> +}

It seems that this should be placed in a common place because it will
be used from other architectures (and init/main.c too). Maybe we can
introduce something like this?

bool __init bootconfig_cmdline_requested(const char *boot_cmdline, int *end_offset);

Thanks,

> +#endif
> +
>  void __init setup_arch(char **cmdline_p)
>  {
>  #ifdef CONFIG_X86_32
> @@ -924,6 +956,17 @@ void __init setup_arch(char **cmdline_p)
>  	builtin_cmdline_added = true;
>  #endif
>  
> +#ifdef CONFIG_CMDLINE_FROM_BOOTCONFIG
> +	/*
> +	 * Prepend the build-time-rendered embedded "kernel" keys here so
> +	 * parse_early_param() below sees them, gating on the same opt-in
> +	 * as the runtime parser (see bootconfig_cmdline_requested()).
> +	 */
> +	if (bootconfig_cmdline_requested())
> +		xbc_prepend_embedded_cmdline(boot_command_line,
> +					     COMMAND_LINE_SIZE);
> +#endif
> +
>  	strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
>  	*cmdline_p = command_line;
>  
> 
> -- 
> 2.53.0-Meta
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH v3 1/2] tracing: Move non-trace_printk prototypes into trace_controls.h
From: Steven Rostedt @ 2026-06-24  8:18 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Linus Torvalds, Sebastian Andrzej Siewior, John Ogness,
	Thomas Gleixner, Peter Zijlstra, Julia Lawall, Yury Norov
In-Reply-To: <20260624081806.120105649@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

In order to remove the include of trace_printk.h from kernel.h, the tracing
control prototypes need to be separated into their own header file as they
are used in other common header files like rcu.h. There's no point in
removing trace_printk.h from kernel.h if it just gets added back to other
common headers.

Prototypes are very cheap for the compiler and should not be an issue.

ftrace_dump() and trace_dump_stack() are also moved into trace_controls.h,
as they are used in cases where things go wrong. The main use case is to
do a trace_dump_stack(); tracing_off(); ftrace_dump(); in a place that
detected that something went wrong, whereas, trace_printk() is added to
normal code during debugging and removed before committing upstream. The
dump code is fine to keep in production.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
Changes since v2: https://patch.msgid.link/20260622131029.655382134@kernel.org

- Updated the change log

 arch/powerpc/xmon/xmon.c         |  1 +
 arch/s390/kernel/ipl.c           |  1 +
 arch/s390/kernel/machine_kexec.c |  1 +
 drivers/gpu/drm/i915/i915_gem.h  |  1 +
 drivers/tty/sysrq.c              |  1 +
 include/linux/trace_controls.h   | 54 ++++++++++++++++++++++++++++++++
 include/linux/trace_printk.h     | 51 ------------------------------
 kernel/debug/debug_core.c        |  1 +
 kernel/panic.c                   |  1 +
 kernel/rcu/rcu.h                 |  2 ++
 kernel/rcu/rcutorture.c          |  1 +
 kernel/trace/trace.h             |  1 +
 kernel/trace/trace_benchmark.c   |  1 +
 lib/sys_info.c                   |  1 +
 14 files changed, 67 insertions(+), 51 deletions(-)
 create mode 100644 include/linux/trace_controls.h

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index cb3a3244ae6f..2135f319e0dd 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/security.h>
 #include <linux/debugfs.h>
+#include <linux/trace_controls.h>
 
 #include <asm/ptrace.h>
 #include <asm/smp.h>
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 3c346b02ceb9..baac66cc4de4 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -22,6 +22,7 @@
 #include <linux/debug_locks.h>
 #include <linux/vmalloc.h>
 #include <linux/secure_boot.h>
+#include <linux/trace_controls.h>
 #include <asm/asm-extable.h>
 #include <asm/machine.h>
 #include <asm/diag.h>
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index baeb3dcfc1c8..33f9a89eb3ad 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -12,6 +12,7 @@
 #include <linux/delay.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
+#include <linux/trace_controls.h>
 #include <linux/debug_locks.h>
 #include <linux/cpufeature.h>
 #include <asm/guarded_storage.h>
diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index 20b3cb29cfff..1da8fb61c09e 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -116,6 +116,7 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file);
 #endif
 
 #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)
+#include <linux/trace_controls.h>
 #define GEM_TRACE(...) trace_printk(__VA_ARGS__)
 #define GEM_TRACE_ERR(...) do {						\
 	pr_err(__VA_ARGS__);						\
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index c2e4b31b699a..d3f72dc430b8 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -324,6 +324,7 @@ static const struct sysrq_key_op sysrq_showstate_blocked_op = {
 };
 
 #ifdef CONFIG_TRACING
+#include <linux/trace_controls.h>
 #include <linux/ftrace.h>
 
 static void sysrq_ftrace_dump(u8 key)
diff --git a/include/linux/trace_controls.h b/include/linux/trace_controls.h
new file mode 100644
index 000000000000..995b97e963b4
--- /dev/null
+++ b/include/linux/trace_controls.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_TRACE_CONTROLS_H
+#define _LINUX_TRACE_CONTROLS_H
+
+
+/*
+ * General tracing related utility functions - trace_printk(),
+ * tracing_on/tracing_off and tracing_start()/tracing_stop
+ *
+ * Use tracing_on/tracing_off when you want to quickly turn on or off
+ * tracing. It simply enables or disables the recording of the trace events.
+ * This also corresponds to the user space /sys/kernel/tracing/tracing_on
+ * file, which gives a means for the kernel and userspace to interact.
+ * Place a tracing_off() in the kernel where you want tracing to end.
+ * From user space, examine the trace, and then echo 1 > tracing_on
+ * to continue tracing.
+ *
+ * tracing_stop/tracing_start has slightly more overhead. It is used
+ * by things like suspend to ram where disabling the recording of the
+ * trace is not enough, but tracing must actually stop because things
+ * like calling smp_processor_id() may crash the system.
+ *
+ * Most likely, you want to use tracing_on/tracing_off.
+ */
+enum ftrace_dump_mode {
+	DUMP_NONE,
+	DUMP_ALL,
+	DUMP_ORIG,
+	DUMP_PARAM,
+};
+
+#ifdef CONFIG_TRACING
+void tracing_on(void);
+void tracing_off(void);
+int tracing_is_on(void);
+void tracing_snapshot(void);
+void tracing_snapshot_alloc(void);
+void tracing_start(void);
+void tracing_stop(void);
+void trace_dump_stack(int skip);
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
+#else
+static inline void tracing_start(void) { }
+static inline void tracing_stop(void) { }
+static inline void tracing_on(void) { }
+static inline void tracing_off(void) { }
+static inline int tracing_is_on(void) { return 0; }
+static inline void tracing_snapshot(void) { }
+static inline void tracing_snapshot_alloc(void) { }
+static inline void trace_dump_stack(int skip) { }
+static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
+#endif
+
+#endif /* _LINUX_TRACE_CONTROLS_H */
diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
index 3d54f440dccf..a488ea9e9f85 100644
--- a/include/linux/trace_printk.h
+++ b/include/linux/trace_printk.h
@@ -7,43 +7,7 @@
 #include <linux/stddef.h>
 #include <linux/stringify.h>
 
-/*
- * General tracing related utility functions - trace_printk(),
- * tracing_on/tracing_off and tracing_start()/tracing_stop
- *
- * Use tracing_on/tracing_off when you want to quickly turn on or off
- * tracing. It simply enables or disables the recording of the trace events.
- * This also corresponds to the user space /sys/kernel/tracing/tracing_on
- * file, which gives a means for the kernel and userspace to interact.
- * Place a tracing_off() in the kernel where you want tracing to end.
- * From user space, examine the trace, and then echo 1 > tracing_on
- * to continue tracing.
- *
- * tracing_stop/tracing_start has slightly more overhead. It is used
- * by things like suspend to ram where disabling the recording of the
- * trace is not enough, but tracing must actually stop because things
- * like calling smp_processor_id() may crash the system.
- *
- * Most likely, you want to use tracing_on/tracing_off.
- */
-
-enum ftrace_dump_mode {
-	DUMP_NONE,
-	DUMP_ALL,
-	DUMP_ORIG,
-	DUMP_PARAM,
-};
-
 #ifdef CONFIG_TRACING
-void tracing_on(void);
-void tracing_off(void);
-int tracing_is_on(void);
-void tracing_snapshot(void);
-void tracing_snapshot_alloc(void);
-
-extern void tracing_start(void);
-extern void tracing_stop(void);
-
 static inline __printf(1, 2)
 void ____trace_printk_check_format(const char *fmt, ...)
 {
@@ -149,8 +113,6 @@ int __trace_printk(unsigned long ip, const char *fmt, ...);
 extern int __trace_bputs(unsigned long ip, const char *str);
 extern int __trace_puts(unsigned long ip, const char *str);
 
-extern void trace_dump_stack(int skip);
-
 /*
  * The double __builtin_constant_p is because gcc will give us an error
  * if we try to allocate the static variable to fmt if it is not a
@@ -173,19 +135,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
 
 extern __printf(2, 0) int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
-
-extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode);
 #else
-static inline void tracing_start(void) { }
-static inline void tracing_stop(void) { }
-static inline void trace_dump_stack(int skip) { }
-
-static inline void tracing_on(void) { }
-static inline void tracing_off(void) { }
-static inline int tracing_is_on(void) { return 0; }
-static inline void tracing_snapshot(void) { }
-static inline void tracing_snapshot_alloc(void) { }
-
 static inline __printf(1, 2)
 int trace_printk(const char *fmt, ...)
 {
@@ -196,7 +146,6 @@ ftrace_vprintk(const char *fmt, va_list ap)
 {
 	return 0;
 }
-static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #endif /* CONFIG_TRACING */
 
 #endif
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index b276504c1c6b..f9c83a470c98 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -27,6 +27,7 @@
 
 #define pr_fmt(fmt) "KGDB: " fmt
 
+#include <linux/trace_controls.h>
 #include <linux/pid_namespace.h>
 #include <linux/clocksource.h>
 #include <linux/serial_core.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index 213725b612aa..1415e910371d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -9,6 +9,7 @@
  * This function is used through-out the kernel (including mm and fs)
  * to indicate a major problem.
  */
+#include <linux/trace_controls.h>
 #include <linux/debug_locks.h>
 #include <linux/sched/debug.h>
 #include <linux/interrupt.h>
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index fa6d30ce73d1..b3e2c8f25a4f 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -280,6 +280,8 @@ extern int rcu_cpu_stall_notifiers;
 
 #ifdef CONFIG_RCU_STALL_COMMON
 
+#include <linux/trace_controls.h>
+
 extern int rcu_cpu_stall_ftrace_dump;
 extern int rcu_cpu_stall_suppress;
 extern int rcu_cpu_stall_timeout;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 882a158ada7b..76bf0184b267 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -39,6 +39,7 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/trace_clock.h>
+#include <linux/trace_controls.h>
 #include <asm/byteorder.h>
 #include <linux/torture.h>
 #include <linux/vmalloc.h>
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 80fe152af1dd..2537c33ddd49 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,6 +22,7 @@
 #include <linux/ctype.h>
 #include <linux/once_lite.h>
 #include <linux/ftrace_regs.h>
+#include <linux/trace_controls.h>
 #include <linux/llist.h>
 
 #include "pid_list.h"
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e19c32f2a938..69cc39008c36 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -3,6 +3,7 @@
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/trace_clock.h>
+#include <linux/trace_controls.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace_benchmark.h"
diff --git a/lib/sys_info.c b/lib/sys_info.c
index f32a06ec9ed4..e3c9ca05601b 100644
--- a/lib/sys_info.c
+++ b/lib/sys_info.c
@@ -8,6 +8,7 @@
 #include <linux/ftrace.h>
 #include <linux/nmi.h>
 #include <linux/sched/debug.h>
+#include <linux/trace_controls.h>
 #include <linux/string.h>
 #include <linux/sysctl.h>
 
-- 
2.53.0



^ permalink raw reply related

* [PATCH v3 2/2] tracing: Remove trace_printk.h from kernel.h
From: Steven Rostedt @ 2026-06-24  8:18 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Linus Torvalds, Sebastian Andrzej Siewior, John Ogness,
	Thomas Gleixner, Peter Zijlstra, Julia Lawall, Yury Norov
In-Reply-To: <20260624081806.120105649@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

There have been complaints about trace_printk.h causing more build time
for being in kernel.h. Move it out of kernel.h and place it in the headers
and C files that use it.

Link: https://lore.kernel.org/all/CAHk-=wikCBeVFjVXiY4o-oepdbjAoir5+TcAgtL12c4u1TpZLQ@mail.gmail.com/

Suggested-by: Yury Norov <yury.norov@gmail.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
Changes since v2: https://patch.msgid.link/20260622131029.816825024@kernel.org

- Remove #ifdef DEBUG and just always include trace_printk.h in dummy_stm.c.

 arch/powerpc/kvm/book3s_xics.c         | 1 +
 drivers/gpu/drm/i915/gt/intel_gtt.h    | 1 +
 drivers/gpu/drm/i915/i915_gem.h        | 1 +
 drivers/hwtracing/stm/dummy_stm.c      | 1 +
 drivers/infiniband/hw/hfi1/trace_dbg.h | 1 +
 drivers/usb/early/xhci-dbc.c           | 1 +
 fs/ext4/inline.c                       | 1 +
 include/linux/ftrace.h                 | 2 ++
 include/linux/kernel.h                 | 1 -
 include/linux/sunrpc/debug.h           | 1 +
 include/linux/trace_printk.h           | 5 +++--
 kernel/trace/ring_buffer_benchmark.c   | 1 +
 samples/fprobe/fprobe_example.c        | 1 +
 samples/ftrace/ftrace-direct-too.c     | 1 -
 samples/trace_printk/trace-printk.c    | 1 +
 15 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 74a44fa702b0..ef5eb596a56e 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -26,6 +26,7 @@
 #if 1
 #define XICS_DBG(fmt...) do { } while (0)
 #else
+#include <linux/trace_printk.h>
 #define XICS_DBG(fmt...) trace_printk(fmt)
 #endif
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
index b54ee4f25af1..f6f223090760 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -35,6 +35,7 @@
 #define I915_GFP_ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
 
 #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GTT)
+#include <linux/trace_printk.h>
 #define GTT_TRACE(...) trace_printk(__VA_ARGS__)
 #else
 #define GTT_TRACE(...)
diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index 1da8fb61c09e..f490052e8964 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -117,6 +117,7 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file);
 
 #if IS_ENABLED(CONFIG_DRM_I915_TRACE_GEM)
 #include <linux/trace_controls.h>
+#include <linux/trace_printk.h>
 #define GEM_TRACE(...) trace_printk(__VA_ARGS__)
 #define GEM_TRACE_ERR(...) do {						\
 	pr_err(__VA_ARGS__);						\
diff --git a/drivers/hwtracing/stm/dummy_stm.c b/drivers/hwtracing/stm/dummy_stm.c
index 38528ffdc0b3..7c5e48ebfb9f 100644
--- a/drivers/hwtracing/stm/dummy_stm.c
+++ b/drivers/hwtracing/stm/dummy_stm.c
@@ -8,6 +8,7 @@
  */
 
 #undef DEBUG
+#include <linux/trace_printk.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
index 58304b91380f..30df5e246586 100644
--- a/drivers/infiniband/hw/hfi1/trace_dbg.h
+++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
@@ -103,6 +103,7 @@ __hfi1_trace_def(IOCTL);
  */
 
 #ifdef HFI1_EARLY_DBG
+#include <linux/trace_printk.h>
 #define hfi1_dbg_early(fmt, ...) \
 	trace_printk(fmt, ##__VA_ARGS__)
 #else
diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c
index 41118bba9197..955c73bd601f 100644
--- a/drivers/usb/early/xhci-dbc.c
+++ b/drivers/usb/early/xhci-dbc.c
@@ -30,6 +30,7 @@ static struct xdbc_state xdbc;
 static bool early_console_keep;
 
 #ifdef XDBC_TRACE
+#include <linux/trace_printk.h>
 #define	xdbc_trace	trace_printk
 #else
 static inline void xdbc_trace(const char *fmt, ...) { }
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8045e4ff270c..0eff4a0c6a6c 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -934,6 +934,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
 }
 
 #ifdef INLINE_DIR_DEBUG
+#include <linux/trace_printk.h>
 void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
 			  void *inline_start, int inline_size)
 {
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 02bc5027523a..b5336a81e619 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -8,6 +8,8 @@
 #define _LINUX_FTRACE_H
 
 #include <linux/trace_recursion.h>
+#include <linux/trace_controls.h>
+#include <linux/trace_printk.h>
 #include <linux/trace_clock.h>
 #include <linux/jump_label.h>
 #include <linux/kallsyms.h>
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e5570a16cbb1..e87a40fbd152 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -31,7 +31,6 @@
 #include <linux/build_bug.h>
 #include <linux/sprintf.h>
 #include <linux/static_call_types.h>
-#include <linux/trace_printk.h>
 #include <linux/util_macros.h>
 #include <linux/wordpart.h>
 
diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h
index ab61bed2f7af..7524f5d82fba 100644
--- a/include/linux/sunrpc/debug.h
+++ b/include/linux/sunrpc/debug.h
@@ -29,6 +29,7 @@ extern unsigned int		nlm_debug;
 # define ifdebug(fac)		if (unlikely(rpc_debug & RPCDBG_##fac))
 
 # if IS_ENABLED(CONFIG_SUNRPC_DEBUG_TRACE)
+#  include <linux/trace_printk.h>
 #  define __sunrpc_printk(fmt, ...)	trace_printk(fmt, ##__VA_ARGS__)
 # else
 #  define __sunrpc_printk(fmt, ...)	printk(KERN_DEFAULT fmt, ##__VA_ARGS__)
diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
index a488ea9e9f85..74ce4f8995c4 100644
--- a/include/linux/trace_printk.h
+++ b/include/linux/trace_printk.h
@@ -1,11 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_TRACE_PRINTK_H
 #define _LINUX_TRACE_PRINTK_H
+#if !defined(__ASSEMBLY__) && !defined(__GENKSYMS__) && !defined(BUILD_VDSO)
 
-#include <linux/compiler_attributes.h>
 #include <linux/instruction_pointer.h>
 #include <linux/stddef.h>
 #include <linux/stringify.h>
+#include <linux/stdarg.h>
 
 #ifdef CONFIG_TRACING
 static inline __printf(1, 2)
@@ -147,5 +148,5 @@ ftrace_vprintk(const char *fmt, va_list ap)
 	return 0;
 }
 #endif /* CONFIG_TRACING */
-
+#endif /* !defined(__ASSEMBLY__) && !defined(__GENKSYMS__) && !defined(BUILD_VDSO) */
 #endif
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 593e3b59e42e..2bb25caebb75 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/trace_printk.h>
 #include <linux/completion.h>
 #include <linux/kthread.h>
 #include <uapi/linux/sched/types.h>
diff --git a/samples/fprobe/fprobe_example.c b/samples/fprobe/fprobe_example.c
index bfe98ce826f3..de81b9b4ca7d 100644
--- a/samples/fprobe/fprobe_example.c
+++ b/samples/fprobe/fprobe_example.c
@@ -12,6 +12,7 @@
 
 #define pr_fmt(fmt) "%s: " fmt, __func__
 
+#include <linux/trace_printk.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/fprobe.h>
diff --git a/samples/ftrace/ftrace-direct-too.c b/samples/ftrace/ftrace-direct-too.c
index bf2411aa6fd7..159190f4103f 100644
--- a/samples/ftrace/ftrace-direct-too.c
+++ b/samples/ftrace/ftrace-direct-too.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/module.h>
-
 #include <linux/mm.h> /* for handle_mm_fault() */
 #include <linux/ftrace.h>
 #if !defined(CONFIG_ARM64) && !defined(CONFIG_PPC32)
diff --git a/samples/trace_printk/trace-printk.c b/samples/trace_printk/trace-printk.c
index cfc159580263..ff37aeb8523e 100644
--- a/samples/trace_printk/trace-printk.c
+++ b/samples/trace_printk/trace-printk.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
+#include <linux/trace_printk.h>
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/irq_work.h>
-- 
2.53.0



^ permalink raw reply related

* [PATCH v3 0/2] tracing: Remove trace_printk.h from kernel.h
From: Steven Rostedt @ 2026-06-24  8:18 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Linus Torvalds, Sebastian Andrzej Siewior, John Ogness,
	Thomas Gleixner, Peter Zijlstra, Julia Lawall, Yury Norov

Remove trace_printk.h from kernel.h. Create a trace_controls.h for the places
that need access to tracing prototypes like tracing_off(), and include
trace_printk.h directly in the places that use trace_printk().

Changes since v2: https://lore.kernel.org/all/20260622130739.375198646@kernel.org/

- Update change log in patch 1

- Remove #ifdef DEBUG and always include trace_printk.h in patch 2.

Steven Rostedt (2):
      tracing: Move non-trace_printk prototypes into trace_controls.h
      tracing: Remove trace_printk.h from kernel.h

----
 arch/powerpc/kvm/book3s_xics.c         |  1 +
 arch/powerpc/xmon/xmon.c               |  1 +
 arch/s390/kernel/ipl.c                 |  1 +
 arch/s390/kernel/machine_kexec.c       |  1 +
 drivers/gpu/drm/i915/gt/intel_gtt.h    |  1 +
 drivers/gpu/drm/i915/i915_gem.h        |  2 ++
 drivers/hwtracing/stm/dummy_stm.c      |  1 +
 drivers/infiniband/hw/hfi1/trace_dbg.h |  1 +
 drivers/tty/sysrq.c                    |  1 +
 drivers/usb/early/xhci-dbc.c           |  1 +
 fs/ext4/inline.c                       |  1 +
 include/linux/ftrace.h                 |  2 ++
 include/linux/kernel.h                 |  1 -
 include/linux/sunrpc/debug.h           |  1 +
 include/linux/trace_controls.h         | 54 ++++++++++++++++++++++++++++++++
 include/linux/trace_printk.h           | 56 ++--------------------------------
 kernel/debug/debug_core.c              |  1 +
 kernel/panic.c                         |  1 +
 kernel/rcu/rcu.h                       |  2 ++
 kernel/rcu/rcutorture.c                |  1 +
 kernel/trace/ring_buffer_benchmark.c   |  1 +
 kernel/trace/trace.h                   |  1 +
 kernel/trace/trace_benchmark.c         |  1 +
 lib/sys_info.c                         |  1 +
 samples/fprobe/fprobe_example.c        |  1 +
 samples/ftrace/ftrace-direct-too.c     |  1 -
 samples/trace_printk/trace-printk.c    |  1 +
 27 files changed, 83 insertions(+), 55 deletions(-)
 create mode 100644 include/linux/trace_controls.h

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox