Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH v8 03/10] tracing/probes: Support dumping fetcharg program for debugging dynamic events
From: Masami Hiramatsu (Google) @ 2026-06-24 14:41 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178231208703.732967.1160700962651040729.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

For debugging probe events, it is helpful to verify the compiled
fetch instructions for each probe argument. This introduces a new
kernel config CONFIG_PROBE_EVENTS_DUMP_FETCHARG to decode the
instruction sequence of each argument and display it under a
commented line starting with '#' immediately following the dynamic
event definition (such as in dynamic_events, kprobe_events,
uprobe_events, etc.).

For example:
 /sys/kernel/tracing # cat dynamic_events
 p:kprobes/p_vfs_read_0 vfs_read arg1=+0(file):ustring arg2=%ax:x16
 #  arg1: ARG(0) -> ST_USTRING(offset=0,size=4) -> END
 #  arg2: REG(80) -> ST_RAW(size=2) -> END

Assisted-by: Antigravity:gemini-3.5-flash
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v8:
  - State this feature is only for debugging probe events.
  - Fix dependency list after description in Kconfig.
 Changes in v7:
   - Show trace event field name for FETCH_OP_TP_ARG.
   - Show immediate string value for FETCH_OP_IMMSTR.
   - Fix style issues warned by checkpatch.pl.
 Changes in v6:
   - Newly added.
---
 kernel/trace/Kconfig        |   12 +++++
 kernel/trace/trace_eprobe.c |    2 +
 kernel/trace/trace_fprobe.c |    2 +
 kernel/trace/trace_kprobe.c |    2 +
 kernel/trace/trace_probe.c  |   96 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_probe.h  |   79 +++++++++++++++++++++--------------
 kernel/trace/trace_uprobe.c |    3 +
 7 files changed, 164 insertions(+), 32 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e130da35808f..ca78727ad121 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -779,6 +779,18 @@ config PROBE_EVENTS_BTF_ARGS
 	  kernel function entry or a tracepoint.
 	  This is available only if BTF (BPF Type Format) support is enabled.
 
+config PROBE_EVENTS_DUMP_FETCHARG
+	bool "Dump of dynamic probe event fetch-arguments"
+	depends on PROBE_EVENTS
+	default n
+	help
+	  This shows the dump of fetch-arguments of dynamic probe events
+	  alongside their event definitions in the dynamic_events file
+	  as comment lines. This is useful to debug the probe events.
+	  Since this exposes the raw values in the dynamic_events file,
+	  it might be a security risk. Only enable it if you need to debug
+	  probe events themselves.
+
 config KPROBE_EVENTS
 	depends on KPROBES
 	depends on HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 50518b071414..462c31145733 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -87,6 +87,8 @@ static int eprobe_dyn_event_show(struct seq_file *m, struct dyn_event *ev)
 		seq_printf(m, " %s=%s", ep->tp.args[i].name, ep->tp.args[i].comm);
 	seq_putc(m, '\n');
 
+	trace_probe_dump_args(m, &ep->tp);
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index 4d1abbf66229..536781cd4c47 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -1449,6 +1449,8 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
 		seq_printf(m, " %s=%s", tf->tp.args[i].name, tf->tp.args[i].comm);
 	seq_putc(m, '\n');
 
+	trace_probe_dump_args(m, &tf->tp);
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index a8420e6abb56..cfa807d8e760 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1320,6 +1320,8 @@ static int trace_kprobe_show(struct seq_file *m, struct dyn_event *ev)
 		seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm);
 	seq_putc(m, '\n');
 
+	trace_probe_dump_args(m, &tk->tp);
+
 	return 0;
 }
 
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 2ce7d62471cb..0908019aea12 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -2403,3 +2403,99 @@ int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_a
 	}
 	return 0;
 }
+
+#ifdef CONFIG_PROBE_EVENTS_DUMP_FETCHARG
+
+struct fetch_op_decode {
+	const char *name;
+	void (*decode)(struct seq_file *m, struct fetch_insn *insn);
+};
+
+static const struct fetch_op_decode fetch_op_decode[];
+
+static void fetcharg_decode_none(struct seq_file *m, struct fetch_insn *insn)
+{
+	seq_puts(m, fetch_op_decode[insn->op].name);
+}
+
+static void fetcharg_decode_param(struct seq_file *m, struct fetch_insn *insn)
+{
+	seq_printf(m, "%s(%u)", fetch_op_decode[insn->op].name, insn->param);
+}
+
+static void fetcharg_decode_imm(struct seq_file *m, struct fetch_insn *insn)
+{
+	seq_printf(m, "%s(0x%lx)", fetch_op_decode[insn->op].name, insn->immediate);
+}
+
+static void fetcharg_decode_string(struct seq_file *m, struct fetch_insn *insn)
+{
+	seq_printf(m, "%s(%s)", fetch_op_decode[insn->op].name, (char *)insn->data);
+}
+
+static void fetcharg_decode_symbol(struct seq_file *m, struct fetch_insn *insn)
+{
+	seq_printf(m, "%s(%s)", fetch_op_decode[insn->op].name, (char *)insn->data);
+}
+
+static void fetcharg_decode_offset(struct seq_file *m, struct fetch_insn *insn)
+{
+	seq_printf(m, "%s(offset=%d)", fetch_op_decode[insn->op].name, insn->offset);
+}
+
+static void fetcharg_decode_store(struct seq_file *m, struct fetch_insn *insn)
+{
+	if (insn->op == FETCH_OP_ST_RAW)
+		seq_printf(m, "%s(size=%u)", fetch_op_decode[insn->op].name, insn->size);
+	else
+		seq_printf(m, "%s(offset=%d,size=%u)", fetch_op_decode[insn->op].name,
+			  insn->offset, insn->size);
+}
+
+static void fetcharg_decode_bf(struct seq_file *m, struct fetch_insn *insn)
+{
+	seq_printf(m, "%s(basesize=%u,lshift=%u,rshift=%u)",
+		   fetch_op_decode[insn->op].name, insn->basesize, insn->lshift, insn->rshift);
+}
+
+static void fetcharg_decode_tp_arg(struct seq_file *m, struct fetch_insn *insn)
+{
+	struct ftrace_event_field *field = insn->data;
+
+	seq_printf(m, "%s(%s)", fetch_op_decode[insn->op].name, field->name);
+}
+
+#define FETCH_OP(opname, decode_fn) \
+	[FETCH_OP_##opname] = { .name = #opname, .decode = fetcharg_decode_##decode_fn }
+
+static const struct fetch_op_decode fetch_op_decode[] = FETCH_OP_LIST;
+#undef FETCH_OP
+
+static void trace_probe_dump_arg(struct seq_file *m, struct probe_arg *parg)
+{
+	int i;
+
+	seq_printf(m, "#  %s: ", parg->name);
+	for (i = 0; i < FETCH_INSN_MAX; i++) {
+		struct fetch_insn *insn = parg->code + i;
+
+		if (insn->op >= ARRAY_SIZE(fetch_op_decode) || !fetch_op_decode[insn->op].decode)
+			seq_printf(m, "unknown(%d)", insn->op);
+		else
+			fetch_op_decode[insn->op].decode(m, insn);
+
+		if (insn->op == FETCH_OP_END)
+			break;
+		seq_puts(m, " -> ");
+	}
+	seq_putc(m, '\n');
+}
+
+void trace_probe_dump_args(struct seq_file *m, struct trace_probe *tp)
+{
+	int i;
+
+	for (i = 0; i < tp->nr_args; i++)
+		trace_probe_dump_arg(m, &tp->args[i]);
+}
+#endif /* CONFIG_PROBE_EVENTS_DUMP_FETCHARG */
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 2e0d8384ee5c..e36cfe39e9a8 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -83,38 +83,46 @@ static nokprobe_inline u32 update_data_loc(u32 loc, int consumed)
 /* Printing function type */
 typedef int (*print_type_func_t)(struct trace_seq *, void *, void *);
 
-enum fetch_op {
-	FETCH_OP_NOP = 0,
-	// Stage 1 (load) ops
-	FETCH_OP_REG,		/* Register : .param = offset */
-	FETCH_OP_STACK,		/* Stack : .param = index */
-	FETCH_OP_STACKP,	/* Stack pointer */
-	FETCH_OP_RETVAL,	/* Return value */
-	FETCH_OP_IMM,		/* Immediate : .immediate */
-	FETCH_OP_COMM,		/* Current comm */
-	FETCH_OP_ARG,		/* Function argument : .param */
-	FETCH_OP_FOFFS,		/* File offset: .immediate */
-	FETCH_OP_IMMSTR,	/* Allocated string: .data */
-	FETCH_OP_EDATA,		/* Entry data: .offset */
-	// Stage 2 (dereference) op
-	FETCH_OP_DEREF,		/* Dereference: .offset */
-	FETCH_OP_UDEREF,	/* User-space Dereference: .offset */
-	// Stage 3 (store) ops
-	FETCH_OP_ST_RAW,	/* Raw: .size */
-	FETCH_OP_ST_MEM,	/* Mem: .offset, .size */
-	FETCH_OP_ST_UMEM,	/* Mem: .offset, .size */
-	FETCH_OP_ST_STRING,	/* String: .offset, .size */
-	FETCH_OP_ST_USTRING,	/* User String: .offset, .size */
-	FETCH_OP_ST_SYMSTR,	/* Kernel Symbol String: .offset, .size */
-	FETCH_OP_ST_EDATA,	/* Store Entry Data: .offset */
-	// Stage 4 (modify) op
-	FETCH_OP_MOD_BF,	/* Bitfield: .basesize, .lshift, .rshift */
-	// Stage 5 (loop) op
-	FETCH_OP_LP_ARRAY,	/* Array: .param = loop count */
-	FETCH_OP_TP_ARG,	/* Trace Point argument */
-	FETCH_OP_END,
-	FETCH_NOP_SYMBOL,	/* Unresolved Symbol holder */
-};
+#define FETCH_OP_LIST	{						\
+	/* Stage 1 (load) ops */					\
+	FETCH_OP(NOP, none),		/* NOP */			\
+	FETCH_OP(REG, param),		/* Register: .param = offset */	\
+	FETCH_OP(STACK, param),		/* Stack: .param = index */	\
+	FETCH_OP(STACKP, none),		/* Stack pointer */		\
+	FETCH_OP(RETVAL, none),		/* Return value */		\
+	FETCH_OP(IMM, imm),		/* Immediate: .immediate */	\
+	FETCH_OP(COMM, none),		/* Current comm */		\
+	FETCH_OP(ARG, param),		/* Argument: .param = index */	\
+	FETCH_OP(FOFFS, imm),		/* File offset: .immediate */	\
+	FETCH_OP(IMMSTR, string),	/* Allocated string: .data */	\
+	FETCH_OP(EDATA, offset),	/* Entry data: .offset */	\
+	FETCH_OP(TP_ARG, tp_arg),	/* Tracepoint argument: .data */\
+	/* Stage 2 (dereference) ops */					\
+	FETCH_OP(DEREF, offset),	/* Dereference: .offset */	\
+	FETCH_OP(UDEREF, offset),	/* User-space dereference: .offset */\
+	/* Stage 3 (store) ops */					\
+	FETCH_OP(ST_RAW, store),	/* Raw value: .size */		\
+	FETCH_OP(ST_MEM, store),	/* Memory: .offset, .size */	\
+	FETCH_OP(ST_UMEM, store),	/* User memory: .offset, .size */\
+	FETCH_OP(ST_STRING, store),	/* String: .offset, .size */	\
+	FETCH_OP(ST_USTRING, store),	/* User string: .offset, .size */\
+	FETCH_OP(ST_SYMSTR, store),	/* Symbol name: .offset, .size */\
+	FETCH_OP(ST_EDATA, offset),	/* Entry data: .offset */	\
+	/* Stage 4 (modify) op */					\
+	FETCH_OP(MOD_BF, bf),		/* Bitfield: .basesize, .lshift, .rshift*/\
+	/* Stage 5 (loop) op */						\
+	FETCH_OP(LP_ARRAY, param),	/* Loop array: .param = count */\
+	/* End */							\
+	FETCH_OP(END, none),						\
+	/* Unresolved Symbol holder */					\
+	FETCH_OP(NOP_SYMBOL, symbol),	/* Non loaded symbol: .data = symbol name */\
+}
+
+#define FETCH_OP(opname, decode_fn) FETCH_OP_##opname
+enum fetch_op FETCH_OP_LIST;
+#undef FETCH_OP
+
+#define FETCH_NOP_SYMBOL FETCH_OP_NOP_SYMBOL
 
 struct fetch_insn {
 	enum fetch_op op;
@@ -370,6 +378,13 @@ bool trace_probe_match_command_args(struct trace_probe *tp,
 int trace_probe_create(const char *raw_command, int (*createfn)(int, const char **));
 int trace_probe_print_args(struct trace_seq *s, struct probe_arg *args, int nr_args,
 		 u8 *data, void *field);
+#ifdef CONFIG_PROBE_EVENTS_DUMP_FETCHARG
+void trace_probe_dump_args(struct seq_file *m, struct trace_probe *tp);
+#else
+static inline void trace_probe_dump_args(struct seq_file *m, struct trace_probe *tp)
+{
+}
+#endif
 
 #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
 int traceprobe_get_entry_data_size(struct trace_probe *tp);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c274346853d1..b2e264a4b96c 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -765,6 +765,9 @@ static int trace_uprobe_show(struct seq_file *m, struct dyn_event *ev)
 		seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
 
 	seq_putc(m, '\n');
+
+	trace_probe_dump_args(m, &tu->tp);
+
 	return 0;
 }
 


^ permalink raw reply related

* [PATCH v8 02/10] tracing/probes: Allow eprobe to use variable without $ prefix
From: Masami Hiramatsu (Google) @ 2026-06-24 14:41 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178231208703.732967.1160700962651040729.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

The commit 69efd863a785 ("tracing/eprobes: Allow use of BTF names
to dereference pointers") allows eprobe to use an event field without
the "$" prefix when it is used with a typecast, so it is natural to
allow it without a typecast as well.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v8:
  - Newly added.
---
 kernel/trace/trace_probe.c                         |   12 +++++++++++-
 kernel/trace/trace_probe.h                         |    1 +
 .../test.d/dynevent/eprobes_syntax_errors.tc       |    3 +--
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 0da7c0b53ba7..2ce7d62471cb 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1341,7 +1341,17 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 		ret = handle_typecast(arg, pcode, end, ctx);
 		break;
 	default:
-		if (isalpha(arg[0]) || arg[0] == '_') {	/* BTF variable */
+		if (isalpha(arg[0]) || arg[0] == '_') {
+			/* BTF variable or event field*/
+			if (ctx->flags & TPARG_FL_TEVENT) {
+				ret = parse_trace_event(arg, *pcode, ctx);
+				if (ret < 0) {
+					trace_probe_log_err(ctx->offset,
+							    NO_EVENT_FIELD);
+					return -EINVAL;
+				}
+				break;
+			}
 			if (!tparg_is_function_entry(ctx->flags) &&
 			    !tparg_is_function_return(ctx->flags)) {
 				trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 40b53b5b58a9..2e0d8384ee5c 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -559,6 +559,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 	C(NO_PTR_STRCT,		"This is not a pointer to union/structure."),	\
 	C(NOSUP_DAT_ARG,	"Non pointer structure/union argument is not supported."),\
 	C(BAD_HYPHEN,		"Failed to parse single hyphen. Forgot '>'?"),	\
+	C(NO_EVENT_FIELD,	"This event field is not found."),	\
 	C(NO_BTF_FIELD,		"This field is not found."),	\
 	C(BAD_BTF_TID,		"Failed to get BTF type info."),\
 	C(BAD_TYPE4STR,		"This type does not fit for string."),\
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
index 2a680c086047..0e65e787e426 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
@@ -10,7 +10,7 @@ check_error() { # command-with-error-pos-by-^
 check_error 'e ^a.'			# NO_EVENT_INFO
 check_error 'e ^.b'			# NO_EVENT_INFO
 check_error 'e ^a.b'			# BAD_ATTACH_EVENT
-check_error 'e syscalls/sys_enter_openat ^foo'	# BAD_ATTACH_ARG
+check_error 'e syscalls/sys_enter_openat ^foo'	# NO_EVENT_FIELD
 check_error 'e:^/bar syscalls/sys_enter_openat'	# NO_GROUP_NAME
 check_error 'e:^12345678901234567890123456789012345678901234567890123456789012345/bar syscalls/sys_enter_openat'	# GROUP_TOO_LONG
 
@@ -19,7 +19,6 @@ check_error 'e:^ syscalls/sys_enter_openat'		# NO_EVENT_NAME
 check_error 'e:foo/^12345678901234567890123456789012345678901234567890123456789012345 syscalls/sys_enter_openat'	# EVENT_TOO_LONG
 check_error 'e:foo/^bar.1 syscalls/sys_enter_openat'	# BAD_EVENT_NAME
 
-check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd'	# BAD_FETCH_ARG
 check_error 'e:foo/bar syscalls/sys_enter_openat arg=^$foo'	# BAD_ATTACH_ARG
 
 if grep -q '<attached-group>\.<attached-event>.*\[if <filter>\]' README; then


^ permalink raw reply related

* [PATCH v8 01/10] tracing/probes: Make the $ prefix mandatory for comm access
From: Masami Hiramatsu (Google) @ 2026-06-24 14:41 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178231208703.732967.1160700962651040729.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since $comm and $COMM are not event fields but special fetcharg
variables to access current->comm, they should not be accessed
without the '$' prefix, even with a typecast.

Fixes: 69efd863a785 ("tracing/eprobes: Allow use of BTF names to dereference pointers")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v8:
  - Newly added.
---
 kernel/trace/trace_probe.c |   12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index c10bbb0df7b9..0da7c0b53ba7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -342,10 +342,6 @@ static int parse_trace_event(char *arg, struct fetch_insn *code,
 	ret = parse_trace_event_arg(arg, code, ctx);
 	if (!ret)
 		return 0;
-	if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
-		code->op = FETCH_OP_COMM;
-		return 0;
-	}
 	return -EINVAL;
 }
 
@@ -1065,8 +1061,14 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
 	int len;
 
 	if (ctx->flags & TPARG_FL_TEVENT) {
-		if (parse_trace_event(arg, code, ctx) < 0)
+		if (parse_trace_event(arg, code, ctx) < 0) {
+			/* 'comm' should be checked after field parsing. */
+			if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
+				code->op = FETCH_OP_COMM;
+				return 0;
+			}
 			goto inval;
+		}
 		return 0;
 	}
 


^ permalink raw reply related

* [PATCH v8 00/10] tracing/probes: Add more typecast features
From: Masami Hiramatsu (Google) @ 2026-06-24 14:41 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest

Hi,

Here is the 8th version of series to introduce more typecast features
to probe events. The previous version is here:

 https://lore.kernel.org/all/178217904992.643090.15726197350652241270.stgit@devnote2/

In this version, I removed the 2 patches that were already picked up
and added 2 new patches (one fix and one feature). The previous BTF
typecast patch allows `(STRUCT)FIELD->MEMBER` without the $ prefix for
eprobes, but it also allows the user to use COMM/comm instead of FIELD.
$COMM/$comm are special variables, so they must keep the $ prefix
[1/10]. However, accessing event fields without the $ prefix is
acceptable in itself, so it is now generally allowed even without a
typecast [2/10].
Other patches have small fixes according to Julian and Sashiko's
comments and are rebased on top of probes/core branch.

This series extends the BTF typecast feature and adds more options:

1. Expanding BTF typecast to kprobe and fprobe.
   (currently only function entry/exit)

2. Introduce a container_of-like typecast. This adds an "assigned
   member" option to the typecast.

   (STRUCT,MEMBER)VAR->ANOTHER_MEMBER

   This casts VAR to STRUCT type but the VAR is as the address
   of STRUCT.MEMBER. In C, it is:

   container_of(VAR, STRUCT, MEMBER)->ANOTHER_MEMBER

3. Support nested typecast, e.g.

   (STRUCT)((STRUCT2)VAR->MEMBER2)->MEMBER

   the nest level must be smaller than 3.

4. Add $current variable to point "current" task_struct.
   This is useful with typecast, e.g.

   (task_struct)$current->pid

5. per-cpu dereference support.

   Introduce this_cpu_read(VAR) and this_cpu_ptr(VAR) to
   access per-cpu data on the current CPU (accessing another CPU's
   data is not stable, because it can change.)

   You can access the member of per-cpu data structure using
   typecast like:

   (STRUCT)this_cpu_ptr(VAR)->MEMBER

6. Support event fields without $ prefix on eprobes.

   Now eprobe events can access its event fields.

And added fetcharg dump feature (for debug) and updated test scripts
to test part of them.

Thanks,

---
base-commit: 18dfb4703cd6af27deb30d628dac2e7db2b24e6a

Masami Hiramatsu (Google) (10):
      tracing/probes: Make the $ prefix mandatory for comm access
      tracing/probes: Allow eprobe to use variable without $ prefix
      tracing/probes: Support dumping fetcharg program for debugging dynamic events
      tracing/probes: Support typecast for various probe events
      tracing/probes: Support nested typecast
      tracing/probes: Type casting always involves nested calls
      tracing/probes: Support field specifier option for typecast
      tracing/probes: Add $current variable support
      tracing/probes: Add this_cpu_read() and this_cpu_ptr() dereference method to fetcharg
      tracing/probes: Add a new testcase for BTF typecasts


 Documentation/trace/eprobetrace.rst                |    9 
 Documentation/trace/fprobetrace.rst                |   10 
 Documentation/trace/kprobetrace.rst                |   11 
 kernel/trace/Kconfig                               |   12 
 kernel/trace/trace.c                               |    8 
 kernel/trace/trace_eprobe.c                        |    2 
 kernel/trace/trace_fprobe.c                        |    2 
 kernel/trace/trace_kprobe.c                        |    2 
 kernel/trace/trace_probe.c                         |  586 ++++++++++++++++----
 kernel/trace/trace_probe.h                         |   99 ++-
 kernel/trace/trace_probe_tmpl.h                    |   25 +
 kernel/trace/trace_uprobe.c                        |    3 
 samples/trace_events/trace-events-sample.c         |   40 +
 samples/trace_events/trace-events-sample.h         |   34 +
 .../ftrace/test.d/dynevent/btf_probe_event.tc      |   51 ++
 .../test.d/dynevent/eprobes_syntax_errors.tc       |    6 
 .../ftrace/test.d/dynevent/fprobe_syntax_errors.tc |   12 
 .../ftrace/test.d/kprobe/kprobe_syntax_errors.tc   |   12 
 .../ftrace/test.d/kprobe/uprobe_syntax_errors.tc   |    5 
 19 files changed, 770 insertions(+), 159 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v8 09/46] KVM: guest_memfd: Introduce function to check GFN private/shared status
From: Ackerley Tng @ 2026-06-24 14:38 UTC (permalink / raw)
  To: Binbin Wu
  Cc: aik, andrew.jones, brauner, chao.p.peng, david, jmattson,
	jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <1b59fec2-a464-4429-8532-880394912af5@linux.intel.com>

Binbin Wu <binbin.wu@linux.intel.com> writes:

>
> [...snip...]
>
>> +bool kvm_gmem_is_private(struct kvm *kvm, gfn_t gfn)
>> +{
>> +	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
>> +	struct inode *inode;
>> +
>> +	/*
>> +	 * If this gfn has no associated memslot, there's no chance of the gfn
>> +	 * being backed by private memory, since guest_memfd must be used for
>> +	 * private memory,
>
> "guest_memfd must be used for private memory" is a bit confusing to me.
>

Hmm good point. Is the source of confusion that guest_memfd can be used
for both shared and private memory?

Perhaps this can be rephrased as:

guest_memfd is the only provider of private memory and guest_memfd must
be used with a memslot, hence if there's no associated memslot, there's
no chance of this gfn being private.

>> and guest_memfd must be associated with some memslot.
>> +	 */
>> +	if (!slot)
>> +		return 0;
>> +
>>
>> [...snip...]
>>

^ permalink raw reply

* Re: [PATCHv4 02/13] uprobes/x86: Remove struct uprobe_trampoline object
From: Oleg Nesterov @ 2026-06-24 14:36 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Peter Zijlstra, Ingo Molnar, Masami Hiramatsu, Andrii Nakryiko,
	bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-3-jolsa@kernel.org>

On 05/26, Jiri Olsa wrote:
>
> Removing struct uprobe_trampoline object and its tracking code,
> because it's not needed. We can do same thing directly on top of
> struct vm_area_struct objects.
>
> This makes the code simpler and allows easy propagation of the
> trampoline vma object into child process in following change.
>
> Note the original code called destroy_uprobe_trampoline if the
> optimization failed, but it only freed the struct uprobe_trampoline
> object, not the vma. The new vma leak is fixed in following change.
>
> Acked-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>

Reviewed-by: Oleg Nesterov <oleg@redhat.com>

---------------------------------------------------------------------
Although I can't convince myself I fully understand this code with or
without this patch ;)

A couple of questions below...

> -static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
> +static struct vm_area_struct *get_uprobe_trampoline(struct mm_struct *mm, unsigned long vaddr)
>  {
> -	struct pt_regs *regs = task_pt_regs(current);
> -	struct mm_struct *mm = current->mm;
> -	struct uprobe_trampoline *tramp;
> +	VMA_ITERATOR(vmi, mm, 0);
>  	struct vm_area_struct *vma;
>
> -	if (!user_64bit_mode(regs))
> -		return NULL;
> +	if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
> +		return ERR_PTR(-EINVAL);

Do we really need this check? It looks a bit confusing to me...
vaddr is bp_vaddr from handle_swbp(), it should be valid?

> +
> +	for_each_vma(vmi, vma) {
> +		if (!vma_is_special_mapping(vma, &tramp_mapping))
> +			continue;
> +		if (is_reachable_by_call(vma->vm_start, vaddr))
> +			return vma;
> +	}

Perhaps we can later optimize this code a bit? I mean something like

	start_reachable = ...;
	end_reachable = ...;

	VMA_ITERATOR(vmi, mm, start_reachable);

	for_each_vma(vmi, vma) {
		if (!vma_is_special_mapping(...))
			continue;
		if (vma->vm_start > end_reachable)
			break;
		return vma;
	}

>  static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
>  				  unsigned long vaddr)
>  {
> -	struct uprobe_trampoline *tramp;
> -	struct vm_area_struct *vma;
> -	bool new = false;
> -	int err = 0;
> +	struct pt_regs *regs = task_pt_regs(current);
> +	struct vm_area_struct *vma, *tramp;
>
> +	if (!user_64bit_mode(regs))
> +		return -EINVAL;
>  	vma = find_vma(mm, vaddr);
>  	if (!vma)
>  		return -EINVAL;

I guess find_vma() can't fail, the caller arch_uprobe_optimize() has called
copy_from_vaddr() under mmap_write_lock()... Nevermind.

Oleg.


^ permalink raw reply

* Re: [PATCH v3 2/2] tracing: Remove trace_printk.h from kernel.h
From: Steven Rostedt @ 2026-06-24 14:32 UTC (permalink / raw)
  To: David Laight
  Cc: linux-kernel, linux-trace-kernel, Masami Hiramatsu, Mark Rutland,
	Mathieu Desnoyers, Andrew Morton, Linus Torvalds,
	Sebastian Andrzej Siewior, John Ogness, Thomas Gleixner,
	Peter Zijlstra, Julia Lawall, Yury Norov
In-Reply-To: <20260624111152.75476a46@pumpkin>

On Wed, 24 Jun 2026 11:11:52 +0100
David Laight <david.laight.linux@gmail.com> wrote:

> That is all about changes to the file causing everything to be rebuilt,
> not the contents of the file slowing down builds.

I guess I should say it better. It causes more build time if that file
changes. That's what I meant. I'll update the wording to say:

   There have been complaints that having trace_printk.h in kernel.h causes
   more build time if it changes. There is also an effort to clean up
   kernel.h to have it not include unneeded header files. Move trace_printk.h
   out of kernel.h and place it in the headers and C files that use it.
> 
> The part you are moving out of normal builds is just a few #defines.
> They won't have a significant effect on build times either.
> 
> So there is no point splitting out trace_controls.h.

That is a completely different reason. trace_printk.h is about
trace_printk() usage. The stuff split out into trace_controls.h has
nothing to do with trace_printk()s.

-- Steve


^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: David Laight @ 2026-06-24 14:23 UTC (permalink / raw)
  To: Christian König
  Cc: Kaitao Cheng, Andrew Morton, David Hildenbrand, Jens Axboe,
	Tejun Heo, Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt, David Howells,
	Simona Vetter, Randy Dunlap, Luca Ceresoli, Philipp Stanner,
	linux-block, linux-kernel, cgroups, linux-ntfs-dev, linux-fsdevel,
	io-uring, audit, bpf, netdev, dri-devel, linux-perf-users,
	linux-trace-kernel, kexec, live-patching, linux-modules,
	linux-crypto, linux-pm, rcu, sched-ext, linux-mm, virtualization,
	damon, llvm, Kaitao Cheng
In-Reply-To: <cf8467c7-b98f-44a5-9cf9-60b43b5da711@amd.com>

On Wed, 24 Jun 2026 15:23:47 +0200
Christian König <christian.koenig@amd.com> wrote:

> On 6/24/26 15:14, Kaitao Cheng wrote:
> > 
> > 
> > 在 2026/6/22 16:42, David Laight 写道:  
> >> On Mon, 22 Jun 2026 12:05:31 +0800
> >> Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
> >>  
> >>> From: Kaitao Cheng <chengkaitao@kylinos.cn>
> >>>
> >>> The list_for_each*_safe() helpers are used when the loop body may
> >>> remove the current entry.  Their API exposes the temporary cursor at
> >>> every call site, even though most users only need it for the iterator
> >>> implementation and never reference it in the loop body.
> >>>
> >>> Add *_mutable() variants for list and hlist iteration.  The new helpers
> >>> support both forms: callers may keep passing an explicit temporary cursor
> >>> when they need to inspect or reset it, or omit it and let the helper use
> >>> a unique internal cursor.  
> >>
> >> I'm not really sure 'mutable' means anything either.
> >> It is possible to make it valid for the loop body (or even other threads)
> >> to delete arbitrary list items - but that needs significant extra overheads.
> >>
> >> It might be worth doing something that doesn't need the extra variable,
> >> but there is little point doing all the churn just to rename things.
> >>  
> >>>
> >>> This makes call sites that only mutate the list through the current entry
> >>> less noisy, while keeping the existing *_safe() helpers available for
> >>> compatibility.
> >>>
> >>> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
> >>> ---
> >>>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
> >>>  1 file changed, 231 insertions(+), 38 deletions(-)
> >>>
> >>> diff --git a/include/linux/list.h b/include/linux/list.h
> >>> index 09d979976b3b..1081def7cea9 100644
> >>> --- a/include/linux/list.h
> >>> +++ b/include/linux/list.h
> >>> @@ -7,6 +7,7 @@
> >>>  #include <linux/stddef.h>
> >>>  #include <linux/poison.h>
> >>>  #include <linux/const.h>
> >>> +#include <linux/args.h>
> >>>  
> >>>  #include <asm/barrier.h>
> >>>  
> >>> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
> >>>  #define list_for_each_prev(pos, head) \
> >>>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
> >>>  
> >>> -/**
> >>> - * list_for_each_safe - iterate over a list safe against removal of list entry
> >>> - * @pos:	the &struct list_head to use as a loop cursor.
> >>> - * @n:		another &struct list_head to use as temporary storage
> >>> - * @head:	the head for your list.
> >>> +/*
> >>> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
> >>>   */
> >>>  #define list_for_each_safe(pos, n, head) \
> >>>  	for (pos = (head)->next, n = pos->next; \
> >>>  	     !list_is_head(pos, (head)); \
> >>>  	     pos = n, n = pos->next)
> >>>  
> >>> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
> >>> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\  
> >>
> >> Use auto
> >>  
> >>> +	     !list_is_head(pos, (head));				\
> >>> +	     pos = tmp, tmp = pos->next)
> >>> +
> >>> +#define __list_for_each_mutable1(pos, head)				\
> >>> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
> >>> +
> >>> +#define __list_for_each_mutable2(pos, next, head)			\
> >>> +	list_for_each_safe(pos, next, head)
> >>> +
> >>>  /**
> >>> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
> >>> + * list_for_each_mutable - iterate over a list safe against entry removal
> >>>   * @pos:	the &struct list_head to use as a loop cursor.
> >>> - * @n:		another &struct list_head to use as temporary storage
> >>> - * @head:	the head for your list.
> >>> + * @...:	either (head) or (next, head)
> >>> + *
> >>> + * next:	another &struct list_head to use as optional temporary storage.
> >>> + *		The temporary cursor is internal unless explicitly supplied by
> >>> + *		the caller.
> >>> + * head:	the head for your list.
> >>> + */
> >>> +#define list_for_each_mutable(pos, ...)					\
> >>> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
> >>> +		(pos, __VA_ARGS__)  
> >>
> >> The variable argument count logic really just slows down compilation.
> >> Maybe there aren't enough copies of this code to make that significant.
> >> But just because you can do it doesn't mean it is a good idea.
> >> I'm also not sure it really adds anything to the readability.
> >>
> >> And, if you are going to make the middle argument optional there is
> >> no need to change the macro name.  
> > 
> > Christian König and Jani Nikula also disagree with the variadic-argument
> > implementation approach. If we abandon that method, it means we will
> > inevitably need to add some new macros. If mutable is not a good name,
> > suggestions for better alternatives would be welcome; coming up with a
> > suitable name is indeed rather tricky.  
> 
> I don't think you need to add a new macro for the specific use case that people want to modify the next element of the iteration.
> 
> If I remember your numbers correctly that is really a corner case, and continuing to use the existing *_safe() macros for that sounds perfectly fine to me.

IIRC currently you have a choice of either:
	define               Item that can't be deleted
	list_for_each()	     The current item.
	list_for_each_safe() The next item.
There is also likely to be code that updates the variables to allow
for other scenarios.

Note that if you increase a reference count and release a lock then list_for_each()
is likely safer than list_for_each_safe() :-)

list.h has 9 variants of the 'safe' loop.
The bloat of another 9 is getting excessive.

It has to be said that this is one of my least favourite types of list...

	David

> 
> Regards,
> Christian.


^ permalink raw reply

* Re: [PATCH v8 08/46] KVM: Provide generic interface for checking memory private/shared status
From: Ackerley Tng @ 2026-06-24 14:18 UTC (permalink / raw)
  To: Suzuki K Poulose, Fuad Tabba
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, willy, wyihan,
	yan.y.zhao, forkloop, pratyush, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <3ec15992-2a29-434b-8c99-8b86bfcf007e@arm.com>

Suzuki K Poulose <suzuki.poulose@arm.com> writes:

>
> [...snip...]
>
>>>> @@ -2546,7 +2546,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
>>>>   bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
>>>>                                           struct kvm_gfn_range *range);
>>>>
>>>> -static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>>>> +static inline bool kvm_vm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>>
>> Should have read the Sashiko review first, but where is this used?
>> It's not used at all in this series...
>
> See below:
>
>>
>> /fuad
>>
>>>>   {
>>>>          return kvm_get_vm_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
>>>>   }
>>>> @@ -2557,6 +2557,16 @@ static inline bool kvm_mem_range_is_private(struct kvm *kvm, gfn_t start,
>>>>                                                    KVM_MEMORY_ATTRIBUTE_PRIVATE,
>>>>                                                    KVM_MEMORY_ATTRIBUTE_PRIVATE);
>>>>   }
>>>> +#endif  /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>>>> +
>>>> +#ifdef kvm_arch_has_private_mem
>>>> +typedef bool (kvm_mem_is_private_t)(struct kvm *kvm, gfn_t gfn);
>>>> +DECLARE_STATIC_CALL(__kvm_mem_is_private, kvm_mem_is_private_t);
>>>> +
>>>> +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>>>> +{
>>>> +       return static_call(__kvm_mem_is_private)(kvm, gfn);
>>>> +}
>>>>   #else
>>>>   static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>>>>   {
>>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>>>> index 6669f1477013c..8b238e461b854 100644
>>>> --- a/virt/kvm/kvm_main.c
>>>> +++ b/virt/kvm/kvm_main.c
>>>> @@ -2627,6 +2627,20 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>>>>   }
>>>>   #endif /* CONFIG_KVM_VM_MEMORY_ATTRIBUTES */
>>>>
>>>> +#ifdef kvm_arch_has_private_mem
>>>> +DEFINE_STATIC_CALL_RET0(__kvm_mem_is_private, kvm_mem_is_private_t);
>>>> +EXPORT_STATIC_CALL_GPL(__kvm_mem_is_private);
>>>> +
>>>> +static void kvm_init_memory_attributes(void)
>>>> +{
>>>> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>>>> +       static_call_update(__kvm_mem_is_private, kvm_vm_mem_is_private);
>>>> +#endif
>>>> +}
>
>
> Here ^^ as the static call update ?
>
>
> Suzuki

Thanks Suzuki, it is used here. kvm_mem_is_private() was and still is
the function used to check if some gfn is private or shared. Hence, in
this patch, the usages of kvm_mem_is_private() were not
updated. Instead, kvm_mem_is_private() is now set up as a static call,
and the static call is hard-wired to kvm_vm_mem_is_private() in this
patch.

In the later wiring patch, all the places where attributes are looked up
are updated all at once: if conversion is enabled, take the gmem route, else
take VM route.

kvm_mem_is_private() is special in that the if-else is done at KVM load
time rather than runtime, and I believe that's for performance reasons
since this is checked quite often from the KVM fault handling code.

But I think perhaps Fuad was referring to kvm_mem_range_is_private(),
which is indeed not used anywhere. Binbin also asked about this, I think
we should drop kvm_mem_range_is_private(). My reply to Binbin is at [1].

[1] https://lore.kernel.org/all/CAEvNRgGbBcrX5Fw3vNTsTOBNC=Ypi=9-S07674yPxLU9i4akjA@mail.gmail.com/

^ permalink raw reply

* Re: [PATCHv4 01/13] uprobes/x86: Use proper mm_struct in __in_uprobe_trampoline
From: Oleg Nesterov @ 2026-06-24 14:08 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Peter Zijlstra, Ingo Molnar, Masami Hiramatsu, Andrii Nakryiko,
	bpf, linux-trace-kernel
In-Reply-To: <20260526205840.173790-2-jolsa@kernel.org>

On 05/26, Jiri Olsa wrote:
>
> In the unregister path we use __in_uprobe_trampoline check with
> current->mm for the VMA lookup, which is wrong, because we are
> in the tracer context, not the traced process.
>
> Add an mm_struct pointer argument to __in_uprobe_trampoline and
> change the related callers to pass the proper mm_struct pointer.
>
> Fixes: ba2bfc97b462 ("uprobes/x86: Add support to optimize uprobes")
> Acked-by: Andrii Nakryiko <andrii@kernel.org>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>

Reviewed-by: Oleg Nesterov <oleg@redhat.com>


^ permalink raw reply

* Re: [PATCH v8 07/46] KVM: Rename memory attribute APIs to prepare for in-place gmem conversion
From: Ackerley Tng @ 2026-06-24 13:44 UTC (permalink / raw)
  To: Binbin Wu
  Cc: aik, andrew.jones, brauner, chao.p.peng, david, jmattson,
	jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
	Baoquan He, Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <96fb369d-dbff-4ed6-b1f9-0ce63d7d4ed0@linux.intel.com>

Binbin Wu <binbin.wu@linux.intel.com> writes:

>
> [...snip...]
>
>> +static inline bool kvm_mem_range_is_private(struct kvm *kvm, gfn_t start,
>> +					    gfn_t end)
>> +{
>> +	return kvm_range_has_vm_memory_attributes(kvm, start, end,
>> +						  KVM_MEMORY_ATTRIBUTE_PRIVATE,
>> +						  KVM_MEMORY_ATTRIBUTE_PRIVATE);
>>  }
>
> This function is added, but never used in this patch series.
> Is it intended to be called only when CONFIG_KVM_VM_MEMORY_ATTRIBUTES is
> enabled?
>

Thank you for catching this! I think in some earlier revision this was
meant to be used from the guest_memfd populate flow.

I think the version of kvm_gmem_range_is_private in this revision is
good because it is symmetric. If conversion is enabled, call the gmem
range-has-attributes function, and if conversion is disabled, use the VM
range-has-attributes function.

Sean, if no new revision is needed would you be able to drop
kvm_mem_range_is_private() while you're pulling it in?

>>
>> [...snip...]
>>

^ permalink raw reply

* [PATCH v1 11/11] rcuscale: Add concurrent expedited GP threads for callback scaling tests
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Add nexp and exp_interval parameters to rcuscale that spawn kthreads
running synchronize_rcu_expedited() in a loop. This generates concurrent
expedited GP load while the normal writers measure GP or callback
latency.

When combined with gp_async=1 (which uses call_rcu() for writers), this
tests how effectively callbacks benefit from expedited grace periods.
With RCU callback expedited GP tracking, the async callbacks should
complete faster because they piggyback on the expedited GPs rather than
waiting for normal GPs.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcuscale.c | 84 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 82 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index ac0b1c6b7dae2..1097ec15879cb 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -91,6 +91,8 @@ torture_param(int, shutdown_secs, !IS_MODULE(CONFIG_RCU_SCALE_TEST) * 300,
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
 torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
 torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
+torture_param(int, nexp, 0, "Number of expedited GP threads to run concurrently");
+torture_param(int, exp_interval, 0, "Interval (us) between expedited GPs, zero to disable");
 torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
 torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
 torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
@@ -115,8 +117,10 @@ struct writer_freelist {
 
 static int nrealreaders;
 static int nrealwriters;
+static int nrealexp;
 static struct task_struct **writer_tasks;
 static struct task_struct **reader_tasks;
+static struct task_struct **exp_tasks;
 
 static u64 **writer_durations;
 static bool *writer_done;
@@ -462,6 +466,34 @@ rcu_scale_reader(void *arg)
 	return 0;
 }
 
+/*
+ * RCU expedited GP kthread.  Repeatedly invokes expedited grace periods
+ * to generate concurrent expedited GP load while the normal-GP writers
+ * are being measured.  This allows measuring the benefit of callbacks
+ * that can piggyback on expedited grace periods.
+ */
+static int
+rcu_scale_exp(void *arg)
+{
+	long me = (long)arg;
+
+	VERBOSE_SCALEOUT_STRING("rcu_scale_exp task started");
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	set_user_nice(current, MIN_NICE);
+
+	if (holdoff)
+		schedule_timeout_idle(holdoff * HZ);
+
+	do {
+		if (exp_interval)
+			udelay(exp_interval);
+		cur_ops->exp_sync();
+		rcu_scale_wait_shutdown();
+	} while (!torture_must_stop());
+	torture_kthread_stopping("rcu_scale_exp");
+	return 0;
+}
+
 /*
  * Allocate a writer_mblock structure for the specified rcu_scale_writer
  * task.
@@ -664,8 +696,10 @@ static void
 rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" SCALE_FLAG
-		 "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown_secs=%d\n",
-		 scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown_secs);
+		 "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d nexp=%d exp_interval=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown_secs=%d\n",
+		 scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff,
+		 minruntime, nrealreaders, nrealwriters, nrealexp, exp_interval,
+		 writer_holdoff, writer_holdoff_jiffies, verbose, shutdown_secs);
 }
 
 /*
@@ -809,6 +843,13 @@ kfree_scale_cleanup(void)
 	if (torture_cleanup_begin())
 		return;
 
+	if (exp_tasks) {
+		for (i = 0; i < nrealexp; i++)
+			torture_stop_kthread(rcu_scale_exp, exp_tasks[i]);
+		kfree(exp_tasks);
+		exp_tasks = NULL;
+	}
+
 	if (kfree_reader_tasks) {
 		for (i = 0; i < kfree_nrealthreads; i++)
 			torture_stop_kthread(kfree_scale_thread,
@@ -903,6 +944,22 @@ kfree_scale_init(void)
 			goto unwind;
 	}
 
+	if (nrealexp > 0 && cur_ops->exp_sync) {
+		exp_tasks = kzalloc_objs(exp_tasks[0], nrealexp);
+		if (!exp_tasks) {
+			SCALEOUT_ERRSTRING("out of memory");
+			firsterr = -ENOMEM;
+			goto unwind;
+		}
+		for (i = 0; i < nrealexp; i++) {
+			firsterr = torture_create_kthread(rcu_scale_exp,
+							  (void *)i,
+							  exp_tasks[i]);
+			if (torture_init_error(firsterr))
+				goto unwind;
+		}
+	}
+
 	while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads)
 		schedule_timeout_uninterruptible(1);
 
@@ -959,6 +1016,13 @@ rcu_scale_cleanup(void)
 		return;
 	}
 
+	if (exp_tasks) {
+		for (i = 0; i < nrealexp; i++)
+			torture_stop_kthread(rcu_scale_exp, exp_tasks[i]);
+		kfree(exp_tasks);
+		exp_tasks = NULL;
+	}
+
 	if (reader_tasks) {
 		for (i = 0; i < nrealreaders; i++)
 			torture_stop_kthread(rcu_scale_reader,
@@ -1076,6 +1140,7 @@ rcu_scale_init(void)
 		if (kthread_tp)
 			kthread_stime = kthread_tp->stime;
 	}
+	nrealexp = nexp;
 	if (kfree_rcu_test)
 		return kfree_scale_init();
 
@@ -1107,6 +1172,21 @@ rcu_scale_init(void)
 	}
 	while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders)
 		schedule_timeout_uninterruptible(1);
+	if (nrealexp > 0 && cur_ops->exp_sync) {
+		exp_tasks = kzalloc_objs(exp_tasks[0], nrealexp);
+		if (!exp_tasks) {
+			SCALEOUT_ERRSTRING("out of memory");
+			firsterr = -ENOMEM;
+			goto unwind;
+		}
+		for (i = 0; i < nrealexp; i++) {
+			firsterr = torture_create_kthread(rcu_scale_exp,
+							  (void *)i,
+							  exp_tasks[i]);
+			if (torture_init_error(firsterr))
+				goto unwind;
+		}
+	}
 	writer_tasks = kzalloc_objs(writer_tasks[0], nrealwriters);
 	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), GFP_KERNEL);
 	writer_n_durations = kzalloc_objs(*writer_n_durations, nrealwriters);
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 10/11] rcu: Advance callbacks for expedited GP completion in rcu_core()
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Even when rcu_pending() triggers rcu_core(), the normal callback
advancement path through note_gp_changes() -> __note_gp_changes() bails
out when rdp->gp_seq == rnp->gp_seq (no normal GP change). Since
expedited GPs do not update rnp->gp_seq, rcu_advance_cbs() is never
called and callbacks remain stuck in RCU_WAIT_TAIL.

Add a direct callback advancement block in rcu_core() that checks for GP
completion via rcu_segcblist_nextgp() combined with
poll_state_synchronize_rcu_full(). When detected, trylock rnp and call
rcu_advance_cbs() to move completed callbacks to RCU_DONE_TAIL. Wake the
GP kthread if rcu_advance_cbs() requests a new grace period.

Uses trylock to avoid adding contention on rnp->lock. If the lock is
contended, callbacks will be advanced on the next tick.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/tree.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b01d7bf6b57b1..f42e01ef479c4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2891,6 +2891,23 @@ static __latent_entropy void rcu_core(void)
 	/* Update RCU state based on any recent quiescent states. */
 	rcu_check_quiescent_state(rdp);
 
+	/* Advance callbacks if an expedited GP has completed. */
+	if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_is_enabled(&rdp->cblist)) {
+		struct rcu_gp_seq gp_state;
+
+		if (rcu_segcblist_nextgp(&rdp->cblist, &gp_state) &&
+		    poll_state_synchronize_rcu_full(&gp_state)) {
+			guard(irqsave)();
+			if (raw_spin_trylock_rcu_node(rnp)) {
+				bool needwake = rcu_advance_cbs(rnp, rdp);
+
+				raw_spin_unlock_rcu_node(rnp);
+				if (needwake)
+					rcu_gp_kthread_wake();
+			}
+		}
+	}
+
 	/* No grace period and unregistered callbacks? */
 	if (!rcu_gp_in_progress() &&
 	    rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) {
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 09/11] rcu: Detect expedited grace period completion in rcu_pending()
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

rcu_pending() decides whether rcu_core() should run on the current CPU's
timer tick.  It does not account for expedited grace periods: after an
expedited GP completes, a non-offloaded CPU's callbacks remain in
RCU_WAIT_TAIL (not yet advanced to RCU_DONE_TAIL) and rcu_core() is
never invoked to advance them.

Detect that case via rcu_segcblist_nextgp() combined with a new
memory-ordering-free poll variant,
poll_state_synchronize_rcu_full_unordered().  This keeps rcu_pending()
cheap: it runs on every tick that has pending callbacks, so it must
not pay for the two memory barriers in
poll_state_synchronize_rcu_full().  The check is only a hint to run
rcu_core(); the ordered re-check and the actual callback advancement
happen there.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/tree.c | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 169d98ed52bbb..b01d7bf6b57b1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3598,6 +3598,24 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 
+/*
+ * Racy, memory-ordering-free test of whether the normal or expedited grace
+ * period recorded in *gsp has completed.  Callers that need the full
+ * memory-ordering guarantees must use poll_state_synchronize_rcu_full();
+ * this variant is only a hint (e.g. for rcu_pending()) and leaves any
+ * required ordering to a subsequent ordered check.
+ */
+static bool poll_state_synchronize_rcu_full_unordered(struct rcu_gp_seq *gsp)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	return gsp->norm == RCU_GET_STATE_COMPLETED ||
+	       rcu_seq_done_exact(&rnp->gp_seq, gsp->norm) ||
+	       gsp->exp == RCU_GET_STATE_COMPLETED ||
+	       (gsp->exp != RCU_GET_STATE_NOT_TRACKED &&
+		rcu_seq_done_exact(&rcu_state.expedited_sequence, gsp->exp));
+}
+
 /**
  * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
  * @gsp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
@@ -3633,14 +3651,8 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
  */
 bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	struct rcu_node *rnp = rcu_get_root();
-
 	smp_mb(); // Order against root rcu_node structure grace-period cleanup.
-	if (gsp->norm == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rnp->gp_seq, gsp->norm) ||
-	    gsp->exp == RCU_GET_STATE_COMPLETED ||
-	    (gsp->exp != RCU_GET_STATE_NOT_TRACKED &&
-	     rcu_seq_done_exact(&rcu_state.expedited_sequence, gsp->exp))) {
+	if (poll_state_synchronize_rcu_full_unordered(gsp)) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
@@ -3710,6 +3722,7 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
 static int rcu_pending(int user)
 {
 	bool gp_in_progress;
+	struct rcu_gp_seq gp_state;
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp = rdp->mynode;
 
@@ -3740,6 +3753,17 @@ static int rcu_pending(int user)
 	    rcu_segcblist_ready_cbs(&rdp->cblist))
 		return 1;
 
+	/*
+	 * Has a GP (normal or expedited) completed for pending callbacks?
+	 * This is only a racy hint to decide whether to run rcu_core(); the
+	 * ordered re-check and callback advancement happen there, so the
+	 * unordered test avoids paying for memory barriers on every tick.
+	 */
+	if (!rcu_rdp_is_offloaded(rdp) &&
+	    rcu_segcblist_nextgp(&rdp->cblist, &gp_state) &&
+	    poll_state_synchronize_rcu_full_unordered(&gp_state))
+		return 1;
+
 	/* Has RCU gone idle with this CPU needing another grace period? */
 	if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
 	    !rcu_rdp_is_offloaded(rdp) &&
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 08/11] rcu: Wake NOCB rcuog kthreads on expedited grace period completion
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

When an expedited grace period completes, rcu_exp_wait_wake() wakes
waiters on rnp->exp_wq[] but does not notify the NOCB rcuog kthreads.  An
rcuog kthread that is waiting for a grace period sleeps on the leaf
rcu_node's ->nocb_gp_wq[] with a wait condition based on the grace-period
state, so without a wakeup, callbacks on offloaded CPUs that could
benefit from the expedited GP wait until the rcuog kthread wakes for some
other reason (e.g. the next normal GP or a timer).

Make the rcuog grace-period wait honour expedited GPs and wake it when
one completes:

 - nocb_gp_wait() now records the grace period to wait for as a struct
   rcu_gp_seq (both normal and expedited), tracks the earliest pending
   normal and expedited sequence across the group, and releases the wait
   via poll_state_synchronize_rcu_full() so it wakes for whichever
   completes first.  ->nocb_gp_seq is widened to struct rcu_gp_seq
   accordingly.

 - rcu_exp_wait_wake() calls the new rcu_nocb_exp_cleanup() on leaf
   nodes, which wakes both ->nocb_gp_wq[0] and ->nocb_gp_wq[1] (the
   expedited sequence does not share parity with the normal ->gp_seq the
   waiter indexed with).  Both this path and rcu_nocb_gp_cleanup() use
   the shared rcu_nocb_cleanup_wake() helper, which checks swait_active()
   first; the smp_mb() in rcu_gp_cleanup()/rcu_exp_wait_wake() orders the
   grace-period state update before that check.

A stub rcu_nocb_exp_cleanup() is provided for CONFIG_RCU_NOCB_CPU=n.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/tree.c      | 11 ++++-
 kernel/rcu/tree.h      |  3 +-
 kernel/rcu/tree_exp.h  |  2 +
 kernel/rcu/tree_nocb.h | 95 +++++++++++++++++++++++++++++++++++-------
 4 files changed, 94 insertions(+), 17 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d7e47dfcf702e..169d98ed52bbb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2224,8 +2224,15 @@ static noinline void rcu_gp_cleanup(void)
 			dump_blkd_tasks(rnp, 10);
 		WARN_ON_ONCE(rnp->qsmask);
 		WRITE_ONCE(rnp->gp_seq, new_gp_seq);
-		if (!rnp->parent)
-			smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
+		if (!rnp->parent) {
+			/*
+			 * Order against failing poll_state_synchronize_rcu_full(),
+			 * and also against rcu_nocb_gp_cleanup() -> swait_active(),
+			 * which relies on this barrier to observe a waiter that
+			 * enqueued before re-checking the grace-period state.
+			 */
+			smp_mb();
+		}
 		rdp = this_cpu_ptr(&rcu_data);
 		if (rnp == rdp->mynode)
 			needgp = __note_gp_changes(rnp, rdp) || needgp;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 36330739d937c..79d3a656e5f73 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -268,7 +268,7 @@ struct rcu_data {
 	u8 nocb_gp_sleep;		/* Is the nocb GP thread asleep? */
 	u8 nocb_gp_bypass;		/* Found a bypass on last scan? */
 	u8 nocb_gp_gp;			/* GP to wait for on last scan? */
-	unsigned long nocb_gp_seq;	/*  If so, ->gp_seq to wait for. */
+	struct rcu_gp_seq nocb_gp_seq; /* If so, GP state to wait for. */
 	unsigned long nocb_gp_loops;	/* # passes through wait code. */
 	struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
 	bool nocb_cb_sleep;		/* Is the nocb CB thread asleep? */
@@ -511,6 +511,7 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
 static void zero_cpu_stall_ticks(struct rcu_data *rdp);
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
+static void rcu_nocb_exp_cleanup(struct rcu_node *rnp);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
 static bool wake_nocb_gp(struct rcu_data *rdp);
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 0569d8e40e86d..5c35e28708640 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -708,6 +708,8 @@ static void rcu_exp_wait_wake(unsigned long s)
 		}
 		smp_mb(); /* All above changes before wakeup. */
 		wake_up_all(&rnp->exp_wq[rcu_seq_ctr(s) & 0x3]);
+		if (rcu_is_leaf_node(rnp))
+			rcu_nocb_exp_cleanup(rnp);
 	}
 	trace_rcu_exp_grace_period(rcu_state.name, s, TPS("endwake"));
 	mutex_unlock(&rcu_state.exp_wake_mutex);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 263bb8a65a988..6da1b8f524768 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -170,13 +170,35 @@ static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
 		lockdep_assert_held(&rdp->nocb_lock);
 }
 
+static void rcu_nocb_cleanup_wake(struct swait_queue_head *sq)
+{
+	if (swait_active(sq))
+		swake_up_all(sq);
+}
+
 /*
  * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
  * grace period.
  */
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
 {
-	swake_up_all(sq);
+	/*
+	 * swait_active() can be checked first because of the following
+	 * ordering, which pairs the smp_mb() in rcu_gp_cleanup() against
+	 * the implicit barrier in prepare_to_swait()/set_current_state()
+	 * on the nocb_gp_wait() side:
+	 *
+	 * rcu_gp_cleanup()                          nocb_gp_wait()
+	 * ---------------                           --------------
+	 * WRITE_ONCE(root->gp_seq, new_gp_seq);     swait_event_interruptible_exclusive(sq)
+	 * smp_mb()                                     prepare_to_swait()
+	 * if swait_active(sq)                             list_add_tail(...)
+	 *    swake_up_all(sq)                            set_current_state()
+	 *                                                  smp_mb()
+	 *                                             if (poll_state_synchronize_rcu_full())
+	 *                                                ...
+	 */
+	rcu_nocb_cleanup_wake(sq);
 }
 
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
@@ -190,6 +212,38 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 	init_swait_queue_head(&rnp->nocb_gp_wq[1]);
 }
 
+/*
+ * Wake NOCB rcuog kthreads on a leaf node so that they can advance
+ * callbacks that were waiting for the just-completed expedited GP.
+ *
+ * The rcuog kthread waiting for a grace period sleeps on the per-leaf-node
+ * ->nocb_gp_wq[] (not on its rdp_gp's ->nocb_gp_wq, which only signals that
+ * new callbacks have shown up), so this is the queue that must be woken.
+ * Both the even and odd waitqueues are woken because the expedited sequence
+ * does not share parity with the normal ->gp_seq the waiter indexed with.
+ */
+static void rcu_nocb_exp_cleanup(struct rcu_node *rnp)
+{
+	/*
+	 * swait_active() can be checked first because of the following
+	 * ordering, which pairs the smp_mb() in rcu_exp_wait_wake() against
+	 * the implicit barrier in prepare_to_swait()/set_current_state()
+	 * on the nocb_gp_wait() side:
+	 *
+	 * rcu_exp_wait_wake()                          nocb_gp_wait()
+	 * ---------------                              --------------
+	 * rcu_seq_end(&rcu_state.expedited_sequence);  swait_event_interruptible_exclusive(sq)
+	 * smp_mb()                                         prepare_to_swait()
+	 * if swait_active(sq)                                 list_add_tail(...)
+	 *    swake_up_all(sq)                                set_current_state()
+	 *                                                      smp_mb()
+	 *                                                 if (poll_state_synchronize_rcu_full())
+	 *                                                    ...
+	 */
+	rcu_nocb_cleanup_wake(&rnp->nocb_gp_wq[0]);
+	rcu_nocb_cleanup_wake(&rnp->nocb_gp_wq[1]);
+}
+
 /* Clear any pending deferred wakeup timer (nocb_gp_lock must be held). */
 static void nocb_defer_wakeup_cancel(struct rcu_data *rdp_gp)
 {
@@ -659,7 +713,6 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 {
 	bool bypass = false;
 	int __maybe_unused cpu = my_rdp->cpu;
-	struct rcu_gp_seq cur_gp_seq;
 	unsigned long flags;
 	bool gotcbs = false;
 	unsigned long j = jiffies;
@@ -669,7 +722,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 	bool needwake_gp;
 	struct rcu_data *rdp, *rdp_toggling = NULL;
 	struct rcu_node *rnp;
-	unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+	struct rcu_gp_seq wait_gp_seq = {0}; // Suppress "use uninitialized" warning.
 	bool wasempty = false;
 
 	/*
@@ -693,6 +746,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 	 * won't be ignored for long.
 	 */
 	list_for_each_entry(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp) {
+		struct rcu_gp_seq cur_gp_seq;
 		long bypass_ncbs;
 		bool flush_bypass = false;
 		long lazy_ncbs;
@@ -754,9 +808,15 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		 */
 		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
 		    !poll_state_synchronize_rcu_full(&cur_gp_seq)) {
-			if (!needwait_gp ||
-			    ULONG_CMP_LT(cur_gp_seq.norm, wait_gp_seq))
-				wait_gp_seq = cur_gp_seq.norm;
+			/*
+			 * Track the earliest pending normal and expedited GP
+			 * across the group so the wait below can be released by
+			 * whichever completes first.
+			 */
+			if (!needwait_gp || ULONG_CMP_LT(cur_gp_seq.norm, wait_gp_seq.norm))
+				wait_gp_seq.norm = cur_gp_seq.norm;
+			if (!needwait_gp || ULONG_CMP_LT(cur_gp_seq.exp, wait_gp_seq.exp))
+				wait_gp_seq.exp = cur_gp_seq.exp;
 			needwait_gp = true;
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("NeedWaitGP"));
@@ -778,7 +838,8 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 
 	my_rdp->nocb_gp_bypass = bypass;
 	my_rdp->nocb_gp_gp = needwait_gp;
-	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+	if (needwait_gp)
+		my_rdp->nocb_gp_seq = wait_gp_seq;
 
 	// At least one child with non-empty ->nocb_bypass, so set
 	// timer in order to avoid stranding its callbacks.
@@ -813,12 +874,12 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		nocb_gp_sleep(my_rdp, cpu);
 	} else {
 		rnp = my_rdp->mynode;
-		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq.norm, TPS("StartWait"));
 		swait_event_interruptible_exclusive(
-			rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
-			rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+			rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq.norm) & 0x1],
+			poll_state_synchronize_rcu_full(&wait_gp_seq) ||
 			!READ_ONCE(my_rdp->nocb_gp_sleep));
-		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq.norm, TPS("EndWait"));
 	}
 
 	if (!rcu_nocb_poll) {
@@ -852,7 +913,8 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		swake_up_one(&rdp_toggling->nocb_state_wq);
 	}
 
-	my_rdp->nocb_gp_seq = -1;
+	my_rdp->nocb_gp_seq.norm = -1;
+	my_rdp->nocb_gp_seq.exp = -1;
 	WARN_ON(signal_pending(current));
 }
 
@@ -1536,7 +1598,7 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
 {
 	struct rcu_node *rnp = rdp->mynode;
 
-	pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+	pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld/%ld rnp %d:%d %lu %c CPU %d%s\n",
 		rdp->cpu,
 		"kK"[!!rdp->nocb_gp_kthread],
 		"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
@@ -1548,7 +1610,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
 		".W"[swait_active(&rnp->nocb_gp_wq[1])],
 		".B"[!!rdp->nocb_gp_bypass],
 		".G"[!!rdp->nocb_gp_gp],
-		(long)rdp->nocb_gp_seq,
+		(long)rdp->nocb_gp_seq.norm,
+		(long)rdp->nocb_gp_seq.exp,
 		rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
 		rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
 		rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
@@ -1668,6 +1731,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 {
 }
 
+static void rcu_nocb_exp_cleanup(struct rcu_node *rnp)
+{
+}
+
 static bool wake_nocb_gp(struct rcu_data *rdp)
 {
 	return false;
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 07/11] rcu: Update comments for gp_seq and expedited GP tracking
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Update documentation comments throughout the RCU callback infrastructure
to reflect the transition from a single grace-period sequence number to
the full struct rcu_gp_seq that tracks both normal and expedited grace
periods.

The ->gp_seq[] array documentation in rcu_segcblist.h is updated to
describe dual (normal and expedited) GP tracking.  The
rcu_segcblist_advance(), rcu_segcblist_accelerate(), and
rcu_advance_cbs() comments are updated to refer to the struct rcu_gp_seq
state (gsp) instead of the old bare grace-period sequence number (seq).

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 include/linux/rcu_segcblist.h | 14 +++++++-----
 kernel/rcu/rcu_segcblist.c    | 43 +++++++++++++++++++++++------------
 kernel/rcu/tree.c             |  6 ++---
 3 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 137cc23b024c5..08b63ecf719b2 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -50,12 +50,14 @@ struct rcu_cblist {
  * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also
  * empty.
  *
- * The ->gp_seq[] array contains the grace-period number at which the
- * corresponding segment of callbacks will be ready to invoke.  A given
- * element of this array is meaningful only when the corresponding segment
- * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks
- * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have
- * not yet been assigned a grace-period number).
+ * The ->gp_seq[] array contains the grace-period state at which the
+ * corresponding segment of callbacks will be ready to invoke.  This tracks
+ * both normal and expedited grace periods, allowing callbacks to complete
+ * when either type of GP finishes.  A given element of this array is
+ * meaningful only when the corresponding segment is non-empty, and it is
+ * never valid for RCU_DONE_TAIL (whose callbacks are already ready to
+ * invoke) or for RCU_NEXT_TAIL (whose callbacks have not yet been assigned
+ * a grace-period state).
  */
 #define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
 #define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index cf8951d33e767..dd770006e7f8b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -495,7 +495,8 @@ static void rcu_segcblist_advance_compact(struct rcu_segcblist *rsclp, int i)
 
 /*
  * Advance the callbacks in the specified rcu_segcblist structure based
- * on the current value of the grace-period counter.
+ * on the current grace-period state.  Checks both normal and expedited
+ * grace periods, advancing callbacks when either GP type completes.
  */
 void rcu_segcblist_advance(struct rcu_segcblist *rsclp)
 {
@@ -506,8 +507,10 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp)
 		return;
 
 	/*
-	 * Find all callbacks whose ->gp_seq numbers indicate that they
-	 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
+	 * Find all callbacks whose grace periods have completed (either
+	 * normal or expedited) and put them into the RCU_DONE_TAIL segment.
+	 * This is checked via poll_state_synchronize_rcu_full(), which
+	 * supplies the needed memory barrier and handles special values.
 	 */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
 		if (!poll_state_synchronize_rcu_full(&rsclp->gp_seq[i]))
@@ -534,9 +537,9 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp)
  * them to complete at the end of the earlier grace period.
  *
  * This function operates on an rcu_segcblist structure, and also the
- * grace-period sequence number seq at which new callbacks would become
+ * grace-period state gsp at which new callbacks would become
  * ready to invoke.  Returns true if there are callbacks that won't be
- * ready to invoke until seq, false otherwise.
+ * ready to invoke until the grace period represented by gsp, false otherwise.
  */
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 {
@@ -548,11 +551,11 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 
 	/*
 	 * Find the segment preceding the oldest segment of callbacks
-	 * whose ->gp_seq[] completion is at or after that passed in via
-	 * "seq", skipping any empty segments.  This oldest segment, along
+	 * whose grace period completion is at or after that passed in via
+	 * "gsp", skipping any empty segments.  This oldest segment, along
 	 * with any later segments, can be merged in with any newly arrived
-	 * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
-	 * as their ->gp_seq[] grace-period completion sequence number.
+	 * callbacks in the RCU_NEXT_TAIL segment, and assigned "gsp"
+	 * as their grace-period completion state.
 	 */
 	for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
 		if (!rcu_segcblist_segempty(rsclp, i) &&
@@ -561,7 +564,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 
 	/*
 	 * If all the segments contain callbacks that correspond to
-	 * earlier grace-period sequence numbers than "seq", leave.
+	 * earlier grace-period sequence numbers than "gsp", leave.
 	 * Assuming that the rcu_segcblist structure has enough
 	 * segments in its arrays, this can only happen if some of
 	 * the non-done segments contain callbacks that really are
@@ -569,15 +572,15 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 	 * out by the next call to rcu_segcblist_advance().
 	 *
 	 * Also advance to the oldest segment of callbacks whose
-	 * ->gp_seq[] completion is at or after that passed in via "seq",
+	 * ->gp_seq[] completion is at or after that passed in via "gsp",
 	 * skipping any empty segments.
 	 *
 	 * Note that segment "i" (and any lower-numbered segments
 	 * containing older callbacks) will be unaffected, and their
-	 * grace-period numbers remain unchanged.  For example, if i ==
+	 * grace-period states remain unchanged.  For example, if i ==
 	 * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched.
 	 * Instead, the CBs in NEXT_TAIL will be merged with those in
-	 * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL
+	 * NEXT_READY_TAIL and the grace-period state of NEXT_READY_TAIL
 	 * would be updated.  NEXT_TAIL would then be empty.
 	 */
 	if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL)
@@ -589,8 +592,8 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 
 	/*
 	 * Merge all later callbacks, including newly arrived callbacks,
-	 * into the segment located by the for-loop above.  Assign "seq"
-	 * as the ->gp_seq[] value in order to correctly handle the case
+	 * into the segment located by the for-loop above.  Assign "gsp"
+	 * as the grace-period state in order to correctly handle the case
 	 * where there were no pending callbacks in the rcu_segcblist
 	 * structure other than in the RCU_NEXT_TAIL segment.
 	 */
@@ -644,6 +647,10 @@ void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 	if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
 		return;
 
+	/*
+	 * Find all callbacks whose normal GP sequence numbers indicate
+	 * that they are ready to invoke.  For SRCU, we only check norm.
+	 */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
 		if (ULONG_CMP_LT(seq, rsclp->gp_seq[i].norm))
 			break;
@@ -658,6 +665,12 @@ void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 	rcu_segcblist_advance_compact(rsclp, i);
 }
 
+/*
+ * SRCU wrapper for rcu_segcblist_accelerate() - converts SRCU's unsigned
+ * long GP sequence to rcu_gp_seq format with exp set to
+ * RCU_GET_STATE_NOT_TRACKED (since SRCU does not track expedited GPs)
+ * and calls the core rcu_segcblist_accelerate().
+ */
 bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 {
 	struct rcu_gp_seq gs = { .norm = seq, .exp = RCU_GET_STATE_NOT_TRACKED };
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91c03887a1228..d7e47dfcf702e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1209,7 +1209,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
 /*
  * Move any callbacks whose grace period has completed to the
  * RCU_DONE_TAIL sublist, then compact the remaining sublists and
- * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
+ * assign ->gp_seq[] state to any callbacks in the RCU_NEXT_TAIL
  * sublist.  This function is idempotent, so it does not hurt to
  * invoke it repeatedly.  As long as it is not invoked -too- often...
  * Returns true if the RCU grace-period kthread needs to be awakened.
@@ -1226,8 +1226,8 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 		return false;
 
 	/*
-	 * Find all callbacks whose ->gp_seq numbers indicate that they
-	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+	 * Find all callbacks whose grace periods have completed (either
+	 * normal or expedited) and put them into the RCU_DONE_TAIL sublist.
 	 */
 	rcu_segcblist_advance(&rdp->cblist);
 
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 06/11] rcu: Enable RCU callbacks to benefit from expedited grace periods
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Currently, RCU callbacks only track normal grace-period sequence
numbers.  This means callbacks must wait for normal grace periods to
complete even when expedited grace periods have already elapsed.

Use the full struct rcu_gp_seq (which tracks both the normal and
expedited grace-period sequences) throughout the callback
infrastructure.

rcu_segcblist_advance() now checks both normal and expedited GP
completion via poll_state_synchronize_rcu_full(), and becomes
parameterless since it reads the grace-period state internally.
rcu_segcblist_accelerate() stores the full state (both sequences)
instead of just the normal one.  rcu_accelerate_cbs() and
rcu_accelerate_cbs_unlocked() use get_state_synchronize_rcu_full() to
capture both sequences, and the NOCB advance checks use
poll_state_synchronize_rcu_full() instead of comparing only the normal
sequence.

srcu_segcblist_advance() becomes a standalone implementation because it
compares SRCU sequences directly and cannot use
poll_state_synchronize_rcu_full(), which reads RCU-specific globals.
srcu_segcblist_accelerate() sets the ->exp field to
RCU_GET_STATE_NOT_TRACKED so that poll_state_synchronize_rcu_full()
compares only ->norm and ignores ->exp.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcu_segcblist.c | 30 +++++++++++++++++++++++-------
 kernel/rcu/rcu_segcblist.h |  2 +-
 kernel/rcu/tree.c          |  9 +++------
 kernel/rcu/tree_nocb.h     | 33 +++++++++++++++++++++++----------
 4 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 4e3dfe42bc097..cf8951d33e767 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -12,6 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 
+#include "rcu.h"
 #include "rcu_segcblist.h"
 
 /* Initialize simple callback list. */
@@ -494,9 +495,9 @@ static void rcu_segcblist_advance_compact(struct rcu_segcblist *rsclp, int i)
 
 /*
  * Advance the callbacks in the specified rcu_segcblist structure based
- * on the current value passed in for the grace-period counter.
+ * on the current value of the grace-period counter.
  */
-void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp)
 {
 	int i;
 
@@ -509,7 +510,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 	 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
 	 */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
-		if (ULONG_CMP_LT(gsp->norm, rsclp->gp_seq[i].norm))
+		if (!poll_state_synchronize_rcu_full(&rsclp->gp_seq[i]))
 			break;
 		WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
 		rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
@@ -595,7 +596,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gs
 	 */
 	for (; i < RCU_NEXT_TAIL; i++) {
 		WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
-		rsclp->gp_seq[i].norm = gsp->norm;
+		rsclp->gp_seq[i] = *gsp;
 	}
 	return true;
 }
@@ -637,14 +638,29 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 
 void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	struct rcu_gp_seq gs = { .norm = seq };
+	int i;
+
+	WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
+	if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
+		return;
+
+	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+		if (ULONG_CMP_LT(seq, rsclp->gp_seq[i].norm))
+			break;
+		WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
+		rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
+	}
+
+	/* If no callbacks moved, nothing more need be done. */
+	if (i == RCU_WAIT_TAIL)
+		return;
 
-	rcu_segcblist_advance(rsclp, &gs);
+	rcu_segcblist_advance_compact(rsclp, i);
 }
 
 bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	struct rcu_gp_seq gs = { .norm = seq };
+	struct rcu_gp_seq gs = { .norm = seq, .exp = RCU_GET_STATE_NOT_TRACKED };
 
 	return rcu_segcblist_accelerate(rsclp, &gs);
 }
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 16b0cb6b32507..431c4466b8898 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -139,7 +139,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
 void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
-void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 095a023b19f1f..91c03887a1228 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1164,7 +1164,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * accelerating callback invocation to an earlier grace-period
 	 * number.
 	 */
-	gs.norm = rcu_seq_snap(&rcu_state.gp_seq);
+	get_state_synchronize_rcu_full(&gs);
 	if (rcu_segcblist_accelerate(&rdp->cblist, &gs))
 		ret = rcu_start_this_gp(rnp, rdp, gs.norm);
 
@@ -1193,7 +1193,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
 	bool needwake;
 
 	rcu_lockdep_assert_cblist_protected(rdp);
-	gs.norm = rcu_seq_snap(&rcu_state.gp_seq);
+	get_state_synchronize_rcu_full(&gs);
 	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, gs.norm)) {
 		/* Old request still live, so mark recent callbacks. */
 		(void)rcu_segcblist_accelerate(&rdp->cblist, &gs);
@@ -1218,8 +1218,6 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
  */
 static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 {
-	struct rcu_gp_seq gs;
-
 	rcu_lockdep_assert_cblist_protected(rdp);
 	raw_lockdep_assert_held_rcu_node(rnp);
 
@@ -1231,8 +1229,7 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * Find all callbacks whose ->gp_seq numbers indicate that they
 	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
 	 */
-	gs.norm = rnp->gp_seq;
-	rcu_segcblist_advance(&rdp->cblist, &gs);
+	rcu_segcblist_advance(&rdp->cblist);
 
 	/* Classify any remaining callbacks. */
 	return rcu_accelerate_cbs(rnp, rdp);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index e0274a2e1c1ae..263bb8a65a988 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -502,7 +502,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 		}
 		if (j != rdp->nocb_gp_adv_time &&
 		    rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
-		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq.norm)) {
+		    poll_state_synchronize_rcu_full(&cur_gp_seq)) {
 			rcu_advance_cbs_nowake(rdp->mynode, rdp);
 			rdp->nocb_gp_adv_time = j;
 		}
@@ -731,7 +731,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		if (!rcu_segcblist_restempty(&rdp->cblist,
 					     RCU_NEXT_READY_TAIL) ||
 		    (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
-		     rcu_seq_done(&rnp->gp_seq, cur_gp_seq.norm))) {
+		     poll_state_synchronize_rcu_full(&cur_gp_seq))) {
 			raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
 			needwake_gp = rcu_advance_cbs(rnp, rdp);
 			wasempty = rcu_segcblist_restempty(&rdp->cblist,
@@ -742,7 +742,18 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		WARN_ON_ONCE(wasempty &&
 			     !rcu_segcblist_restempty(&rdp->cblist,
 						      RCU_NEXT_READY_TAIL));
-		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+		/*
+		 * Only request a GP wait if the next pending callback's
+		 * GP has not already completed (normal or expedited).
+		 * If poll_state_synchronize_rcu_full() says it completed,
+		 * then rcu_advance_cbs() above already moved those
+		 * callbacks to RCU_DONE_TAIL, so there is no GP to wait
+		 * for.  Any remaining callbacks got new (future) GP
+		 * numbers from rcu_accelerate_cbs() inside
+		 * rcu_advance_cbs() and will be handled on the next pass.
+		 */
+		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+		    !poll_state_synchronize_rcu_full(&cur_gp_seq)) {
 			if (!needwait_gp ||
 			    ULONG_CMP_LT(cur_gp_seq.norm, wait_gp_seq))
 				wait_gp_seq = cur_gp_seq.norm;
@@ -919,7 +930,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
 	lockdep_assert_irqs_enabled();
 	rcu_nocb_lock_irqsave(rdp, flags);
 	if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
-	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq.norm) &&
+	    poll_state_synchronize_rcu_full(&cur_gp_seq) &&
 	    raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
 		needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
 		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
@@ -1548,8 +1559,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
 static void show_rcu_nocb_state(struct rcu_data *rdp)
 {
 	char bufd[22];
-	char bufw[45];
-	char bufr[45];
+	char bufw[64];
+	char bufr[64];
 	char bufn[22];
 	char bufb[22];
 	struct rcu_data *nocb_next_rdp;
@@ -1569,10 +1580,12 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
 					      nocb_entry_rdp);
 
 	sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
-	sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL],
-		rsclp->gp_seq[RCU_WAIT_TAIL].norm);
-	sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
-		rsclp->gp_seq[RCU_NEXT_READY_TAIL].norm);
+	sprintf(bufw, "%ld(%ld/%ld)", rsclp->seglen[RCU_WAIT_TAIL],
+		rsclp->gp_seq[RCU_WAIT_TAIL].norm,
+		rsclp->gp_seq[RCU_WAIT_TAIL].exp);
+	sprintf(bufr, "%ld(%ld/%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
+		rsclp->gp_seq[RCU_NEXT_READY_TAIL].norm,
+		rsclp->gp_seq[RCU_NEXT_READY_TAIL].exp);
 	sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
 	sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
 	pr_info("   CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 05/11] rcu: Add RCU_GET_STATE_NOT_TRACKED for subsystems without expedited GPs
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

SRCU and Tasks RCU do not track expedited grace periods. When their
callback state is checked via poll_state_synchronize_rcu_full(), the
uninitialized or zeroed exp field could cause false-positive
completion detection.

This commit adds an RCU_GET_STATE_NOT_TRACKED sentinel value (0x2) that
these subsystems can place into exp to indicate that expedited GP
tracking is not applicable. The expedited sequence check in
poll_state_synchronize_rcu_full() is guarded to skip entries marked with
this sentinel.

This is needed to allow rcu_segcblist_advance() and rcu_accelerate_cbs()
to work with both normal and expedited grace periods via
get_state_synchronize_rcu_full() and poll_state_synchronize_rcu_full().

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcu.h  | 13 +++++++++++--
 kernel/rcu/tree.c |  3 ++-
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 14faa11ef23cd..39a9f6fa9a7b2 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -46,16 +46,25 @@
  *					the number of pending readers that will use
  *					this inactive index is bounded).
  *
- * RCU polled GP special control value:
+ * RCU polled GP special control values:
  *
  *	RCU_GET_STATE_COMPLETED :	State value indicating an already-completed
  *					polled GP has completed.  This value covers
  *					both the state and the counter of the
  *					grace-period sequence number.
+ *
+ *	RCU_GET_STATE_NOT_TRACKED :	State value indicating that a GP component
+ *					is not tracked by this subsystem and should
+ *					not be checked.  Used by SRCU and RCU Tasks
+ *					which do not track expedited GPs, to prevent
+ *					false-positive completion when their
+ *					gp_seq entries are checked via
+ *					poll_state_synchronize_rcu_full().
  */
 
-/* Low-order bit definition for polled grace-period APIs. */
+/* Low-order bit definitions for polled grace-period APIs. */
 #define RCU_GET_STATE_COMPLETED	0x1
+#define RCU_GET_STATE_NOT_TRACKED	0x2
 
 /* A complete grace period count */
 #define RCU_SEQ_GP (RCU_SEQ_STATE_MASK + 1)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1d65505460bc7..095a023b19f1f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3635,7 +3635,8 @@ bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 	if (gsp->norm == RCU_GET_STATE_COMPLETED ||
 	    rcu_seq_done_exact(&rnp->gp_seq, gsp->norm) ||
 	    gsp->exp == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rcu_state.expedited_sequence, gsp->exp)) {
+	    (gsp->exp != RCU_GET_STATE_NOT_TRACKED &&
+	     rcu_seq_done_exact(&rcu_state.expedited_sequence, gsp->exp))) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 04/11] rcu/segcblist: Track segment grace periods with struct rcu_gp_seq
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Change the type of the per-segment ->gp_seq[] array in struct
rcu_segcblist from unsigned long to struct rcu_gp_seq.  This prepares the
callback tracking infrastructure to record both normal and expedited
grace periods per segment.

The rcu_segcblist_nextgp(), rcu_segcblist_advance(), and
rcu_segcblist_accelerate() helpers now take a struct rcu_gp_seq * instead
of an unsigned long, and all callers use the .norm field for comparisons
and assignments.  The SRCU and Tasks RCU wrappers construct a struct
rcu_gp_seq with only .norm set and forward to the core helpers.

No functional change: only the .norm field is used.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 include/linux/rcu_segcblist.h |  2 +-
 include/trace/events/rcu.h    |  5 +++--
 kernel/rcu/rcu_segcblist.c    | 24 ++++++++++++++----------
 kernel/rcu/rcu_segcblist.h    |  6 +++---
 kernel/rcu/tree.c             | 25 ++++++++++++++-----------
 kernel/rcu/tree_nocb.h        | 21 +++++++++++----------
 6 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 2fdc2208f1ca3..137cc23b024c5 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -190,7 +190,7 @@ struct rcu_cblist {
 struct rcu_segcblist {
 	struct rcu_head *head;
 	struct rcu_head **tails[RCU_CBLIST_NSEGS];
-	unsigned long gp_seq[RCU_CBLIST_NSEGS];
+	struct rcu_gp_seq gp_seq[RCU_CBLIST_NSEGS];
 #ifdef CONFIG_RCU_NOCB_CPU
 	atomic_long_t len;
 #else
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 5fbdabe3faead..c84309c388343 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -547,10 +547,11 @@ TRACE_EVENT_RCU(rcu_segcb_stats,
 		),
 
 		TP_fast_assign(
+			int i;
 			__entry->ctx = ctx;
 			memcpy(__entry->seglen, rs->seglen, RCU_CBLIST_NSEGS * sizeof(long));
-			memcpy(__entry->gp_seq, rs->gp_seq, RCU_CBLIST_NSEGS * sizeof(unsigned long));
-
+			for (i = 0; i < RCU_CBLIST_NSEGS; i++)
+				__entry->gp_seq[i] = rs->gp_seq[i].norm;
 		),
 
 		TP_printk("%s seglen: (DONE=%ld, WAIT=%ld, NEXT_READY=%ld, NEXT=%ld) "
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 421f1dadb5e55..4e3dfe42bc097 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -307,13 +307,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
 
 /*
  * Return false if there are no CBs awaiting grace periods, otherwise,
- * return true and store the nearest waited-upon grace period into *lp.
+ * return true and store the nearest waited-upon grace period state into *gsp.
  */
-bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 {
 	if (!rcu_segcblist_pend_cbs(rsclp))
 		return false;
-	*lp = rsclp->gp_seq[RCU_WAIT_TAIL];
+	*gsp = rsclp->gp_seq[RCU_WAIT_TAIL];
 	return true;
 }
 
@@ -496,7 +496,7 @@ static void rcu_segcblist_advance_compact(struct rcu_segcblist *rsclp, int i)
  * Advance the callbacks in the specified rcu_segcblist structure based
  * on the current value passed in for the grace-period counter.
  */
-void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 {
 	int i;
 
@@ -509,7 +509,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 	 * are ready to invoke, and put them into the RCU_DONE_TAIL segment.
 	 */
 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
-		if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
+		if (ULONG_CMP_LT(gsp->norm, rsclp->gp_seq[i].norm))
 			break;
 		WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
 		rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL);
@@ -537,7 +537,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
  * ready to invoke.  Returns true if there are callbacks that won't be
  * ready to invoke until seq, false otherwise.
  */
-bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp)
 {
 	int i, j;
 
@@ -555,7 +555,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 	 */
 	for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
 		if (!rcu_segcblist_segempty(rsclp, i) &&
-		    ULONG_CMP_LT(rsclp->gp_seq[i], seq))
+		    ULONG_CMP_LT(rsclp->gp_seq[i].norm, gsp->norm))
 			break;
 
 	/*
@@ -595,7 +595,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 	 */
 	for (; i < RCU_NEXT_TAIL; i++) {
 		WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
-		rsclp->gp_seq[i] = seq;
+		rsclp->gp_seq[i].norm = gsp->norm;
 	}
 	return true;
 }
@@ -637,10 +637,14 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 
 void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	rcu_segcblist_advance(rsclp, seq);
+	struct rcu_gp_seq gs = { .norm = seq };
+
+	rcu_segcblist_advance(rsclp, &gs);
 }
 
 bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	return rcu_segcblist_accelerate(rsclp, seq);
+	struct rcu_gp_seq gs = { .norm = seq };
+
+	return rcu_segcblist_accelerate(rsclp, &gs);
 }
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 956f2967d9d29..16b0cb6b32507 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -124,7 +124,7 @@ bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
 bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
 struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
 struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
 void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
 			   struct rcu_head *rhp);
 bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -139,8 +139,8 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
 void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 				   struct rcu_cblist *rclp);
-void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
-bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
+void rcu_segcblist_advance(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
+bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, struct rcu_gp_seq *gsp);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
 void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index af4b6daf6a0ff..1d65505460bc7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1142,7 +1142,7 @@ static void rcu_gp_kthread_wake(void)
  */
 static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 {
-	unsigned long gp_seq_req;
+	struct rcu_gp_seq gs;
 	bool ret = false;
 
 	rcu_lockdep_assert_cblist_protected(rdp);
@@ -1164,15 +1164,15 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * accelerating callback invocation to an earlier grace-period
 	 * number.
 	 */
-	gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq);
-	if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req))
-		ret = rcu_start_this_gp(rnp, rdp, gp_seq_req);
+	gs.norm = rcu_seq_snap(&rcu_state.gp_seq);
+	if (rcu_segcblist_accelerate(&rdp->cblist, &gs))
+		ret = rcu_start_this_gp(rnp, rdp, gs.norm);
 
 	/* Trace depending on how much we were able to accelerate. */
 	if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL))
-		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB"));
+		trace_rcu_grace_period(rcu_state.name, gs.norm, TPS("AccWaitCB"));
 	else
-		trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB"));
+		trace_rcu_grace_period(rcu_state.name, gs.norm, TPS("AccReadyCB"));
 
 	trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc"));
 
@@ -1189,14 +1189,14 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
 					struct rcu_data *rdp)
 {
-	unsigned long c;
+	struct rcu_gp_seq gs;
 	bool needwake;
 
 	rcu_lockdep_assert_cblist_protected(rdp);
-	c = rcu_seq_snap(&rcu_state.gp_seq);
-	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
+	gs.norm = rcu_seq_snap(&rcu_state.gp_seq);
+	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, gs.norm)) {
 		/* Old request still live, so mark recent callbacks. */
-		(void)rcu_segcblist_accelerate(&rdp->cblist, c);
+		(void)rcu_segcblist_accelerate(&rdp->cblist, &gs);
 		return;
 	}
 	raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1218,6 +1218,8 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
  */
 static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 {
+	struct rcu_gp_seq gs;
+
 	rcu_lockdep_assert_cblist_protected(rdp);
 	raw_lockdep_assert_held_rcu_node(rnp);
 
@@ -1229,7 +1231,8 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
 	 * Find all callbacks whose ->gp_seq numbers indicate that they
 	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
 	 */
-	rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq);
+	gs.norm = rnp->gp_seq;
+	rcu_segcblist_advance(&rdp->cblist, &gs);
 
 	/* Classify any remaining callbacks. */
 	return rcu_accelerate_cbs(rnp, rdp);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 373b877cf171d..e0274a2e1c1ae 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -433,7 +433,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 				bool lazy)
 {
 	unsigned long c;
-	unsigned long cur_gp_seq;
+	struct rcu_gp_seq cur_gp_seq;
 	unsigned long j = jiffies;
 	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
 	long lazy_len = READ_ONCE(rdp->lazy_len);
@@ -502,7 +502,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 		}
 		if (j != rdp->nocb_gp_adv_time &&
 		    rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
-		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq.norm)) {
 			rcu_advance_cbs_nowake(rdp->mynode, rdp);
 			rdp->nocb_gp_adv_time = j;
 		}
@@ -659,7 +659,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 {
 	bool bypass = false;
 	int __maybe_unused cpu = my_rdp->cpu;
-	unsigned long cur_gp_seq;
+	struct rcu_gp_seq cur_gp_seq;
 	unsigned long flags;
 	bool gotcbs = false;
 	unsigned long j = jiffies;
@@ -731,7 +731,7 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 		if (!rcu_segcblist_restempty(&rdp->cblist,
 					     RCU_NEXT_READY_TAIL) ||
 		    (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
-		     rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+		     rcu_seq_done(&rnp->gp_seq, cur_gp_seq.norm))) {
 			raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
 			needwake_gp = rcu_advance_cbs(rnp, rdp);
 			wasempty = rcu_segcblist_restempty(&rdp->cblist,
@@ -744,8 +744,8 @@ static noinline_for_stack void nocb_gp_wait(struct rcu_data *my_rdp)
 						      RCU_NEXT_READY_TAIL));
 		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
 			if (!needwait_gp ||
-			    ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
-				wait_gp_seq = cur_gp_seq;
+			    ULONG_CMP_LT(cur_gp_seq.norm, wait_gp_seq))
+				wait_gp_seq = cur_gp_seq.norm;
 			needwait_gp = true;
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("NeedWaitGP"));
@@ -877,7 +877,7 @@ static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
 static void nocb_cb_wait(struct rcu_data *rdp)
 {
 	struct rcu_segcblist *cblist = &rdp->cblist;
-	unsigned long cur_gp_seq;
+	struct rcu_gp_seq cur_gp_seq;
 	unsigned long flags;
 	bool needwake_gp = false;
 	struct rcu_node *rnp = rdp->mynode;
@@ -919,7 +919,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
 	lockdep_assert_irqs_enabled();
 	rcu_nocb_lock_irqsave(rdp, flags);
 	if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
-	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq.norm) &&
 	    raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
 		needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
 		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
@@ -1569,9 +1569,10 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
 					      nocb_entry_rdp);
 
 	sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
-	sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]);
+	sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL],
+		rsclp->gp_seq[RCU_WAIT_TAIL].norm);
 	sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
-		      rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+		rsclp->gp_seq[RCU_NEXT_READY_TAIL].norm);
 	sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
 	sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
 	pr_info("   CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 03/11] rcu/segcblist: Factor out rcu_segcblist_advance_compact() helper
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

This commit extracts the tail-pointer cleanup and segment compaction
logic from rcu_segcblist_advance() into a new static helper function,
rcu_segcblist_advance_compact(). This shared logic will be reused by the
upcoming srcu_segcblist_advance() standalone implementation, which
cannot call the core rcu_segcblist_advance() because that function will
use RCU-specific globals.

No functional change.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcu_segcblist.c | 50 ++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index da39d818b01b1..421f1dadb5e55 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -462,13 +462,43 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
 	WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
 }
 
+/*
+ * Clean up and compact the segmented callback list after callbacks have been
+ * advanced to the RCU_DONE_TAIL segment.  The @i parameter is the index of the
+ * first segment that was NOT advanced (i.e., the segment after the last one
+ * moved to RCU_DONE_TAIL). This function fixes up tail pointers and compacts
+ * any gaps left by the moved segments.
+ */
+static void rcu_segcblist_advance_compact(struct rcu_segcblist *rsclp, int i)
+{
+	int j;
+
+	/* Clean up tail pointers the caller might have misordered. */
+	for (j = RCU_WAIT_TAIL; j < i; j++)
+		WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
+
+	/*
+	 * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
+	 * and a non-empty RCU_NEXT_READY_TAIL.  If so, copy the
+	 * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
+	 * created by the now-ready-to-invoke segments.
+	 */
+	for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+		if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
+			break;  /* No more callbacks. */
+		WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
+		rcu_segcblist_move_seglen(rsclp, i, j);
+		rsclp->gp_seq[j] = rsclp->gp_seq[i];
+	}
+}
+
 /*
  * Advance the callbacks in the specified rcu_segcblist structure based
  * on the current value passed in for the grace-period counter.
  */
 void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 {
-	int i, j;
+	int i;
 
 	WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
 	if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
@@ -489,23 +519,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
 	if (i == RCU_WAIT_TAIL)
 		return;
 
-	/* Clean up tail pointers that might have been misordered above. */
-	for (j = RCU_WAIT_TAIL; j < i; j++)
-		WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
-
-	/*
-	 * Callbacks moved, so there might be an empty RCU_WAIT_TAIL
-	 * and a non-empty RCU_NEXT_READY_TAIL.  If so, copy the
-	 * RCU_NEXT_READY_TAIL segment to fill the RCU_WAIT_TAIL gap
-	 * created by the now-ready-to-invoke segments.
-	 */
-	for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
-		if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
-			break;  /* No more callbacks. */
-		WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
-		rcu_segcblist_move_seglen(rsclp, i, j);
-		rsclp->gp_seq[j] = rsclp->gp_seq[i];
-	}
+	rcu_segcblist_advance_compact(rsclp, i);
 }
 
 /*
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 02/11] rcu/segcblist: Add SRCU and Tasks RCU wrapper functions
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

Add srcu_segcblist_advance() and srcu_segcblist_accelerate() wrappers
that forward to the core rcu_segcblist_advance() and
rcu_segcblist_accelerate() functions, and switch all SRCU (srcutree.c)
and Tasks RCU (tasks.h) callers to use these wrappers.

This isolates SRCU and Tasks RCU from upcoming changes to the core
advance/accelerate functions, which will switch to struct
rcu_gp_seq for dual normal/expedited GP tracking. Because SRCU and
Tasks RCU use only normal GP sequences, their wrappers will maintain the
existing unsigned long interface.

No functional change.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 kernel/rcu/rcu_segcblist.c | 10 ++++++++++
 kernel/rcu/rcu_segcblist.h |  2 ++
 kernel/rcu/srcutree.c      | 14 +++++++-------
 kernel/rcu/tasks.h         |  8 ++++----
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 298a2c573f02c..da39d818b01b1 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -620,3 +620,13 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 
 	rcu_segcblist_init(src_rsclp);
 }
+
+void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+	rcu_segcblist_advance(rsclp, seq);
+}
+
+bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
+{
+	return rcu_segcblist_accelerate(rsclp, seq);
+}
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index fadc08ad4b7b6..956f2967d9d29 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -143,3 +143,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 			 struct rcu_segcblist *src_rsclp);
+void srcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
+bool srcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 7c2f7cc131f7a..519a35719c896 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1351,7 +1351,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	 *  2) The grace period for RCU_WAIT_TAIL is seen as started but not
 	 *     completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
 	 *
-	 *  3) This value is passed to rcu_segcblist_advance() which can't move
+	 *  3) This value is passed to srcu_segcblist_advance() which can't move
 	 *     any segment forward and fails.
 	 *
 	 *  4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
@@ -1360,15 +1360,15 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 	 *     RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
 	 *     so it returns a snapshot of the next grace period, which is X + 12.
 	 *
-	 *  5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
+	 *  5) The value of X + 12 is passed to srcu_segcblist_accelerate() but the
 	 *     freshly enqueued callback in RCU_NEXT_TAIL can't move to
 	 *     RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
 	 *     period (gp_num = X + 8). So acceleration fails.
 	 */
 	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
 	if (rhp) {
-		rcu_segcblist_advance(&sdp->srcu_cblist,
-				      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+		srcu_segcblist_advance(&sdp->srcu_cblist,
+				       rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
 		/*
 		 * Acceleration can never fail because the base current gp_seq
 		 * used for acceleration is <= the value of gp_seq used for
@@ -1376,7 +1376,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
 		 * always be able to be emptied by the acceleration into the
 		 * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
 		 */
-		WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
+		WARN_ON_ONCE(!srcu_segcblist_accelerate(&sdp->srcu_cblist, s));
 	}
 	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
 		sdp->srcu_gp_seq_needed = s;
@@ -1891,8 +1891,8 @@ static void srcu_invoke_callbacks(struct work_struct *work)
 	rcu_cblist_init(&ready_cbs);
 	raw_spin_lock_irq_rcu_node(sdp);
 	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
-	rcu_segcblist_advance(&sdp->srcu_cblist,
-			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+	srcu_segcblist_advance(&sdp->srcu_cblist,
+			       rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
 	/*
 	 * Although this function is theoretically re-entrant, concurrent
 	 * callbacks invocation is disallowed to avoid executing an SRCU barrier
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index f4da5fad70f51..92971499a12c5 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -481,8 +481,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
 			if (cpu > 0)
 				ncbsnz += n;
 		}
-		rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
-		(void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+		srcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+		(void)srcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
 		if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) {
 			if (rtp->lazy_jiffies)
 				rtpcp->urgent_gp--;
@@ -565,7 +565,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
 	if (rcu_segcblist_empty(&rtpcp->cblist))
 		return;
 	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
-	rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
+	srcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq));
 	rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl);
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 	len = rcl.len;
@@ -578,7 +578,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu
 	}
 	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
 	rcu_segcblist_add_len(&rtpcp->cblist, -len);
-	(void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
+	(void)srcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq));
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 }
 
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 01/11] rcu: Rename struct rcu_gp_oldstate to rcu_gp_seq
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao
In-Reply-To: <20260624132356.516959-1-puranjay@kernel.org>

The polled grace-period state structure rcu_gp_oldstate holds a snapshot
of the normal (and, on SMP, expedited) grace-period sequence numbers.
Upcoming changes store this structure in the callback segment list, where
the "oldstate" name reads poorly: there it represents the grace period a
segment is waiting on and is also compared against the current
grace-period state.

Rename struct rcu_gp_oldstate to the more neutral struct rcu_gp_seq, and
shorten its members rgos_norm and rgos_exp to norm and exp.  Local
variables and parameters of this type are renamed from rgosp/rgos to
gsp/gs accordingly; the paired rgosp1/rgosp2 parameters of
same_state_synchronize_rcu_full() keep their existing names.

While at it, provide a single definition of the structure in rcupdate.h
rather than separate Tiny-RCU and Tree-RCU definitions, and give it the
->exp field unconditionally.  Tiny RCU does not track expedited grace
periods and leaves ->exp unused, but a single definition that always has
->exp lets the shared callback code in rcu_segcblist.c reference it
without CONFIG_SMP guards, including on !SMP builds.

No functional change.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 include/linux/rcupdate.h      | 13 ++++++--
 include/linux/rcupdate_wait.h |  2 +-
 include/linux/rcutiny.h       | 36 +++++++++-----------
 include/linux/rcutree.h       | 29 +++++++---------
 kernel/rcu/rcutorture.c       | 30 ++++++++---------
 kernel/rcu/tiny.c             |  4 +--
 kernel/rcu/tree.c             | 62 +++++++++++++++++------------------
 kernel/rcu/tree_exp.h         | 18 +++++-----
 mm/slab_common.c              |  6 ++--
 9 files changed, 100 insertions(+), 100 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5e95acc33989b..ce00f1726e95e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,9 +52,18 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void rcu_barrier_tasks(void);
 void synchronize_rcu(void);
 
-struct rcu_gp_oldstate;
+/*
+ * Grace-period sequence snapshot for the polled RCU APIs: ->norm for the
+ * normal grace period and ->exp for the expedited one.  ->exp is unused by
+ * Tiny RCU, but is present unconditionally so that a single definition
+ * serves both Tiny RCU and Tree RCU.
+ */
+struct rcu_gp_seq {
+	unsigned long norm;
+	unsigned long exp;
+};
 unsigned long get_completed_synchronize_rcu(void);
-void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+void get_completed_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 
 // Maximum number of unsigned long values corresponding to
 // not-yet-completed RCU grace periods.
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index 4c92d4291cce7..fa884704a3b79 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -18,7 +18,7 @@ struct rcu_synchronize {
 	struct completion completion;
 
 	/* This is for debugging. */
-	struct rcu_gp_oldstate oldstate;
+	struct rcu_gp_seq oldstate;
 };
 void wakeme_after_rcu(struct rcu_head *head);
 
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index f519cd6802286..e56ded733b1b5 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -14,11 +14,7 @@
 
 #include <asm/param.h> /* for HZ */
 
-struct rcu_gp_oldstate {
-	unsigned long rgos_norm;
-};
-
-// Maximum number of rcu_gp_oldstate values corresponding to
+// Maximum number of rcu_gp_seq values corresponding to
 // not-yet-completed RCU grace periods.
 #define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 2
 
@@ -26,31 +22,31 @@ struct rcu_gp_oldstate {
  * Are the two oldstate values the same?  See the Tree RCU version for
  * docbook header.
  */
-static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1,
-						   struct rcu_gp_oldstate *rgosp2)
+static inline bool same_state_synchronize_rcu_full(struct rcu_gp_seq *rgosp1,
+						   struct rcu_gp_seq *rgosp2)
 {
-	return rgosp1->rgos_norm == rgosp2->rgos_norm;
+	return rgosp1->norm == rgosp2->norm;
 }
 
 unsigned long get_state_synchronize_rcu(void);
 
-static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+static inline void get_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = get_state_synchronize_rcu();
+	gsp->norm = get_state_synchronize_rcu();
 }
 
 unsigned long start_poll_synchronize_rcu(void);
 
-static inline void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+static inline void start_poll_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = start_poll_synchronize_rcu();
+	gsp->norm = start_poll_synchronize_rcu();
 }
 
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 
-static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	return poll_state_synchronize_rcu(rgosp->rgos_norm);
+	return poll_state_synchronize_rcu(gsp->norm);
 }
 
 static inline void cond_synchronize_rcu(unsigned long oldstate)
@@ -58,9 +54,9 @@ static inline void cond_synchronize_rcu(unsigned long oldstate)
 	might_sleep();
 }
 
-static inline void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+static inline void cond_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	cond_synchronize_rcu(rgosp->rgos_norm);
+	cond_synchronize_rcu(gsp->norm);
 }
 
 static inline unsigned long start_poll_synchronize_rcu_expedited(void)
@@ -68,9 +64,9 @@ static inline unsigned long start_poll_synchronize_rcu_expedited(void)
 	return start_poll_synchronize_rcu();
 }
 
-static inline void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+static inline void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = start_poll_synchronize_rcu_expedited();
+	gsp->norm = start_poll_synchronize_rcu_expedited();
 }
 
 static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
@@ -78,9 +74,9 @@ static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
 	cond_synchronize_rcu(oldstate);
 }
 
-static inline void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+static inline void cond_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp)
 {
-	cond_synchronize_rcu_expedited(rgosp->rgos_norm);
+	cond_synchronize_rcu_expedited(gsp->norm);
 }
 
 extern void rcu_barrier(void);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 9d2d7bd251d4f..16a04202888b4 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -38,12 +38,7 @@ void synchronize_rcu_expedited(void);
 void rcu_barrier(void);
 void rcu_momentary_eqs(void);
 
-struct rcu_gp_oldstate {
-	unsigned long rgos_norm;
-	unsigned long rgos_exp;
-};
-
-// Maximum number of rcu_gp_oldstate values corresponding to
+// Maximum number of rcu_gp_seq values corresponding to
 // not-yet-completed RCU grace periods.
 #define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4
 
@@ -60,29 +55,29 @@ struct rcu_gp_oldstate {
  * to a list header, allowing those structures to be slightly smaller.
  *
  * Note that equality is judged on a bitwise basis, so that an
- * @rcu_gp_oldstate structure with an already-completed state in one field
+ * @rcu_gp_seq structure with an already-completed state in one field
  * will compare not-equal to a structure with an already-completed state
- * in the other field.  After all, the @rcu_gp_oldstate structure is opaque
+ * in the other field.  After all, the @rcu_gp_seq structure is opaque
  * so how did such a situation come to pass in the first place?
  */
-static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1,
-						   struct rcu_gp_oldstate *rgosp2)
+static inline bool same_state_synchronize_rcu_full(struct rcu_gp_seq *rgosp1,
+						   struct rcu_gp_seq *rgosp2)
 {
-	return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp;
+	return rgosp1->norm == rgosp2->norm && rgosp1->exp == rgosp2->exp;
 }
 
 unsigned long start_poll_synchronize_rcu_expedited(void);
-void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp);
 void cond_synchronize_rcu_expedited(unsigned long oldstate);
-void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp);
 unsigned long get_state_synchronize_rcu(void);
-void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+void get_state_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 unsigned long start_poll_synchronize_rcu(void);
-void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+void start_poll_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
-bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 void cond_synchronize_rcu(unsigned long oldstate);
-void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
+void cond_synchronize_rcu_full(struct rcu_gp_seq *gsp);
 
 #ifdef CONFIG_PROVE_RCU
 void rcu_irq_exit_check_preempt(void);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4d4ebeeeab440..b09e15746a08c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -393,23 +393,23 @@ struct rcu_torture_ops {
 	void (*exp_current)(void);
 	unsigned long (*get_gp_state_exp)(void);
 	unsigned long (*start_gp_poll_exp)(void);
-	void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
+	void (*start_gp_poll_exp_full)(struct rcu_gp_seq *gsp);
 	bool (*poll_gp_state_exp)(unsigned long oldstate);
 	void (*cond_sync_exp)(unsigned long oldstate);
-	void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
+	void (*cond_sync_exp_full)(struct rcu_gp_seq *gsp);
 	unsigned long (*get_comp_state)(void);
-	void (*get_comp_state_full)(struct rcu_gp_oldstate *rgosp);
+	void (*get_comp_state_full)(struct rcu_gp_seq *gsp);
 	bool (*same_gp_state)(unsigned long oldstate1, unsigned long oldstate2);
-	bool (*same_gp_state_full)(struct rcu_gp_oldstate *rgosp1, struct rcu_gp_oldstate *rgosp2);
+	bool (*same_gp_state_full)(struct rcu_gp_seq *rgosp1, struct rcu_gp_seq *rgosp2);
 	unsigned long (*get_gp_state)(void);
-	void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+	void (*get_gp_state_full)(struct rcu_gp_seq *gsp);
 	unsigned long (*start_gp_poll)(void);
-	void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
+	void (*start_gp_poll_full)(struct rcu_gp_seq *gsp);
 	bool (*poll_gp_state)(unsigned long oldstate);
-	bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+	bool (*poll_gp_state_full)(struct rcu_gp_seq *gsp);
 	bool (*poll_need_2gp)(bool poll, bool poll_full);
 	void (*cond_sync)(unsigned long oldstate);
-	void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
+	void (*cond_sync_full)(struct rcu_gp_seq *gsp);
 	int poll_active;
 	int poll_active_full;
 	call_rcu_func_t call;
@@ -1608,7 +1608,7 @@ static void rcu_torture_write_types(void)
 static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
 {
 	unsigned long cookie;
-	struct rcu_gp_oldstate cookie_full;
+	struct rcu_gp_seq cookie_full;
 	bool dopoll;
 	bool dopoll_full;
 	unsigned long r = torture_random(trsp);
@@ -1656,18 +1656,18 @@ rcu_torture_writer(void *arg)
 	bool booting_still = false;
 	bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
 	unsigned long cookie;
-	struct rcu_gp_oldstate cookie_full;
+	struct rcu_gp_seq cookie_full;
 	int expediting = 0;
 	unsigned long gp_snap;
 	unsigned long gp_snap1;
-	struct rcu_gp_oldstate gp_snap_full;
-	struct rcu_gp_oldstate gp_snap1_full;
+	struct rcu_gp_seq gp_snap_full;
+	struct rcu_gp_seq gp_snap1_full;
 	int i;
 	int idx;
 	unsigned long j;
 	struct work_struct lazy_work;
 	int oldnice = task_nice(current);
-	struct rcu_gp_oldstate *rgo = NULL;
+	struct rcu_gp_seq *rgo = NULL;
 	int rgo_size = 0;
 	struct rcu_torture *rp;
 	struct rcu_torture *old_rp;
@@ -1966,7 +1966,7 @@ static int
 rcu_torture_fakewriter(void *arg)
 {
 	unsigned long gp_snap;
-	struct rcu_gp_oldstate gp_snap_full;
+	struct rcu_gp_seq gp_snap_full;
 	DEFINE_TORTURE_RANDOM(rand);
 
 	VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
@@ -2404,7 +2404,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct
 struct rcu_torture_one_read_state {
 	bool checkpolling;
 	unsigned long cookie;
-	struct rcu_gp_oldstate cookie_full;
+	struct rcu_gp_seq cookie_full;
 	unsigned long started;
 	struct rcu_torture *p;
 	int readstate;
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 585cade21010e..dccccd6be9411 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -187,9 +187,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * Store a grace-period-counter "cookie".  For more information,
  * see the Tree RCU header comment.
  */
-void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void get_completed_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+	gsp->norm = RCU_GET_STATE_COMPLETED;
 }
 EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e23d57f743912..af4b6daf6a0ff 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3290,7 +3290,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * Later on, this could in theory be the case for kernels built with
  * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
  * is not a common case.  Furthermore, this optimization would cause
- * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * the rcu_gp_seq structure to expand by 50%, so this potential
  * grace-period optimization is ignored once the scheduler is running.
  */
 static int rcu_blocking_is_gp(void)
@@ -3419,16 +3419,16 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
 
 /**
  * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
- * @rgosp: Place to put state cookie
+ * @gsp: Place to put state cookie
  *
- * Stores into @rgosp a value that will always be treated by functions
+ * Stores into @gsp a value that will always be treated by functions
  * like poll_state_synchronize_rcu_full() as a cookie whose grace period
  * has already completed.
  */
-void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void get_completed_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
-	rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
+	gsp->norm = RCU_GET_STATE_COMPLETED;
+	gsp->exp = RCU_GET_STATE_COMPLETED;
 }
 EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
 
@@ -3452,13 +3452,13 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
 /**
  * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
- * @rgosp: location to place combined normal/expedited grace-period state
+ * @gsp: location to place combined normal/expedited grace-period state
  *
- * Places the normal and expedited grace-period states in @rgosp.  This
+ * Places the normal and expedited grace-period states in @gsp.  This
  * state value can be passed to a later call to cond_synchronize_rcu_full()
  * or poll_state_synchronize_rcu_full() to determine whether or not a
  * grace period (whether normal or expedited) has elapsed in the meantime.
- * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * The rcu_gp_seq structure takes up twice the memory of an unsigned
  * long, but is guaranteed to see all grace periods.  In contrast, the
  * combined state occupies less memory, but can sometimes fail to take
  * grace periods into account.
@@ -3466,7 +3466,7 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
  * This does not guarantee that the needed grace period will actually
  * start.
  */
-void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void get_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
 	/*
 	 * Any prior manipulation of RCU-protected data must happen
@@ -3478,8 +3478,8 @@ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
 	// in poll_state_synchronize_rcu_full() notwithstanding.  Use of
 	// the latter here would result in too-short grace periods due to
 	// interactions with newly onlined CPUs.
-	rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq);
-	rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
+	gsp->norm = rcu_seq_snap(&rcu_state.gp_seq);
+	gsp->exp = rcu_seq_snap(&rcu_state.expedited_sequence);
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
 
@@ -3530,18 +3530,18 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
 
 /**
  * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
- * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ * @gsp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
  *
- * Places the normal and expedited grace-period states in *@rgos.  This
+ * Places the normal and expedited grace-period states in *@gsp.  This
  * state value can be passed to a later call to cond_synchronize_rcu_full()
  * or poll_state_synchronize_rcu_full() to determine whether or not a
  * grace period (whether normal or expedited) has elapsed in the meantime.
  * If the needed grace period is not already slated to start, notifies
  * RCU core of the need for that grace period.
  */
-void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void start_poll_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	get_state_synchronize_rcu_full(rgosp);
+	get_state_synchronize_rcu_full(gsp);
 
 	start_poll_synchronize_rcu_common();
 }
@@ -3593,19 +3593,19 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 
 /**
  * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
- * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ * @gsp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
  *
  * If a full RCU grace period has elapsed since the earlier call from
- * which *rgosp was obtained, return @true, otherwise return @false.
+ * which *gsp was obtained, return @true, otherwise return @false.
  * If @false is returned, it is the caller's responsibility to invoke this
  * function later on until it does return @true.  Alternatively, the caller
- * can explicitly wait for a grace period, for example, by passing @rgosp
+ * can explicitly wait for a grace period, for example, by passing @gsp
  * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
  *
  * Yes, this function does not take counter wrap into account.
  * But counter wrap is harmless.  If the counter wraps, we have waited
  * for more than a billion grace periods (and way more on a 64-bit
- * system!).  Those needing to keep rcu_gp_oldstate values for very
+ * system!).  Those needing to keep rcu_gp_seq values for very
  * long time periods (many hours even on 32-bit systems) should check
  * them occasionally and either refresh them or set a flag indicating
  * that the grace period has completed.  Alternatively, they can use
@@ -3614,7 +3614,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
  *
  * This function provides the same memory-ordering guarantees that would
  * be provided by a synchronize_rcu() that was invoked at the call to
- * the function that provided @rgosp, and that returned at the end of this
+ * the function that provided @gsp, and that returned at the end of this
  * function.  And this guarantee requires that the root rcu_node structure's
  * ->gp_seq field be checked instead of that of the rcu_state structure.
  * The problem is that the just-ending grace-period's callbacks can be
@@ -3624,15 +3624,15 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
  * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
  * then the root rcu_node structure is the one that needs to be polled.
  */
-bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+bool poll_state_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
 	struct rcu_node *rnp = rcu_get_root();
 
 	smp_mb(); // Order against root rcu_node structure grace-period cleanup.
-	if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
-	    rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
+	if (gsp->norm == RCU_GET_STATE_COMPLETED ||
+	    rcu_seq_done_exact(&rnp->gp_seq, gsp->norm) ||
+	    gsp->exp == RCU_GET_STATE_COMPLETED ||
+	    rcu_seq_done_exact(&rcu_state.expedited_sequence, gsp->exp)) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
@@ -3667,11 +3667,11 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
 /**
  * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
- * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ * @gsp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
  *
  * If a full RCU grace period has elapsed since the call to
  * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
- * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * or start_poll_synchronize_rcu_expedited_full() from which @gsp was
  * obtained, just return.  Otherwise, invoke synchronize_rcu() to wait
  * for a full grace period.
  *
@@ -3682,12 +3682,12 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
  *
  * This function provides the same memory-ordering guarantees that
  * would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @rgosp and that returned at the end of
+ * to the function that provided @gsp and that returned at the end of
  * this function.
  */
-void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+void cond_synchronize_rcu_full(struct rcu_gp_seq *gsp)
 {
-	if (!poll_state_synchronize_rcu_full(rgosp))
+	if (!poll_state_synchronize_rcu_full(gsp))
 		synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index a43469da39269..0569d8e40e86d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -1064,18 +1064,18 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
 
 /**
  * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
- * @rgosp: Place to put snapshot of grace-period state
+ * @gsp: Place to put snapshot of grace-period state
  *
- * Places the normal and expedited grace-period states in rgosp.  This
+ * Places the normal and expedited grace-period states in gsp.  This
  * state value can be passed to a later call to cond_synchronize_rcu_full()
  * or poll_state_synchronize_rcu_full() to determine whether or not a
  * grace period (whether normal or expedited) has elapsed in the meantime.
  * If the needed expedited grace period is not already slated to start,
  * initiates that grace period.
  */
-void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp)
 {
-	get_state_synchronize_rcu_full(rgosp);
+	get_state_synchronize_rcu_full(gsp);
 	(void)start_poll_synchronize_rcu_expedited();
 }
 EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
@@ -1109,11 +1109,11 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
 
 /**
  * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
- * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ * @gsp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
  *
  * If a full RCU grace period has elapsed since the call to
  * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
- * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * or start_poll_synchronize_rcu_expedited_full() from which @gsp was
  * obtained, just return.  Otherwise, invoke synchronize_rcu_expedited()
  * to wait for a full grace period.
  *
@@ -1124,12 +1124,12 @@ EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
  *
  * This function provides the same memory-ordering guarantees that
  * would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @rgosp and that returned at the end of
+ * to the function that provided @gsp and that returned at the end of
  * this function.
  */
-void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_seq *gsp)
 {
-	if (!poll_state_synchronize_rcu_full(rgosp))
+	if (!poll_state_synchronize_rcu_full(gsp))
 		synchronize_rcu_expedited();
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d5a70a831a2a5..f4ff50527db3a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1322,7 +1322,7 @@ static struct workqueue_struct *rcu_reclaim_wq;
  */
 struct kvfree_rcu_bulk_data {
 	struct list_head list;
-	struct rcu_gp_oldstate gp_snap;
+	struct rcu_gp_seq gp_snap;
 	unsigned long nr_records;
 	void *records[] __counted_by(nr_records);
 };
@@ -1347,7 +1347,7 @@ struct kvfree_rcu_bulk_data {
 struct kfree_rcu_cpu_work {
 	struct rcu_work rcu_work;
 	struct rcu_head *head_free;
-	struct rcu_gp_oldstate head_free_gp_snap;
+	struct rcu_gp_seq head_free_gp_snap;
 	struct list_head bulk_head_free[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu *krcp;
 };
@@ -1555,7 +1555,7 @@ static void kfree_rcu_work(struct work_struct *work)
 	struct rcu_head *head;
 	struct kfree_rcu_cpu *krcp;
 	struct kfree_rcu_cpu_work *krwp;
-	struct rcu_gp_oldstate head_gp_snap;
+	struct rcu_gp_seq head_gp_snap;
 	int i;
 
 	krwp = container_of(to_rcu_work(work),
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH v1 00/11] RCU: Enable callbacks to benefit from expedited grace periods
From: Puranjay Mohan @ 2026-06-24 13:23 UTC (permalink / raw)
  To: rcu, linux-kernel, linux-trace-kernel
  Cc: Puranjay Mohan, Paul E. McKenney, Frederic Weisbecker,
	Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng,
	Uladzislau Rezki, Steven Rostedt, Mathieu Desnoyers,
	Lai Jiangshan, Zqiang, Masami Hiramatsu, Davidlohr Bueso,
	Breno Leitao

This series lets call_rcu() callbacks be reclaimed as soon as either a
normal or an expedited grace period that covers them has elapsed, rather
than always waiting for a normal grace period.

Motivation
==========
Today there is an asymmetry: synchronize_rcu_expedited() callers get fast
reclaim, but call_rcu() callers never benefit from those same expedited
grace periods, even though an expedited GP proves exactly the same thing
as a normal one -- all pre-existing readers are done.  When expedited GPs
are running on the system (driven by other subsystems), call_rcu()
callbacks that could already be freed instead sit in RCU_WAIT_TAIL until
the next normal GP.  This series treats a grace period as a grace period
regardless of how it was driven, so memory is reclaimed sooner.

Design
======
Callback segments now record both the normal and expedited grace-period
sequence in struct rcu_gp_seq, and rcu_segcblist_advance() releases a
segment as soon as poll_state_synchronize_rcu_full() reports that either
has completed.  Three notification paths are taught about expedited
completion so the advance actually happens: the NOCB rcuog kthreads,
the rcu_pending() tick gate, and rcu_core().

Changelog:
RFC: https://lore.kernel.org/all/20260417231203.785172-1-puranjay@kernel.org/
Changes in v1:
 - New prep patch 1 renames struct rcu_gp_oldstate to struct rcu_gp_seq
   and its fields rgos_norm/rgos_exp to norm/exp tree-wide (Frederic).
 - The rcu_segcblist segment field stays named gp_seq; only its type
   changes (Frederic).
 - Patch 8 (NOCB wake) is reworked.  The RFC woke the wrong waitqueue
   (rdp_gp->nocb_gp_wq via wake_nocb_gp() rather than the leaf
   rnp->nocb_gp_wq[] that an rcuog kthread waiting for a GP sleeps on),
   and the wait condition only checked the normal ->gp_seq.  The rcuog
   grace-period wait now tracks a struct rcu_gp_seq and is released via
   poll_state_synchronize_rcu_full(); rcu_exp_wait_wake() wakes the leaf
   node through the new rcu_nocb_exp_cleanup() (Frederic).
 - rcu_pending() uses a new memory-ordering-free
   poll_state_synchronize_rcu_full_unordered() to avoid memory barriers
   on every tick, leaving the ordering duty to rcu_core() (Frederic).

Still open: Frederic asked whether the first smp_mb() in
poll_state_synchronize_rcu_full() is needed on the callback-advance path
(patch 6).  That path still uses the fully ordered helper; only
rcu_pending() was switched to the unordered variant.  Happy to revisit.

Puranjay Mohan (11):
  rcu: Rename struct rcu_gp_oldstate to rcu_gp_seq
  rcu/segcblist: Add SRCU and Tasks RCU wrapper functions
  rcu/segcblist: Factor out rcu_segcblist_advance_compact() helper
  rcu/segcblist: Track segment grace periods with struct rcu_gp_seq
  rcu: Add RCU_GET_STATE_NOT_TRACKED for subsystems without expedited
    GPs
  rcu: Enable RCU callbacks to benefit from expedited grace periods
  rcu: Update comments for gp_seq and expedited GP tracking
  rcu: Wake NOCB rcuog kthreads on expedited grace period completion
  rcu: Detect expedited grace period completion in rcu_pending()
  rcu: Advance callbacks for expedited GP completion in rcu_core()
  rcuscale: Add concurrent expedited GP threads for callback scaling
    tests

 include/linux/rcu_segcblist.h |  16 ++--
 include/linux/rcupdate.h      |  13 ++-
 include/linux/rcupdate_wait.h |   2 +-
 include/linux/rcutiny.h       |  36 ++++-----
 include/linux/rcutree.h       |  29 +++----
 include/trace/events/rcu.h    |   5 +-
 kernel/rcu/rcu.h              |  13 ++-
 kernel/rcu/rcu_segcblist.c    | 139 ++++++++++++++++++++++----------
 kernel/rcu/rcu_segcblist.h    |   8 +-
 kernel/rcu/rcuscale.c         |  84 ++++++++++++++++++-
 kernel/rcu/rcutorture.c       |  30 +++----
 kernel/rcu/srcutree.c         |  14 ++--
 kernel/rcu/tasks.h            |   8 +-
 kernel/rcu/tiny.c             |   4 +-
 kernel/rcu/tree.c             | 147 ++++++++++++++++++++++------------
 kernel/rcu/tree.h             |   3 +-
 kernel/rcu/tree_exp.h         |  20 ++---
 kernel/rcu/tree_nocb.h        | 131 ++++++++++++++++++++++++------
 mm/slab_common.c              |   6 +-
 19 files changed, 496 insertions(+), 212 deletions(-)


base-commit: 709d17a22bfac78765f6cbaec42e15bcd4aa4f08
-- 
2.53.0-Meta


^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: Christian König @ 2026-06-24 13:23 UTC (permalink / raw)
  To: Kaitao Cheng, David Laight
  Cc: Andrew Morton, David Hildenbrand, Jens Axboe, Tejun Heo,
	Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt, David Howells,
	Simona Vetter, Randy Dunlap, Luca Ceresoli, Philipp Stanner,
	linux-block, linux-kernel, cgroups, linux-ntfs-dev, linux-fsdevel,
	io-uring, audit, bpf, netdev, dri-devel, linux-perf-users,
	linux-trace-kernel, kexec, live-patching, linux-modules,
	linux-crypto, linux-pm, rcu, sched-ext, linux-mm, virtualization,
	damon, llvm, Kaitao Cheng
In-Reply-To: <351a6b67-b394-4c58-aee2-88b6c8089ad5@linux.dev>

On 6/24/26 15:14, Kaitao Cheng wrote:
> 
> 
> 在 2026/6/22 16:42, David Laight 写道:
>> On Mon, 22 Jun 2026 12:05:31 +0800
>> Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>>
>>> From: Kaitao Cheng <chengkaitao@kylinos.cn>
>>>
>>> The list_for_each*_safe() helpers are used when the loop body may
>>> remove the current entry.  Their API exposes the temporary cursor at
>>> every call site, even though most users only need it for the iterator
>>> implementation and never reference it in the loop body.
>>>
>>> Add *_mutable() variants for list and hlist iteration.  The new helpers
>>> support both forms: callers may keep passing an explicit temporary cursor
>>> when they need to inspect or reset it, or omit it and let the helper use
>>> a unique internal cursor.
>>
>> I'm not really sure 'mutable' means anything either.
>> It is possible to make it valid for the loop body (or even other threads)
>> to delete arbitrary list items - but that needs significant extra overheads.
>>
>> It might be worth doing something that doesn't need the extra variable,
>> but there is little point doing all the churn just to rename things.
>>
>>>
>>> This makes call sites that only mutate the list through the current entry
>>> less noisy, while keeping the existing *_safe() helpers available for
>>> compatibility.
>>>
>>> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
>>> ---
>>>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>>>  1 file changed, 231 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/include/linux/list.h b/include/linux/list.h
>>> index 09d979976b3b..1081def7cea9 100644
>>> --- a/include/linux/list.h
>>> +++ b/include/linux/list.h
>>> @@ -7,6 +7,7 @@
>>>  #include <linux/stddef.h>
>>>  #include <linux/poison.h>
>>>  #include <linux/const.h>
>>> +#include <linux/args.h>
>>>  
>>>  #include <asm/barrier.h>
>>>  
>>> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>>>  #define list_for_each_prev(pos, head) \
>>>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>>>  
>>> -/**
>>> - * list_for_each_safe - iterate over a list safe against removal of list entry
>>> - * @pos:	the &struct list_head to use as a loop cursor.
>>> - * @n:		another &struct list_head to use as temporary storage
>>> - * @head:	the head for your list.
>>> +/*
>>> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>>>   */
>>>  #define list_for_each_safe(pos, n, head) \
>>>  	for (pos = (head)->next, n = pos->next; \
>>>  	     !list_is_head(pos, (head)); \
>>>  	     pos = n, n = pos->next)
>>>  
>>> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
>>> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\
>>
>> Use auto
>>
>>> +	     !list_is_head(pos, (head));				\
>>> +	     pos = tmp, tmp = pos->next)
>>> +
>>> +#define __list_for_each_mutable1(pos, head)				\
>>> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
>>> +
>>> +#define __list_for_each_mutable2(pos, next, head)			\
>>> +	list_for_each_safe(pos, next, head)
>>> +
>>>  /**
>>> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
>>> + * list_for_each_mutable - iterate over a list safe against entry removal
>>>   * @pos:	the &struct list_head to use as a loop cursor.
>>> - * @n:		another &struct list_head to use as temporary storage
>>> - * @head:	the head for your list.
>>> + * @...:	either (head) or (next, head)
>>> + *
>>> + * next:	another &struct list_head to use as optional temporary storage.
>>> + *		The temporary cursor is internal unless explicitly supplied by
>>> + *		the caller.
>>> + * head:	the head for your list.
>>> + */
>>> +#define list_for_each_mutable(pos, ...)					\
>>> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
>>> +		(pos, __VA_ARGS__)
>>
>> The variable argument count logic really just slows down compilation.
>> Maybe there aren't enough copies of this code to make that significant.
>> But just because you can do it doesn't mean it is a good idea.
>> I'm also not sure it really adds anything to the readability.
>>
>> And, if you are going to make the middle argument optional there is
>> no need to change the macro name.
> 
> Christian König and Jani Nikula also disagree with the variadic-argument
> implementation approach. If we abandon that method, it means we will
> inevitably need to add some new macros. If mutable is not a good name,
> suggestions for better alternatives would be welcome; coming up with a
> suitable name is indeed rather tricky.

I don't think you need to add a new macro for the specific use case where people want to modify the next element of the iteration.

If I remember your numbers correctly, that is really a corner case, and continuing to use the existing *_safe() macros for that sounds perfectly fine to me.

Regards,
Christian.

^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: Kaitao Cheng @ 2026-06-24 13:14 UTC (permalink / raw)
  To: David Laight
  Cc: Andrew Morton, David Hildenbrand, Jens Axboe, Tejun Heo,
	Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt,
	Christian König, David Howells, Simona Vetter, Randy Dunlap,
	Luca Ceresoli, Philipp Stanner, linux-block, linux-kernel,
	cgroups, linux-ntfs-dev, linux-fsdevel, io-uring, audit, bpf,
	netdev, dri-devel, linux-perf-users, linux-trace-kernel, kexec,
	live-patching, linux-modules, linux-crypto, linux-pm, rcu,
	sched-ext, linux-mm, virtualization, damon, llvm, Kaitao Cheng
In-Reply-To: <20260622094242.64531b9a@pumpkin>



在 2026/6/22 16:42, David Laight 写道:
> On Mon, 22 Jun 2026 12:05:31 +0800
> Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
> 
>> From: Kaitao Cheng <chengkaitao@kylinos.cn>
>>
>> The list_for_each*_safe() helpers are used when the loop body may
>> remove the current entry.  Their API exposes the temporary cursor at
>> every call site, even though most users only need it for the iterator
>> implementation and never reference it in the loop body.
>>
>> Add *_mutable() variants for list and hlist iteration.  The new helpers
>> support both forms: callers may keep passing an explicit temporary cursor
>> when they need to inspect or reset it, or omit it and let the helper use
>> a unique internal cursor.
> 
> I'm not really sure 'mutable' means anything either.
> It is possible to make it valid for the loop body (or even other threads)
> to delete arbitrary list items - but that needs significant extra overheads.
> 
> It might be worth doing something that doesn't need the extra variable,
> but there is little point doing all the churn just to rename things.
> 
>>
>> This makes call sites that only mutate the list through the current entry
>> less noisy, while keeping the existing *_safe() helpers available for
>> compatibility.
>>
>> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
>> ---
>>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 231 insertions(+), 38 deletions(-)
>>
>> diff --git a/include/linux/list.h b/include/linux/list.h
>> index 09d979976b3b..1081def7cea9 100644
>> --- a/include/linux/list.h
>> +++ b/include/linux/list.h
>> @@ -7,6 +7,7 @@
>>  #include <linux/stddef.h>
>>  #include <linux/poison.h>
>>  #include <linux/const.h>
>> +#include <linux/args.h>
>>  
>>  #include <asm/barrier.h>
>>  
>> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>>  #define list_for_each_prev(pos, head) \
>>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>>  
>> -/**
>> - * list_for_each_safe - iterate over a list safe against removal of list entry
>> - * @pos:	the &struct list_head to use as a loop cursor.
>> - * @n:		another &struct list_head to use as temporary storage
>> - * @head:	the head for your list.
>> +/*
>> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>>   */
>>  #define list_for_each_safe(pos, n, head) \
>>  	for (pos = (head)->next, n = pos->next; \
>>  	     !list_is_head(pos, (head)); \
>>  	     pos = n, n = pos->next)
>>  
>> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
>> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\
> 
> Use auto
> 
>> +	     !list_is_head(pos, (head));				\
>> +	     pos = tmp, tmp = pos->next)
>> +
>> +#define __list_for_each_mutable1(pos, head)				\
>> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
>> +
>> +#define __list_for_each_mutable2(pos, next, head)			\
>> +	list_for_each_safe(pos, next, head)
>> +
>>  /**
>> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
>> + * list_for_each_mutable - iterate over a list safe against entry removal
>>   * @pos:	the &struct list_head to use as a loop cursor.
>> - * @n:		another &struct list_head to use as temporary storage
>> - * @head:	the head for your list.
>> + * @...:	either (head) or (next, head)
>> + *
>> + * next:	another &struct list_head to use as optional temporary storage.
>> + *		The temporary cursor is internal unless explicitly supplied by
>> + *		the caller.
>> + * head:	the head for your list.
>> + */
>> +#define list_for_each_mutable(pos, ...)					\
>> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
>> +		(pos, __VA_ARGS__)
> 
> The variable argument count logic really just slows down compilation.
> Maybe there aren't enough copies of this code to make that significant.
> But just because you can do it doesn't mean it is a good idea.
> I'm also not sure it really adds anything to the readability.
> 
> And, if you are going to make the middle argument optional there is
> no need to change the macro name.

Christian König and Jani Nikula also disagree with the variadic-argument
implementation approach. If we abandon that method, it means we will
inevitably need to add some new macros. If mutable is not a good name,
suggestions for better alternatives would be welcome; coming up with a
suitable name is indeed rather tricky.

-- 
Thanks
Kaitao Cheng


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox