Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH v10 7/8] selftests/ftrace: Add a testcase for fprobe events on module
From: Masami Hiramatsu (Google) @ 2026-04-20 14:01 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Add a testcase for fprobe events on module, which unloads a kernel
module on which fprobe events are probing and ensure the ftrace
hash map is cleared correctly.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v10:
 - Fix module name typo in error case trap.
 Changes in v9:
 - Use "trace-events-sample" instead of "trace_events_sample"
 - Add checking unload module and remove core-kernel event case.
 - Check test module exists when unloading it in EXIT.
 Changes in v8:
 - Newly added.
---
 .../test.d/dynevent/add_remove_fprobe_module.tc    |   87 ++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc

diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
new file mode 100644
index 000000000000..2915206777b6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
@@ -0,0 +1,87 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - add/remove fprobe events on module
+# requires: dynamic_events "f[:[<group>/][<event>]] <func-name>[%return] [<args>]":README enabled_functions
+
+rmmod trace-events-sample ||:
+if ! modprobe trace-events-sample ; then
+  echo "No trace-events sample module - please make CONFIG_SAMPLE_TRACE_EVENTS=m"
+  exit_unresolved;
+fi
+trap "lsmod | grep -q trace_events_sample && rmmod trace-events-sample" EXIT
+
+echo 0 > events/enable
+echo > dynamic_events
+
+FUNC1='foo_bar*'
+FUNC2='vfs_read'
+
+:;: "Add an event on the test module" ;:
+echo "f:test1 $FUNC1" >> dynamic_events
+echo 1 > events/fprobes/test1/enable
+
+:;: "Ensure it is enabled" ;:
+funcs=`cat enabled_functions | wc -l`
+test $funcs -ne 0
+
+:;: "Check the enabled_functions is cleared on unloading" ;:
+rmmod trace-events-sample
+funcs=`cat enabled_functions | wc -l`
+test $funcs -eq 0
+
+:;: "Check it is kept clean" ;:
+modprobe trace-events-sample
+echo 1 > events/fprobes/test1/enable || echo "OK"
+funcs=`cat enabled_functions | wc -l`
+test $funcs -eq 0
+
+:;: "Add another event not on the test module" ;:
+echo "f:test2 $FUNC2" >> dynamic_events
+echo 1 > events/fprobes/test2/enable
+
+:;: "Ensure it is enabled" ;:
+ofuncs=`cat enabled_functions | wc -l`
+test $ofuncs -ne 0
+
+:;: "Disable and remove the first event"
+echo 0 > events/fprobes/test1/enable
+echo "-:fprobes/test1" >> dynamic_events
+funcs=`cat enabled_functions | wc -l`
+test $ofuncs -eq $funcs
+
+:;: "Disable and remove other events" ;:
+echo 0 > events/fprobes/enable
+echo > dynamic_events
+funcs=`cat enabled_functions | wc -l`
+test $funcs -eq 0
+
+rmmod trace-events-sample
+
+:;: "Add events on kernel and test module" ;:
+modprobe trace-events-sample
+echo "f:test1 $FUNC1" >> dynamic_events
+echo 1 > events/fprobes/test1/enable
+echo "f:test2 $FUNC2" >> dynamic_events
+echo 1 > events/fprobes/test2/enable
+ofuncs=`cat enabled_functions | wc -l`
+test $ofuncs -ne 0
+
+:;: "Unload module (ftrace entry should be removed)" ;:
+rmmod trace-events-sample
+funcs=`cat enabled_functions | wc -l`
+test $funcs -ne 0
+test $ofuncs -ne $funcs
+
+:;: "Disable and remove core-kernel fprobe event" ;:
+echo 0 > events/fprobes/test2/enable
+echo "-:fprobes/test2" >> dynamic_events
+
+:;: "Ensure ftrace is disabled." ;:
+funcs=`cat enabled_functions | wc -l`
+test $funcs -eq 0
+
+echo 0 > events/fprobes/enable
+echo > dynamic_events
+
+trap "" EXIT
+clear_trace


^ permalink raw reply related

* [PATCH v10 6/8] tracing/fprobe: Fix to unregister ftrace_ops if it is empty on module unloading
From: Masami Hiramatsu (Google) @ 2026-04-20 14:01 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Fix fprobe to unregister ftrace_ops if corresponding type of fprobe
does not exist on the fprobe_ip_table and it is expected to be empty
when unloading modules.

Since ftrace thinks that the empty hash means everything to be traced,
if we set fprobes only on the unloaded module, all functions are traced
unexpectedly after unloading module.
e.g.

 # modprobe xt_LOG.ko
 # echo 'f:test log_tg*' > dynamic_events
 # echo 1 > events/fprobes/test/enable
 # cat enabled_functions
log_tg [xt_LOG] (1)             tramp: 0xffffffffa0004000 (fprobe_ftrace_entry+0x0/0x490) ->fprobe_ftrace_entry+0x0/0x490
log_tg_check [xt_LOG] (1)               tramp: 0xffffffffa0004000 (fprobe_ftrace_entry+0x0/0x490) ->fprobe_ftrace_entry+0x0/0x490
log_tg_destroy [xt_LOG] (1)             tramp: 0xffffffffa0004000 (fprobe_ftrace_entry+0x0/0x490) ->fprobe_ftrace_entry+0x0/0x490
 # rmmod xt_LOG
 # wc -l enabled_functions
34085 enabled_functions

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v10:
  - Even if the removed node count is 0 because of the memory allocation
    fails, call __fprobe_f*_unregister() if the number of ftrace_hash_node
    is 0.
 Changes in v9:
  - Remove fprobe_graph_active and fprobe_ftrace_active to fix
    remove fprobe after unload module case.
 Changes in v8:
  - Fix to check fprobe_graph/ftrace_registered flag directly
    when registering ftrace_ops.
 Changes in v7:
  - Fix to split checking whether ftrace_ops is registered from
    the number of registered fprobes, because ftrace_ops can be
    unregistered in module unloading.
 Changes in v6:
  - Newly added.
---
 kernel/trace/fprobe.c |  224 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 159 insertions(+), 65 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index d0a68a2c5eaf..cc49ebd2a773 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -79,7 +79,7 @@ static const struct rhashtable_params fprobe_rht_params = {
 };
 
 /* Node insertion and deletion requires the fprobe_mutex */
-static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
+static int __insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
 {
 	int ret;
 
@@ -92,7 +92,7 @@ static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
 	return ret;
 }
 
-static void delete_fprobe_node(struct fprobe_hlist_node *node)
+static void __delete_fprobe_node(struct fprobe_hlist_node *node)
 {
 	lockdep_assert_held(&fprobe_mutex);
 
@@ -250,7 +250,65 @@ static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent
 	return ret;
 }
 
+static int fprobe_fgraph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+			       struct ftrace_regs *fregs);
+static void fprobe_return(struct ftrace_graph_ret *trace,
+			  struct fgraph_ops *gops,
+			  struct ftrace_regs *fregs);
+
+static struct fgraph_ops fprobe_graph_ops = {
+	.entryfunc	= fprobe_fgraph_entry,
+	.retfunc	= fprobe_return,
+};
+/* Number of fgraph fprobe nodes */
+static int nr_fgraph_fprobes;
+/* Is fprobe_graph_ops registered? */
+static bool fprobe_graph_registered;
+
+/* Add @addrs to the ftrace filter and register fgraph if needed. */
+static int fprobe_graph_add_ips(unsigned long *addrs, int num)
+{
+	int ret;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
+	if (ret)
+		return ret;
+
+	if (!fprobe_graph_registered) {
+		ret = register_ftrace_graph(&fprobe_graph_ops);
+		if (WARN_ON_ONCE(ret)) {
+			ftrace_free_filter(&fprobe_graph_ops.ops);
+			return ret;
+		}
+		fprobe_graph_registered = true;
+	}
+	return 0;
+}
+
+static void __fprobe_graph_unregister(void)
+{
+	if (fprobe_graph_registered) {
+		unregister_ftrace_graph(&fprobe_graph_ops);
+		ftrace_free_filter(&fprobe_graph_ops.ops);
+		fprobe_graph_registered = false;
+	}
+}
+
+/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
+static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
+{
+	lockdep_assert_held(&fprobe_mutex);
+
+	if (!nr_fgraph_fprobes)
+		__fprobe_graph_unregister();
+	else if (num)
+		ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
+}
+
 #if defined(CONFIG_DYNAMIC_FTRACE_WITH_ARGS) || defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS)
+
 /* ftrace_ops callback, this processes fprobes which have only entry_handler. */
 static void fprobe_ftrace_entry(unsigned long ip, unsigned long parent_ip,
 	struct ftrace_ops *ops, struct ftrace_regs *fregs)
@@ -293,7 +351,10 @@ static struct ftrace_ops fprobe_ftrace_ops = {
 	.func	= fprobe_ftrace_entry,
 	.flags	= FTRACE_OPS_FL_SAVE_ARGS,
 };
-static int fprobe_ftrace_active;
+/* Number of ftrace fprobe nodes */
+static int nr_ftrace_fprobes;
+/* Is fprobe_ftrace_ops registered? */
+static bool fprobe_ftrace_registered;
 
 static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
 {
@@ -305,26 +366,33 @@ static int fprobe_ftrace_add_ips(unsigned long *addrs, int num)
 	if (ret)
 		return ret;
 
-	if (!fprobe_ftrace_active) {
+	if (!fprobe_ftrace_registered) {
 		ret = register_ftrace_function(&fprobe_ftrace_ops);
 		if (ret) {
 			ftrace_free_filter(&fprobe_ftrace_ops);
 			return ret;
 		}
+		fprobe_ftrace_registered = true;
 	}
-	fprobe_ftrace_active++;
 	return 0;
 }
 
+static void __fprobe_ftrace_unregister(void)
+{
+	if (fprobe_ftrace_registered) {
+		unregister_ftrace_function(&fprobe_ftrace_ops);
+		ftrace_free_filter(&fprobe_ftrace_ops);
+		fprobe_ftrace_registered = false;
+	}
+}
+
 static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
 {
 	lockdep_assert_held(&fprobe_mutex);
 
-	fprobe_ftrace_active--;
-	if (!fprobe_ftrace_active) {
-		unregister_ftrace_function(&fprobe_ftrace_ops);
-		ftrace_free_filter(&fprobe_ftrace_ops);
-	} else if (num)
+	if (!nr_ftrace_fprobes)
+		__fprobe_ftrace_unregister();
+	else if (num)
 		ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
 }
 
@@ -333,6 +401,40 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 	return !fp->exit_handler;
 }
 
+/* Node insertion and deletion requires the fprobe_mutex */
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
+{
+	int ret;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	ret = __insert_fprobe_node(node, fp);
+	if (!ret) {
+		if (fprobe_is_ftrace(fp))
+			nr_ftrace_fprobes++;
+		else
+			nr_fgraph_fprobes++;
+	}
+
+	return ret;
+}
+
+static void delete_fprobe_node(struct fprobe_hlist_node *node)
+{
+	struct fprobe *fp;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	fp = READ_ONCE(node->fp);
+	if (fp) {
+		if (fprobe_is_ftrace(fp))
+			nr_ftrace_fprobes--;
+		else
+			nr_fgraph_fprobes--;
+	}
+	__delete_fprobe_node(node);
+}
+
 static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace)
 {
 	struct rhlist_head *head, *pos;
@@ -362,8 +464,15 @@ static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace)
 #ifdef CONFIG_MODULES
 static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
-	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
-	ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0);
+	if (!nr_fgraph_fprobes)
+		__fprobe_graph_unregister();
+	else if (cnt)
+		ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
+
+	if (!nr_ftrace_fprobes)
+		__fprobe_ftrace_unregister();
+	else if (cnt)
+		ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0);
 }
 #endif
 #else
@@ -381,6 +490,32 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 	return false;
 }
 
+/* Node insertion and deletion requires the fprobe_mutex */
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
+{
+	int ret;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	ret = __insert_fprobe_node(node, fp);
+	if (!ret)
+		nr_fgraph_fprobes++;
+
+	return ret;
+}
+
+static void delete_fprobe_node(struct fprobe_hlist_node *node)
+{
+	struct fprobe *fp;
+
+	lockdep_assert_held(&fprobe_mutex);
+
+	fp = READ_ONCE(node->fp);
+	if (fp)
+		nr_fgraph_fprobes--;
+	__delete_fprobe_node(node);
+}
+
 static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused)
 {
 	struct rhlist_head *head, *pos;
@@ -407,7 +542,10 @@ static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused)
 #ifdef CONFIG_MODULES
 static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
-	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
+	if (!nr_fgraph_fprobes)
+		__fprobe_graph_unregister();
+	else if (cnt)
+		ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
 }
 #endif
 #endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */
@@ -535,48 +673,6 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
 }
 NOKPROBE_SYMBOL(fprobe_return);
 
-static struct fgraph_ops fprobe_graph_ops = {
-	.entryfunc	= fprobe_fgraph_entry,
-	.retfunc	= fprobe_return,
-};
-static int fprobe_graph_active;
-
-/* Add @addrs to the ftrace filter and register fgraph if needed. */
-static int fprobe_graph_add_ips(unsigned long *addrs, int num)
-{
-	int ret;
-
-	lockdep_assert_held(&fprobe_mutex);
-
-	ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
-	if (ret)
-		return ret;
-
-	if (!fprobe_graph_active) {
-		ret = register_ftrace_graph(&fprobe_graph_ops);
-		if (WARN_ON_ONCE(ret)) {
-			ftrace_free_filter(&fprobe_graph_ops.ops);
-			return ret;
-		}
-	}
-	fprobe_graph_active++;
-	return 0;
-}
-
-/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
-static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
-{
-	lockdep_assert_held(&fprobe_mutex);
-
-	fprobe_graph_active--;
-	/* Q: should we unregister it ? */
-	if (!fprobe_graph_active) {
-		unregister_ftrace_graph(&fprobe_graph_ops);
-		ftrace_free_filter(&fprobe_graph_ops.ops);
-	} else if (num)
-		ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
-}
-
 #ifdef CONFIG_MODULES
 
 #define FPROBE_IPS_BATCH_INIT 128
@@ -653,16 +749,14 @@ static int fprobe_module_callback(struct notifier_block *nb,
 	} while (node == ERR_PTR(-EAGAIN) && !retry);
 	rhashtable_walk_exit(&iter);
 	/* Remove any ips from hash table(s) */
-	if (alist.index > 0) {
-		fprobe_remove_ips(alist.addrs, alist.index);
-		/*
-		 * If we break rhashtable walk loop except for -EAGAIN, we need
-		 * to restart looping from start for safety. Anyway, this is
-		 * not a hotpath.
-		 */
-		if (retry)
-			goto again;
-	}
+	fprobe_remove_ips(alist.addrs, alist.index);
+	/*
+	 * If we break rhashtable walk loop except for -EAGAIN, we need
+	 * to restart looping from start for safety. Anyway, this is
+	 * not a hotpath.
+	 */
+	if (retry)
+		goto again;
 
 	mutex_unlock(&fprobe_mutex);
 


^ permalink raw reply related

* [PATCH v10 5/8] tracing/fprobe: Check the same type fprobe on table as the unregistered one
From: Masami Hiramatsu (Google) @ 2026-04-20 14:01 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Commit 2c67dc457bc6 ("tracing: fprobe: optimization for entry only case")
introduced a different ftrace_ops for entry-only fprobes.

However, when unregistering an fprobe, the kernel only checks if another
fprobe exists at the same address, without checking which type of fprobe
it is.
If different fprobes are registered at the same address, the same address
will be registered in both fgraph_ops and ftrace_ops, but only one of
them will be deleted when unregistering. (the one removed first will not
be deleted from the ops).

This results in junk entries remaining in either fgraph_ops or ftrace_ops.
For example:
 =======
 cd /sys/kernel/tracing

 # 'Add entry and exit events on the same place'
 echo 'f:event1 vfs_read' >> dynamic_events
 echo 'f:event2 vfs_read%return' >> dynamic_events

 # 'Enable both of them'
 echo 1 > events/fprobes/enable
 cat enabled_functions
vfs_read (2)            ->arch_ftrace_ops_list_func+0x0/0x210

 # 'Disable and remove exit event'
 echo 0 > events/fprobes/event2/enable
 echo -:event2 >> dynamic_events

 # 'Disable and remove all events'
 echo 0 > events/fprobes/enable
 echo > dynamic_events

 # 'Add another event'
 echo 'f:event3 vfs_open%return' > dynamic_events
 cat dynamic_events
f:fprobes/event3 vfs_open%return

 echo 1 > events/fprobes/enable
 cat enabled_functions
vfs_open (1)            tramp: 0xffffffffa0001000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60    subops: {ent:fprobe_fgraph_entry+0x0/0x620 ret:fprobe_return+0x0/0x150}
vfs_read (1)            tramp: 0xffffffffa0001000 (ftrace_graph_func+0x0/0x60) ->ftrace_graph_func+0x0/0x60    subops: {ent:fprobe_fgraph_entry+0x0/0x620 ret:fprobe_return+0x0/0x150}
 =======

As you can see, an entry for the vfs_read remains.

To fix this issue, when unregistering, the kernel should also check if
there is the same type of fprobes still exist at the same address, and
if not, delete its entry from either fgraph_ops or ftrace_ops.

Fixes: 2c67dc457bc6 ("tracing: fprobe: optimization for entry only case")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 kernel/trace/fprobe.c |   82 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 65 insertions(+), 17 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 9b913facfd36..d0a68a2c5eaf 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -92,11 +92,8 @@ static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
 	return ret;
 }
 
-/* Return true if there are synonims */
-static bool delete_fprobe_node(struct fprobe_hlist_node *node)
+static void delete_fprobe_node(struct fprobe_hlist_node *node)
 {
-	bool ret;
-
 	lockdep_assert_held(&fprobe_mutex);
 
 	/* Avoid double deleting and non-inserted nodes */
@@ -105,13 +102,6 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 		rhltable_remove(&fprobe_ip_table, &node->hlist,
 				fprobe_rht_params);
 	}
-
-	rcu_read_lock();
-	ret = !!rhltable_lookup(&fprobe_ip_table, &node->addr,
-				fprobe_rht_params);
-	rcu_read_unlock();
-
-	return ret;
 }
 
 /* Check existence of the fprobe */
@@ -343,6 +333,32 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 	return !fp->exit_handler;
 }
 
+static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace)
+{
+	struct rhlist_head *head, *pos;
+	struct fprobe_hlist_node *node;
+	struct fprobe *fp;
+
+	guard(rcu)();
+	head = rhltable_lookup(&fprobe_ip_table, &ip,
+				fprobe_rht_params);
+	if (!head)
+		return false;
+	/* We have to check the same type on the list. */
+	rhl_for_each_entry_rcu(node, pos, head, hlist) {
+		if (node->addr != ip)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (likely(fp)) {
+			if ((!ftrace && fp->exit_handler) ||
+			    (ftrace && !fp->exit_handler))
+				return true;
+		}
+	}
+
+	return false;
+}
+
 #ifdef CONFIG_MODULES
 static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
@@ -365,6 +381,29 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 	return false;
 }
 
+static bool fprobe_exists_on_hash(unsigned long ip, bool ftrace __maybe_unused)
+{
+	struct rhlist_head *head, *pos;
+	struct fprobe_hlist_node *node;
+	struct fprobe *fp;
+
+	guard(rcu)();
+	head = rhltable_lookup(&fprobe_ip_table, &ip,
+				fprobe_rht_params);
+	if (!head)
+		return false;
+	/* We only need to check fp is there. */
+	rhl_for_each_entry_rcu(node, pos, head, hlist) {
+		if (node->addr != ip)
+			break;
+		fp = READ_ONCE(node->fp);
+		if (likely(fp))
+			return true;
+	}
+
+	return false;
+}
+
 #ifdef CONFIG_MODULES
 static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
@@ -551,18 +590,25 @@ struct fprobe_addr_list {
 static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
 					 struct fprobe_addr_list *alist)
 {
+	lockdep_assert_in_rcu_read_lock();
+
 	if (!within_module(node->addr, mod))
 		return 0;
 
-	if (delete_fprobe_node(node))
-		return 0;
+	delete_fprobe_node(node);
 	/* If no address list is available, we can't track this address. */
 	if (!alist->addrs)
 		return 0;
+	/*
+	 * Don't care the type here, because all fprobes on the same
+	 * address must be removed eventually.
+	 */
+	if (!rhltable_lookup(&fprobe_ip_table, &node->addr, fprobe_rht_params)) {
+		alist->addrs[alist->index++] = node->addr;
+		if (alist->index == alist->size)
+			return -ENOSPC;
+	}
 
-	alist->addrs[alist->index++] = node->addr;
-	if (alist->index == alist->size)
-		return -ENOSPC;
 	return 0;
 }
 
@@ -933,7 +979,9 @@ static int unregister_fprobe_nolock(struct fprobe *fp)
 	/* Remove non-synonim ips from table and hash */
 	count = 0;
 	for (i = 0; i < hlist_array->size; i++) {
-		if (!delete_fprobe_node(&hlist_array->array[i]) && addrs)
+		delete_fprobe_node(&hlist_array->array[i]);
+		if (addrs && !fprobe_exists_on_hash(hlist_array->array[i].addr,
+						    fprobe_is_ftrace(fp)))
 			addrs[count++] = hlist_array->array[i].addr;
 	}
 	del_fprobe_hash(fp);


^ permalink raw reply related

* [PATCH v10 4/8] tracing/fprobe: Avoid kcalloc() in rcu_read_lock section
From: Masami Hiramatsu (Google) @ 2026-04-20 14:01 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

fprobe_remove_node_in_module() is called under RCU read locked, but
this invokes kcalloc() if there are more than 8 fprobes installed
on the module. Sashiko warns it because kcalloc() can sleep [1].

 [1] https://sashiko.dev/#/patchset/177552432201.853249.5125045538812833325.stgit%40mhiramat.tok.corp.google.com

To fix this issue, expand the batch size to 128 and do not expand
the fprobe_addr_list, but just cancel walking on fprobe_ip_table,
update fgraph/ftrace_ops and retry the loop again.

Fixes: 0de4c70d04a4 ("tracing: fprobe: use rhltable for fprobe_ip_table")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v6:
  - Retry outside rhltable_walk_enter/exit() again.
 Changes in v5:
  - Skip updating ftrace_ops when fails to allocate memory in module
    unloading.
 Changes in v4:
  - fix a build error typo in case of CONFIG_DYNAMIC_FTRACE=n.
 Changes in v3:
  - Retry inside rhltable_walk_enter/exit().
  - Rename fprobe_set_ips() to fprobe_remove_ips().
  - Rename 'retry' label to 'again'.
---
 kernel/trace/fprobe.c |   92 ++++++++++++++++++++++++-------------------------
 1 file changed, 45 insertions(+), 47 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 621477ad0947..9b913facfd36 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -344,11 +344,10 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 }
 
 #ifdef CONFIG_MODULES
-static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
-			   int reset)
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
-	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
-	ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset);
+	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
+	ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0);
 }
 #endif
 #else
@@ -367,10 +366,9 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 }
 
 #ifdef CONFIG_MODULES
-static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
-			   int reset)
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
-	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
+	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
 }
 #endif
 #endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */
@@ -542,7 +540,7 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
 
 #ifdef CONFIG_MODULES
 
-#define FPROBE_IPS_BATCH_INIT 8
+#define FPROBE_IPS_BATCH_INIT 128
 /* instruction pointer address list */
 struct fprobe_addr_list {
 	int index;
@@ -550,45 +548,24 @@ struct fprobe_addr_list {
 	unsigned long *addrs;
 };
 
-static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr)
+static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
+					 struct fprobe_addr_list *alist)
 {
-	unsigned long *addrs;
-
-	/* Previously we failed to expand the list. */
-	if (alist->index == alist->size)
-		return -ENOSPC;
-
-	alist->addrs[alist->index++] = addr;
-	if (alist->index < alist->size)
+	if (!within_module(node->addr, mod))
 		return 0;
 
-	/* Expand the address list */
-	addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL);
-	if (!addrs)
-		return -ENOMEM;
-
-	memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs));
-	alist->size *= 2;
-	kfree(alist->addrs);
-	alist->addrs = addrs;
+	if (delete_fprobe_node(node))
+		return 0;
+	/* If no address list is available, we can't track this address. */
+	if (!alist->addrs)
+		return 0;
 
+	alist->addrs[alist->index++] = node->addr;
+	if (alist->index == alist->size)
+		return -ENOSPC;
 	return 0;
 }
 
-static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
-					 struct fprobe_addr_list *alist)
-{
-	if (!within_module(node->addr, mod))
-		return;
-	if (delete_fprobe_node(node))
-		return;
-	/*
-	 * If failed to update alist, just continue to update hlist.
-	 * Therefore, at list user handler will not hit anymore.
-	 */
-	fprobe_addr_list_add(alist, node->addr);
-}
-
 /* Handle module unloading to manage fprobe_ip_table. */
 static int fprobe_module_callback(struct notifier_block *nb,
 				  unsigned long val, void *data)
@@ -597,29 +574,50 @@ static int fprobe_module_callback(struct notifier_block *nb,
 	struct fprobe_hlist_node *node;
 	struct rhashtable_iter iter;
 	struct module *mod = data;
+	bool retry;
 
 	if (val != MODULE_STATE_GOING)
 		return NOTIFY_DONE;
 
 	alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL);
-	/* If failed to alloc memory, we can not remove ips from hash. */
-	if (!alist.addrs)
-		return NOTIFY_DONE;
+	/*
+	 * If failed to alloc memory, ftrace_ops will not be able to remove ips from
+	 * hash, but we can still remove nodes from fprobe_ip_table, so we can avoid
+	 * the potential wrong callback. So just print a warning here and try to
+	 * continue without address list.
+	 */
+	WARN_ONCE(!alist.addrs,
+		"Failed to allocate memory for fprobe_addr_list, ftrace_ops will not be updated");
 
 	mutex_lock(&fprobe_mutex);
+again:
+	retry = false;
+	alist.index = 0;
 	rhltable_walk_enter(&fprobe_ip_table, &iter);
 	do {
 		rhashtable_walk_start(&iter);
 
 		while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
-			fprobe_remove_node_in_module(mod, node, &alist);
+			if (fprobe_remove_node_in_module(mod, node, &alist) < 0) {
+				retry = true;
+				break;
+			}
 
 		rhashtable_walk_stop(&iter);
-	} while (node == ERR_PTR(-EAGAIN));
+	} while (node == ERR_PTR(-EAGAIN) && !retry);
 	rhashtable_walk_exit(&iter);
+	/* Remove any ips from hash table(s) */
+	if (alist.index > 0) {
+		fprobe_remove_ips(alist.addrs, alist.index);
+		/*
+		 * If we break rhashtable walk loop except for -EAGAIN, we need
+		 * to restart looping from start for safety. Anyway, this is
+		 * not a hotpath.
+		 */
+		if (retry)
+			goto again;
+	}
 
-	if (alist.index > 0)
-		fprobe_set_ips(alist.addrs, alist.index, 1, 0);
 	mutex_unlock(&fprobe_mutex);
 
 	kfree(alist.addrs);


^ permalink raw reply related

* [PATCH v10 3/8] tracing/fprobe: Remove fprobe from hash in failure path
From: Masami Hiramatsu (Google) @ 2026-04-20 14:01 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

When register_fprobe_ips() fails, it tries to remove a list of
fprobe_hash_node from fprobe_ip_table, but it missed to remove
fprobe itself from fprobe_table. Moreover, when removing
the fprobe_hash_node which is added to rhltable once, it must
use kfree_rcu() after removing from rhltable.

To fix these issues, this reuses unregister_fprobe() internal
code to rollback the half-way registered fprobe.

Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v10:
  - Add RCU sync for the registration failure.
 Changes in v8:
  - Fix to check return value of add_fprobe_hash() and break
    loop if insert_fprobe_node() is failed.
 Changes in v7:
  - Remove RCU grace period wait, since fprobe itself is not
    that is not needed.
 Changes in v6:
  - Wait for an RCU grace period before returning error in
    unregister_fprobe_nolock().
 Changes in v5:
  - When rolling back an fprobe that failed to register, the
    fprobe_hash_node are forcibly removed and warn if failure.
 Changes in v4:
  - Remove short-cut case because we always need to upadte ftrace_ops.
  - Use guard(mutex) in register_fprobe_ips() to unlock it correctly.
  - Remove redundant !ret check in register_fprobe_ips().
  - Do not set hlist_array->size in failure case, instead,
    hlist_array->array[i].fp is set only when insertion is succeeded.
  Changes in v3:
  - Newly added.
---
 kernel/trace/fprobe.c |   90 ++++++++++++++++++++++++++-----------------------
 1 file changed, 47 insertions(+), 43 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index a2b659006e0e..621477ad0947 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -79,20 +79,27 @@ static const struct rhashtable_params fprobe_rht_params = {
 };
 
 /* Node insertion and deletion requires the fprobe_mutex */
-static int insert_fprobe_node(struct fprobe_hlist_node *node)
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
 {
+	int ret;
+
 	lockdep_assert_held(&fprobe_mutex);
 
-	return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+	ret = rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+	/* Set the fprobe pointer if insertion was successful. */
+	if (!ret)
+		WRITE_ONCE(node->fp, fp);
+	return ret;
 }
 
 /* Return true if there are synonims */
 static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 {
-	lockdep_assert_held(&fprobe_mutex);
 	bool ret;
 
-	/* Avoid double deleting */
+	lockdep_assert_held(&fprobe_mutex);
+
+	/* Avoid double deleting and non-inserted nodes */
 	if (READ_ONCE(node->fp) != NULL) {
 		WRITE_ONCE(node->fp, NULL);
 		rhltable_remove(&fprobe_ip_table, &node->hlist,
@@ -756,7 +763,6 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
 	fp->hlist_array = hlist_array;
 	hlist_array->fp = fp;
 	for (i = 0; i < num; i++) {
-		hlist_array->array[i].fp = fp;
 		addr = ftrace_location(addrs[i]);
 		if (!addr) {
 			fprobe_fail_cleanup(fp);
@@ -820,6 +826,8 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter
 }
 EXPORT_SYMBOL_GPL(register_fprobe);
 
+static int unregister_fprobe_nolock(struct fprobe *fp);
+
 /**
  * register_fprobe_ips() - Register fprobe to ftrace by address.
  * @fp: A fprobe data structure to be registered.
@@ -846,28 +854,25 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 	if (ret)
 		return ret;
 
-	hlist_array = fp->hlist_array;
 	if (fprobe_is_ftrace(fp))
 		ret = fprobe_ftrace_add_ips(addrs, num);
 	else
 		ret = fprobe_graph_add_ips(addrs, num);
-
-	if (!ret) {
-		add_fprobe_hash(fp);
-		for (i = 0; i < hlist_array->size; i++) {
-			ret = insert_fprobe_node(&hlist_array->array[i]);
-			if (ret)
-				break;
-		}
-		/* fallback on insert error */
-		if (ret) {
-			for (i--; i >= 0; i--)
-				delete_fprobe_node(&hlist_array->array[i]);
-		}
+	if (ret) {
+		fprobe_fail_cleanup(fp);
+		return ret;
 	}
 
-	if (ret)
-		fprobe_fail_cleanup(fp);
+	hlist_array = fp->hlist_array;
+	ret = add_fprobe_hash(fp);
+	for (i = 0; i < hlist_array->size && !ret; i++)
+		ret = insert_fprobe_node(&hlist_array->array[i], fp);
+
+	if (ret) {
+		unregister_fprobe_nolock(fp);
+		/* In error case, wait for clean up safely. */
+		synchronize_rcu();
+	}
 
 	return ret;
 }
@@ -911,27 +916,12 @@ bool fprobe_is_registered(struct fprobe *fp)
 	return true;
 }
 
-/**
- * unregister_fprobe() - Unregister fprobe.
- * @fp: A fprobe data structure to be unregistered.
- *
- * Unregister fprobe (and remove ftrace hooks from the function entries).
- *
- * Return 0 if @fp is unregistered successfully, -errno if not.
- */
-int unregister_fprobe(struct fprobe *fp)
+static int unregister_fprobe_nolock(struct fprobe *fp)
 {
-	struct fprobe_hlist *hlist_array;
+	struct fprobe_hlist *hlist_array = fp->hlist_array;
 	unsigned long *addrs = NULL;
-	int ret = 0, i, count;
+	int i, count;
 
-	mutex_lock(&fprobe_mutex);
-	if (!fp || !fprobe_registered(fp)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	hlist_array = fp->hlist_array;
 	addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
 	/*
 	 * This will remove fprobe_hash_node from the hash table even if
@@ -957,12 +947,26 @@ int unregister_fprobe(struct fprobe *fp)
 
 	kfree_rcu(hlist_array, rcu);
 	fp->hlist_array = NULL;
+	kfree(addrs);
 
-out:
-	mutex_unlock(&fprobe_mutex);
+	return 0;
+}
 
-	kfree(addrs);
-	return ret;
+/**
+ * unregister_fprobe() - Unregister fprobe.
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+	guard(mutex)(&fprobe_mutex);
+	if (!fp || !fprobe_registered(fp))
+		return -EINVAL;
+
+	return unregister_fprobe_nolock(fp);
 }
 EXPORT_SYMBOL_GPL(unregister_fprobe);
 


^ permalink raw reply related

* [PATCH v10 2/8] tracing/fprobe: Unregister fprobe even if memory allocation fails
From: Masami Hiramatsu (Google) @ 2026-04-20 14:00 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

unregister_fprobe() can fail under memory pressure because of memory
allocation failure, but this maybe called from module unloading, and
usually there is no way to retry it. Moreover. trace_fprobe does not
check the return value.

To fix this problem, unregister fprobe and fprobe_hash_node even if
working memory allocation fails.
Anyway, if the last fprobe is removed, the filter will be freed.

Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v9:
  - Clear ftrace_ops filter when unregister it.
 Changes in v7:
  - Newly added.
---
 kernel/trace/fprobe.c |   25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index af9ba7250874..a2b659006e0e 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -324,9 +324,10 @@ static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
 	lockdep_assert_held(&fprobe_mutex);
 
 	fprobe_ftrace_active--;
-	if (!fprobe_ftrace_active)
+	if (!fprobe_ftrace_active) {
 		unregister_ftrace_function(&fprobe_ftrace_ops);
-	if (num)
+		ftrace_free_filter(&fprobe_ftrace_ops);
+	} else if (num)
 		ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
 }
 
@@ -525,10 +526,10 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
 
 	fprobe_graph_active--;
 	/* Q: should we unregister it ? */
-	if (!fprobe_graph_active)
+	if (!fprobe_graph_active) {
 		unregister_ftrace_graph(&fprobe_graph_ops);
-
-	if (num)
+		ftrace_free_filter(&fprobe_graph_ops.ops);
+	} else if (num)
 		ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
 }
 
@@ -932,15 +933,19 @@ int unregister_fprobe(struct fprobe *fp)
 
 	hlist_array = fp->hlist_array;
 	addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
-	if (!addrs) {
-		ret = -ENOMEM;	/* TODO: Fallback to one-by-one loop */
-		goto out;
-	}
+	/*
+	 * This will remove fprobe_hash_node from the hash table even if
+	 * memory allocation fails. However, ftrace_ops will not be updated.
+	 * Anyway, when the last fprobe is unregistered, ftrace_ops is also
+	 * unregistered.
+	 */
+	if (!addrs)
+		pr_warn("Failed to allocate working array. ftrace_ops may not sync.\n");
 
 	/* Remove non-synonim ips from table and hash */
 	count = 0;
 	for (i = 0; i < hlist_array->size; i++) {
-		if (!delete_fprobe_node(&hlist_array->array[i]))
+		if (!delete_fprobe_node(&hlist_array->array[i]) && addrs)
 			addrs[count++] = hlist_array->array[i].addr;
 	}
 	del_fprobe_hash(fp);


^ permalink raw reply related

* [PATCH v10 1/8] tracing/fprobe: Reject registration of a registered fprobe before init
From: Masami Hiramatsu (Google) @ 2026-04-20 14:00 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177669363667.132053.12454670015890859277.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Reject registration of a registered fprobe which is on the fprobe
hash table before initializing fprobe.
The add_fprobe_hash() checks this re-register fprobe, but since
fprobe_init() clears hlist_array field, it is too late to check it.
It has to check the re-registration before touncing fprobe.

Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v6:
  - Newly added.
---
 kernel/trace/fprobe.c |   21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 56d145017902..af9ba7250874 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -4,6 +4,7 @@
  */
 #define pr_fmt(fmt) "fprobe: " fmt
 
+#include <linux/cleanup.h>
 #include <linux/err.h>
 #include <linux/fprobe.h>
 #include <linux/kallsyms.h>
@@ -107,7 +108,7 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 }
 
 /* Check existence of the fprobe */
-static bool is_fprobe_still_exist(struct fprobe *fp)
+static bool fprobe_registered(struct fprobe *fp)
 {
 	struct hlist_head *head;
 	struct fprobe_hlist *fph;
@@ -120,7 +121,7 @@ static bool is_fprobe_still_exist(struct fprobe *fp)
 	}
 	return false;
 }
-NOKPROBE_SYMBOL(is_fprobe_still_exist);
+NOKPROBE_SYMBOL(fprobe_registered);
 
 static int add_fprobe_hash(struct fprobe *fp)
 {
@@ -132,9 +133,6 @@ static int add_fprobe_hash(struct fprobe *fp)
 	if (WARN_ON_ONCE(!fph))
 		return -EINVAL;
 
-	if (is_fprobe_still_exist(fp))
-		return -EEXIST;
-
 	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
 	hlist_add_head_rcu(&fp->hlist_array->hlist, head);
 	return 0;
@@ -149,7 +147,7 @@ static int del_fprobe_hash(struct fprobe *fp)
 	if (WARN_ON_ONCE(!fph))
 		return -EINVAL;
 
-	if (!is_fprobe_still_exist(fp))
+	if (!fprobe_registered(fp))
 		return -ENOENT;
 
 	fph->fp = NULL;
@@ -480,7 +478,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
 		if (!fp)
 			break;
 		curr += FPROBE_HEADER_SIZE_IN_LONG;
-		if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
+		if (fprobe_registered(fp) && !fprobe_disabled(fp)) {
 			if (WARN_ON_ONCE(curr + size > size_words))
 				break;
 			fp->exit_handler(fp, trace->func, ret_ip, fregs,
@@ -839,12 +837,14 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 	struct fprobe_hlist *hlist_array;
 	int ret, i;
 
+	guard(mutex)(&fprobe_mutex);
+	if (fprobe_registered(fp))
+		return -EEXIST;
+
 	ret = fprobe_init(fp, addrs, num);
 	if (ret)
 		return ret;
 
-	mutex_lock(&fprobe_mutex);
-
 	hlist_array = fp->hlist_array;
 	if (fprobe_is_ftrace(fp))
 		ret = fprobe_ftrace_add_ips(addrs, num);
@@ -864,7 +864,6 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 				delete_fprobe_node(&hlist_array->array[i]);
 		}
 	}
-	mutex_unlock(&fprobe_mutex);
 
 	if (ret)
 		fprobe_fail_cleanup(fp);
@@ -926,7 +925,7 @@ int unregister_fprobe(struct fprobe *fp)
 	int ret = 0, i, count;
 
 	mutex_lock(&fprobe_mutex);
-	if (!fp || !is_fprobe_still_exist(fp)) {
+	if (!fp || !fprobe_registered(fp)) {
 		ret = -EINVAL;
 		goto out;
 	}


^ permalink raw reply related

* [PATCH v10 0/8] tracing/fprobe: Fix fprobe_ip_table related bugs
From: Masami Hiramatsu (Google) @ 2026-04-20 14:00 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel

Hi,

Here is the 10th version of fprobe bugfix series.
The previous version is here.

https://lore.kernel.org/all/177644266147.584467.8179035927318998910.stgit@mhiramat.tok.corp.google.com/

This version fixes minor bugs. Add an RCU sync when fprobe
registration failed [3/8], call __fprobe_f*_unregister()
if there is no node on hash table even if it fails to allocate
working memory [6/8]. And fix to check "normalized" module
name in error path [7/8].

Sashiko pointed other issues[1],

[1] https://sashiko.dev/#/patchset/177644266147.584467.8179035927318998910.stgit%40mhiramat.tok.corp.google.com

- Is there a missing RCU read-side critical section here?
  -> No, it is under preemption disabled. It seems a preempt
  disabling is stronger restriction for RCU read-side critical
  section.

- Does the error fallback path leave dangling pointers in the global hash
  table?
  Yes, but it is not introduced by this, and is fixed by [3/8].

Thank you!

Masami Hiramatsu (Google) (8):
      tracing/fprobe: Reject registration of a registered fprobe before init
      tracing/fprobe: Unregister fprobe even if memory allocation fails
      tracing/fprobe: Remove fprobe from hash in failure path
      tracing/fprobe: Avoid kcalloc() in rcu_read_lock section
      tracing/fprobe: Check the same type fprobe on table as the unregistered one
      tracing/fprobe: Fix to unregister ftrace_ops if it is empty on module unloading
      selftests/ftrace: Add a testcase for fprobe events on module
      selftests/ftrace: Add a testcase for multiple fprobe events


 kernel/trace/fprobe.c                              |  472 +++++++++++++-------
 .../test.d/dynevent/add_remove_fprobe_module.tc    |   87 ++++
 .../test.d/dynevent/add_remove_multiple_fprobe.tc  |   69 +++
 3 files changed, 466 insertions(+), 162 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_multiple_fprobe.tc


base-commit: e0a384434ae1bdfb03954c46c464e3dbd3223ad6
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH 7.2 v16 04/13] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: Usama Arif @ 2026-04-20 13:55 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-5-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:41 -0600 Nico Pache <npache@redhat.com> wrote:

> generalize the order of the __collapse_huge_page_* and collapse_max_*
> functions to support future mTHP collapse.
> 
> The current mechanism for determining collapse with the
> khugepaged_max_ptes_none value is not designed with mTHP in mind. This
> raises a key design issue: if we support user defined max_pte_none values
> (even those scaled by order), a collapse of a lower order can introduces
> an feedback loop, or "creep", when max_ptes_none is set to a value greater
> than HPAGE_PMD_NR / 2.
> 
> With this configuration, a successful collapse to order N will populate
> enough pages to satisfy the collapse condition on order N+1 on the next
> scan. This leads to unnecessary work and memory churn.
> 
> To fix this issue introduce a helper function that will limit mTHP
> collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
> This effectively supports two modes:
> 
> - max_ptes_none=0: never introduce new none-pages for mTHP collapse.
> - max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
>   available mTHP order.
> 
> This removes the possiblilty of "creep", while not modifying any uAPI
> expectations. A warning will be emitted if any non-supported
> max_ptes_none value is configured with mTHP enabled.
> 
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
> 
> No functional changes in this patch; however it defines future behavior
> for mTHP collapse.
> 
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 124 ++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 88 insertions(+), 36 deletions(-)
> 

Small nits. Most might not need change.

> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index f42b55421191..283bb63854a5 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -352,51 +352,86 @@ static bool pte_none_or_zero(pte_t pte)
>   * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
>   * @cc: The collapse control struct
>   * @vma: The vma to check for userfaultfd
> + * @order: The folio order being collapsed to
>   *
>   * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
> - * empty page.
> + * empty page. For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the
> + * configured khugepaged_max_ptes_none value.
> + *
> + * For mTHP collapses, we currently only support khugepaged_max_pte_none values
> + * of 0 or (KHUGEPAGED_MAX_PTES_LIMIT). Any other value will emit a warning and
> + * no mTHP collapse will be attempted
>   *
>   * Return: Maximum number of empty PTEs allowed for the collapse operation
>   */
> -static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
> -		struct vm_area_struct *vma)
> +static int collapse_max_ptes_none(struct collapse_control *cc,
> +		struct vm_area_struct *vma, unsigned int order)
>  {
>  	if (vma && userfaultfd_armed(vma))
>  		return 0;
>  	if (!cc->is_khugepaged)
>  		return HPAGE_PMD_NR;
> -	return khugepaged_max_ptes_none;
> +	if (is_pmd_order(order))
> +		return khugepaged_max_ptes_none;
> +	/* Zero/non-present collapse disabled. */
> +	if (!khugepaged_max_ptes_none)
> +		return 0;
> +	if (khugepaged_max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
> +		return (1 << order) - 1;
> +

There are 2 reads of khugepaged_max_ptes_none here.
A concurrent sysctl write between reads can yield "0 then non-zero" or "LIMIT
then mid-value".

Would be good to just snapshot once at the start of the function and use that
value?

> +	pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
> +		      KHUGEPAGED_MAX_PTES_LIMIT);

IMO, warn_once can get lost quickly in dmesg. Maybe pr_warn_ratelimited?

Not sure what others opinions are..

> +	return -EINVAL;
>  }
>  
>  /**
>   * collapse_max_ptes_shared - Calculate maximum allowed shared PTEs for collapse
>   * @cc: The collapse control struct
> + * @order: The folio order being collapsed to
>   *
>   * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
>   * shared page.
>   *
> + * For mTHP collapses, we currently dont support collapsing memory with
> + * shared memory.
> + *
>   * Return: Maximum number of shared PTEs allowed for the collapse operation
>   */
> -static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
> +static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
> +		unsigned int order)
>  {
>  	if (!cc->is_khugepaged)
>  		return HPAGE_PMD_NR;
> +	if (!is_pmd_order(order))
> +		return 0;
> +
>  	return khugepaged_max_ptes_shared;
>  }
>  
>  /**
>   * collapse_max_ptes_swap - Calculate maximum allowed swap PTEs for collapse
>   * @cc: The collapse control struct
> + * @order: The folio order being collapsed to
>   *
>   * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
>   * swap page.
>   *
> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> + * khugepaged_max_ptes_swap value.
> + *
> + * For mTHP collapses, we currently dont support collapsing memory with
> + * swapped out memory.
> + *
>   * Return: Maximum number of swap PTEs allowed for the collapse operation
>   */
> -static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
> +static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
> +		unsigned int order)
>  {
>  	if (!cc->is_khugepaged)
>  		return HPAGE_PMD_NR;
> +	if (!is_pmd_order(order))
> +		return 0;
> +
>  	return khugepaged_max_ptes_swap;
>  }
>  
> @@ -590,18 +625,22 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
>  
>  static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> -		struct list_head *compound_pagelist)
> +		unsigned int order, struct list_head *compound_pagelist)
>  {
> +	const unsigned long nr_pages = 1UL << order;
>  	struct page *page = NULL;
>  	struct folio *folio = NULL;
>  	unsigned long addr = start_addr;
>  	pte_t *_pte;
>  	int none_or_zero = 0, shared = 0, referenced = 0;
>  	enum scan_result result = SCAN_FAIL;
> -	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> -	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
> +	int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
> +	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
> +
> +	if (max_ptes_none < 0)
> +		return result;

Would a dedicated SCAN_INVALID_PTES_NONE make more sense here instead
of SCAN_FAIL?

>  
> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> +	for (_pte = pte; _pte < pte + nr_pages;
>  	     _pte++, addr += PAGE_SIZE) {
>  		pte_t pteval = ptep_get(_pte);
>  		if (pte_none_or_zero(pteval)) {
> @@ -734,18 +773,18 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  }
>  
>  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> -						struct vm_area_struct *vma,
> -						unsigned long address,
> -						spinlock_t *ptl,
> -						struct list_head *compound_pagelist)
> +		struct vm_area_struct *vma, unsigned long address,
> +		spinlock_t *ptl, unsigned int order,
> +		struct list_head *compound_pagelist)
>  {
> -	unsigned long end = address + HPAGE_PMD_SIZE;
> +	const unsigned long nr_pages = 1UL << order;
> +	unsigned long end = address + (PAGE_SIZE << order);
>  	struct folio *src, *tmp;
>  	pte_t pteval;
>  	pte_t *_pte;
>  	unsigned int nr_ptes;
>  
> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> +	for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
>  	     address += nr_ptes * PAGE_SIZE) {
>  		nr_ptes = 1;
>  		pteval = ptep_get(_pte);
> @@ -798,13 +837,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>  }
>  
>  static void __collapse_huge_page_copy_failed(pte_t *pte,
> -					     pmd_t *pmd,
> -					     pmd_t orig_pmd,
> -					     struct vm_area_struct *vma,
> -					     struct list_head *compound_pagelist)
> +		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> +		unsigned int order, struct list_head *compound_pagelist)
>  {
> +	const unsigned long nr_pages = 1UL << order;
>  	spinlock_t *pmd_ptl;
> -

Shouldn't remove the newline above?

>  	/*
>  	 * Re-establish the PMD to point to the original page table
>  	 * entry. Restoring PMD needs to be done prior to releasing
> @@ -818,7 +855,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>  	 * Release both raw and compound pages isolated
>  	 * in __collapse_huge_page_isolate.
>  	 */
> -	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> +	release_pte_pages(pte, pte + nr_pages, compound_pagelist);
>  }
>  
>  /*
> @@ -838,16 +875,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>   */
>  static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
>  		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> -		unsigned long address, spinlock_t *ptl,
> +		unsigned long address, spinlock_t *ptl, unsigned int order,
>  		struct list_head *compound_pagelist)
>  {
> +	const unsigned long nr_pages = 1UL << order;
>  	unsigned int i;
>  	enum scan_result result = SCAN_SUCCEED;
> -

Same here?

>  	/*
>  	 * Copying pages' contents is subject to memory poison at any iteration.
>  	 */
> -	for (i = 0; i < HPAGE_PMD_NR; i++) {
> +	for (i = 0; i < nr_pages; i++) {
>  		pte_t pteval = ptep_get(pte + i);
>  		struct page *page = folio_page(folio, i);
>  		unsigned long src_addr = address + i * PAGE_SIZE;
> @@ -866,10 +903,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *foli
>  
>  	if (likely(result == SCAN_SUCCEED))
>  		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> -						    compound_pagelist);
> +						    order, compound_pagelist);
>  	else
>  		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> -						 compound_pagelist);
> +						 order, compound_pagelist);
>  
>  	return result;
>  }
> @@ -1040,12 +1077,12 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
>   * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
>   */
>  static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
> -		struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
> -		int referenced)
> +		struct vm_area_struct *vma, unsigned long start_addr,
> +		pmd_t *pmd, int referenced, unsigned int order)

Will probably find out in later reviews, but there is tracepoint in __collapse_huge_page_swapin.
Would be good to add order in that tracepoint if you are adding order here?

>  {
>  	int swapped_in = 0;
>  	vm_fault_t ret = 0;
> -	unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> +	unsigned long addr, end = start_addr + (PAGE_SIZE << order);
>  	enum scan_result result;
>  	pte_t *pte = NULL;
>  	spinlock_t *ptl;
> @@ -1077,6 +1114,19 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
>  		    pte_present(vmf.orig_pte))
>  			continue;
>  
> +		/*
> +		 * TODO: Support swapin without leading to further mTHP
> +		 * collapses. Currently bringing in new pages via swapin may
> +		 * cause a future higher order collapse on a rescan of the same
> +		 * range.
> +		 */
> +		if (!is_pmd_order(order)) {

Would it be good to introduce this in the patch that activates it? No strong
preference btw. Just that its dead code in this patch itself.

> +			pte_unmap(pte);
> +			mmap_read_unlock(mm);
> +			result = SCAN_EXCEED_SWAP_PTE;
> +			goto out;
> +		}
> +
>  		vmf.pte = pte;
>  		vmf.ptl = ptl;
>  		ret = do_swap_page(&vmf);
> @@ -1196,7 +1246,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  		 * that case.  Continuing to collapse causes inconsistency.
>  		 */
>  		result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> -						     referenced);
> +						     referenced, HPAGE_PMD_ORDER);
>  		if (result != SCAN_SUCCEED)
>  			goto out_nolock;
>  	}
> @@ -1244,6 +1294,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>  	if (pte) {
>  		result = __collapse_huge_page_isolate(vma, address, pte, cc,
> +						      HPAGE_PMD_ORDER,
>  						      &compound_pagelist);
>  		spin_unlock(pte_ptl);
>  	} else {
> @@ -1274,6 +1325,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>  
>  	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
>  					   vma, address, pte_ptl,
> +					   HPAGE_PMD_ORDER,
>  					   &compound_pagelist);
>  	pte_unmap(pte);
>  	if (unlikely(result != SCAN_SUCCEED))
> @@ -1318,9 +1370,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  	unsigned long addr;
>  	spinlock_t *ptl;
>  	int node = NUMA_NO_NODE, unmapped = 0;
> -	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> -	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
> -	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
> +	int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
> +	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
> +	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
>  
>  	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
>  
> @@ -2371,8 +2423,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
>  	int present, swap;
>  	int node = NUMA_NO_NODE;
>  	enum scan_result result = SCAN_SUCCEED;
> -	unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
> -	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
> +	int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
> +	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
>  
>  	present = 0;
>  	swap = 0;
> -- 
> 2.53.0
> 
> 

^ permalink raw reply

* Re: [PATCH 0/3] mm: split the file's i_mmap tree for NUMA
From: Pedro Falcato @ 2026-04-20 13:48 UTC (permalink / raw)
  To: Huang Shijie
  Cc: Mateusz Guzik, akpm, viro, brauner, linux-mm, linux-kernel,
	linux-arm-kernel, linux-fsdevel, muchun.song, osalvador,
	linux-trace-kernel, linux-perf-users, linux-parisc, nvdimm,
	zhongyuan, fangbaoshun, yingzhiwei
In-Reply-To: <aeWLCxru6cLWsxvQ@SH-HV00110.Hygon.cn>

BTW you're missing _a lot_ of CC's here, including the whole of mm/rmap.c
maintainership.

On Mon, Apr 20, 2026 at 10:10:19AM +0800, Huang Shijie wrote:
> On Mon, Apr 13, 2026 at 05:33:21PM +0200, Mateusz Guzik wrote:
> > On Mon, Apr 13, 2026 at 02:20:39PM +0800, Huang Shijie wrote:
> > >   In NUMA, there are maybe many NUMA nodes and many CPUs.
> > > For example, a Hygon's server has 12 NUMA nodes, and 384 CPUs.
> > > In the UnixBench tests, there is a test "execl" which tests
> > > the execve system call.
> > > 
> > >   When we test our server with "./Run -c 384 execl",
> > > the test result is not good enough. The i_mmap locks contended heavily on
> > > "libc.so" and "ld.so". For example, the i_mmap tree for "libc.so" can have 
> > > over 6000 VMAs, all the VMAs can be in different NUMA mode.
> > > The insert/remove operations do not run quickly enough.
> > > 
> > > patch 1 & patch 2 are try to hide the direct access of i_mmap.
> > > patch 3 splits the i_mmap into sibling trees, and we can get better 
> > > performance with this patch set:
> > >     we can get 77% performance improvement(10 times average)
> > > 
> > 
> > To my reading you kept the lock as-is and only distributed the protected
> > state.
> > 
> > While I don't doubt the improvement, I'm confident should you take a
> > look at the profile you are going to find this still does not scale with
> > rwsem being one of the problems (there are other global locks, some of
> > which have experimental patches for).
> > 
> > Apart from that this does nothing to help high core systems which are
> > all one node, which imo puts another question mark on this specific
> > proposal.
> > 
> > Of course one may question whether a RB tree is the right choice here,
> > it may be the lock-protected cost can go way down with merely a better
> > data structure.
> > 
> > Regardless of that, for actual scalability, there will be no way around
> > decentralazing locking around this and partitioning per some core count
> > (not just by numa awareness).
> > 
> > Decentralizing locking is definitely possible, but I have not looked
> > into specifics of how problematic it is. Best case scenario it will
> > merely with separate locks. Worst case scenario something needs a fully
> > stabilized state for traversal, in that case another rw lock can be
> > slapped around this, creating locking order read lock -> per-subset
> > write lock -- this will suffer scalability due to the read locking, but
> > it will still scale drastically better as apart from that there will be
> > no serialization. In this setting the problematic consumer will write
> > lock the new thing to stabilize the state.
> > 
> I thought over again.
> I can change this patch set to support the non-NUMA case by:
>   1.) Still use one rw lock.

No. This doesn't help anything.

>   2.) For NUMA, keep the patch set as it is.

Please no. No NUMA vs non-NUMA case.

>   3.) For non-NUMA case, split the i_mmap tree to several subtrees.
>       For example, if a machine has 192 CPUs, split the 32 CPUs as a tree.

If lock contention is the problem, I don't see how splitting the tree helps,
unless it helps reduce lock hold time in a way that randomly helps your workload.
But that's entirely random.

> 
> So extend the patch set to support both the NUMA and non-NUMA machines.

FYI I've discussed some concrete ideas for reworking file rmap with Mateusz.
I'll be giving them a shot. Note that this needs to be done _carefully_,
particularly as there are some hidden assumptions wrt forking that aren't
very clear as to how they work[1].

[1] https://lore.kernel.org/all/bnukmnuxxuhdfeasjz33miemgr7w35c4aa6pqdmgupx7oxmeeb@gozgc3yxhcdd/
-- 
Pedro

^ permalink raw reply

* [PATCH AUTOSEL 7.0-6.18] rtla: Handle pthread_create() failure properly
From: Sasha Levin @ 2026-04-20 13:19 UTC (permalink / raw)
  To: patches, stable
  Cc: Wander Lairson Costa, Tomas Glozar, Sasha Levin, rostedt,
	linux-trace-kernel, linux-kernel
In-Reply-To: <20260420132314.1023554-1-sashal@kernel.org>

From: Wander Lairson Costa <wander@redhat.com>

[ Upstream commit d847188bb92b14518a04d7542e44928a22060847 ]

Add proper error handling when pthread_create() fails to create the
timerlat user-space dispatcher thread. Previously, the code only logged
an error message but continued execution, which could lead to undefined
behavior when the tool later expects the thread to be running.

When pthread_create() returns an error, the function now jumps to the
out_trace error path to properly clean up resources and exit. This
ensures consistent error handling and prevents the tool from running
in an invalid state without the required user-space thread.

Signed-off-by: Wander Lairson Costa <wander@redhat.com>
Link: https://lore.kernel.org/r/20260309195040.1019085-10-wander@redhat.com
Signed-off-by: Tomas Glozar <tglozar@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---

LLM Generated explanations, may be completely bogus:

Now let me compile the full analysis.

---

## Phase 1: Commit Message Forensics

**Step 1.1: Subject Line**
- Subsystem: `rtla` (Real-Time Linux Analysis tool, under
  `tools/tracing/rtla`)
- Action verb: "Handle" (strongly suggests fixing incorrect handling - a
  bug fix pattern)
- Summary: Add proper error handling when pthread_create() fails for the
  timerlat user-space dispatcher thread.

Record: [rtla] [Handle] [Add missing error path when pthread_create
fails]

**Step 1.2: Tags**
- `Signed-off-by: Wander Lairson Costa <wander@redhat.com>` - author,
  Red Hat engineer
- `Link:
  https://lore.kernel.org/r/20260309195040.1019085-10-wander@redhat.com`
  - part of a larger series (patch 10 of a series)
- `Signed-off-by: Tomas Glozar <tglozar@redhat.com>` - maintainer sign-
  off
- No Fixes: tag (expected for autosel candidates)
- No Reported-by: (found by code review, not user report)
- No Cc: stable (expected)

Record: Author is Wander (Red Hat, active rtla contributor). Part of a
larger series (patch 10). Accepted by Tomas Glozar (rtla maintainer).

**Step 1.3: Body Text**
The commit message clearly describes: when `pthread_create()` fails, the
code only logged an error but continued execution. This leads to the
tool running in an invalid state where it expects user-space threads
that don't exist.

Record: Bug = missing error exit on pthread_create failure. Symptom =
tool runs without required user-space thread. Root cause = missing `goto
out_trace` on error path.

**Step 1.4: Hidden Bug Fix Detection**
"Handle ... properly" is a classic bug-fix pattern. This IS a bug fix -
it adds a missing error exit path.

Record: Yes, this is a clear bug fix despite not using the word "fix" in
the subject.

## Phase 2: Diff Analysis

**Step 2.1: Changes Inventory**
- 1 file modified: `tools/tracing/rtla/src/common.c`
- Net change: +3 lines / -1 line (added braces + `goto out_trace;`)
- Function modified: `run_tool()`
- Scope: Single-file, surgical fix

**Step 2.2: Code Flow Change**
Before: `pthread_create()` failure logged an error message but execution
continued to `ops->enable(tool)`, `ops->main(tool)`, etc.
After: `pthread_create()` failure logs error and jumps to `out_trace`
for proper cleanup and exit.

**Step 2.3: Bug Mechanism**
Category: (a) Error path fix. The code was missing a `goto` to the error
cleanup path when `pthread_create()` failed. Without it, the tool runs
without the user-space timerlat threads, producing incorrect/misleading
measurements.

**Step 2.4: Fix Quality**
- Obviously correct: follows the identical pattern used by all other
  error checks in the same function (lines 247, 253, 280, 287)
- Minimal/surgical: only adds braces and a `goto`
- Regression risk: extremely low - only changes behavior when
  `pthread_create()` fails (which is already an error condition)

Record: Fix is obviously correct, minimal, and consistent with
surrounding code patterns. No regression risk.

## Phase 3: Git History Investigation

**Step 3.1: Blame**
The buggy code (lines 257-276) was introduced by commit `2f3172f9dd58cc`
("tools/rtla: Consolidate code between osnoise/timerlat and hist/top")
by Crystal Wood, September 2025. However, tracing further back, the
original missing error handling existed since commit `cdca4f4e5e8ea`
("rtla/timerlat_top: Add timerlat user-space support") by Daniel Bristot
de Oliveira, June 2023 (v6.5-rc1).

Record: Bug introduced in v6.5-rc1, present in all stable trees from
6.6.y onward. The consolidation commit just carried the bug forward into
`common.c`.

**Step 3.2: Fixes Tag**
No Fixes: tag present (expected for autosel candidates). The bug
logically traces to `cdca4f4e5e8ea` (v6.5-rc1).

**Step 3.3: File History**
The file has been actively developed. Recent commits include
consolidations of option parsing, volatile fix for stop_tracing, and
other improvements. The author (Wander Lairson Costa) is a prolific
contributor to rtla.

**Step 3.4: Author**
Wander has at least 17 commits in rtla (including multiple fixes like
NULL pointer dereference fix, parse return value doc fix, volatile fix).
He is a regular contributor and maintainer-level contributor for rtla.

Record: Author is a regular, trusted contributor to this subsystem.

**Step 3.5: Dependencies**
The `run_tool()` function and the `out_trace` label already exist in the
7.0 tree. No dependencies needed. However, the `run_tool()` function
only exists since the consolidation commit `2f3172f9dd58cc` (~v6.18
cycle). In older stable trees (6.6.y, 6.12.y), the same fix would need
to target `timerlat_top.c` and `timerlat_hist.c` instead.

Record: For 7.0.y, applies standalone with no dependencies. For older
trees, would need different patches.

## Phase 4: Mailing List and External Research

**Step 4.1-4.2: Patch Discussion**
The commit's Link tag shows it's patch 10 of a series (Message-ID
`20260309195040.1019085-10-wander@redhat.com`). Lore.kernel.org was
blocked by anti-bot protection, but b4 dig confirmed the author's other
patches in the same series (e.g., `20260106133655.249887-16` for the
volatile fix). The patch was accepted and signed off by maintainer Tomas
Glozar.

Record: Part of a larger cleanup/fix series. Accepted by rtla
maintainer.

**Step 4.3-4.5: Bug Report / Stable Discussion**
No explicit bug report found. This appears to be found by code
review/audit, not by a user hitting it in practice.

Record: No user reports. Found by code inspection.

## Phase 5: Code Semantic Analysis

**Step 5.1: Functions Modified**
Only `run_tool()` in `common.c`.

**Step 5.2: Callers**
`run_tool()` is the unified entry point for all rtla tool modes (osnoise
top/hist, timerlat top/hist). It's called from each tool's main
function.

**Step 5.3-5.4: Call Chain**
When `pthread_create()` fails and execution continues:
1. `ops->enable(tool)` - enables tracing infrastructure
2. `ops->main(tool)` - runs main measurement loop (top_main_loop or
   hist_main_loop)
3. Both main loops check `params->user.stopped_running` to detect if
   user threads died
4. Since threads were never created, `stopped_running` stays at 0, so
   the tool thinks threads are still running
5. The tool produces measurements and statistics without user-space
   thread contributions

**Step 5.5: Similar Patterns**
The original code in `timerlat_top.c` and `timerlat_hist.c` (pre-
consolidation) had the identical missing error handling pattern,
confirming this is a systematic bug.

## Phase 6: Cross-Referencing and Stable Tree Analysis

**Step 6.1: Buggy Code in Stable**
The `run_tool()` function in `common.c` only exists since ~v6.18 cycle.
In 7.0.y, the code exists as-is and the patch applies cleanly. For older
stable trees, different patches targeting `timerlat_top.c` and
`timerlat_hist.c` would be needed.

**Step 6.2: Backport Complications**
For 7.0.y: clean apply expected - no conflicts.

**Step 6.3: Related Fixes**
No other fix for this specific issue found in stable.

## Phase 7: Subsystem and Maintainer Context

**Step 7.1: Subsystem**
`tools/tracing/rtla` - userspace real-time latency analysis tool.
Criticality: PERIPHERAL (userspace tool, not kernel code), but important
for real-time system validation.

**Step 7.2: Activity**
Very actively developed - 14+ commits since the consolidation.

## Phase 8: Impact and Risk Assessment

**Step 8.1: Who Is Affected**
Users of the rtla timerlat tool with `--user-threads` option,
specifically when `pthread_create()` fails.

**Step 8.2: Trigger Conditions**
Rare - requires `--user-threads` mode AND `pthread_create()` failure
(typically due to resource exhaustion or system limits).

**Step 8.3: Failure Mode Severity**
When triggered: tool continues running in invalid state, producing
measurements without user-space thread contributions. For a real-time
analysis tool, this means **silently incorrect results** (the error
message is printed but could be missed). Severity: MEDIUM (incorrect
tool output, not kernel crash/corruption).

**Step 8.4: Risk-Benefit Ratio**
- BENEFIT: Moderate - prevents misleading latency measurements
- RISK: Very low - 2-line change to an error path only triggered on
  failure
- Ratio: Favorable

## Phase 9: Final Synthesis

**Step 9.1: Evidence**
FOR backporting:
- Fixes a real bug (missing error exit leads to invalid tool state)
- Small and surgical (3 lines changed)
- Obviously correct (follows identical pattern in same function)
- Very low regression risk
- Author is trusted contributor; accepted by maintainer
- Bug existed since v6.5

AGAINST backporting:
- Userspace tool, not kernel code
- Trigger condition is rare (pthread_create failure)
- No user reports of actually hitting this bug
- MEDIUM severity (incorrect tool output, not crash/corruption)

**Step 9.2: Stable Rules Checklist**
1. Obviously correct and tested? **YES** - trivially correct
2. Fixes a real bug? **YES** - tool runs without required threads
3. Important issue? **MEDIUM** - incorrect measurements, not crash
4. Small and contained? **YES** - 3 lines in 1 file
5. No new features? **YES** - pure error handling fix
6. Can apply to stable? **YES** for 7.0.y

**Step 9.3: Exception Categories**
Not an exception category - standard bug fix.

**Step 9.4: Decision**
The fix is small, obviously correct, and prevents the rtla tool from
operating in an invalid state. While it's a userspace tool and the
trigger is rare, the fix has essentially zero regression risk and
improves correctness. The pattern of missing error handling on
`pthread_create()` failure is a clear bug.

## Verification

- [Phase 1] Parsed tags: Link to patch 10 of a series, SOBs from author
  and maintainer
- [Phase 2] Diff analysis: +3/-1 lines, adds `goto out_trace` to error
  path in `run_tool()`
- [Phase 3] git blame: buggy code carried from `cdca4f4e5e8ea`
  (v6.5-rc1) through consolidation `2f3172f9dd58cc`
- [Phase 3] git show 2f3172f9dd58cc: confirmed consolidation commit
  created `run_tool()` carrying the bug
- [Phase 3] Checked pre-consolidation files: both `timerlat_top.c` and
  `timerlat_hist.c` had identical missing error handling
- [Phase 4] b4 dig: confirmed author's series via `af2962d68b970` match
- [Phase 4] Lore blocked by anti-bot; could not read full thread
  discussion
- [Phase 5] Traced `run_tool()` flow: after failed pthread_create, tool
  continues to enable/main/stats without user threads
- [Phase 5] Verified `out_trace` cleanup path exists and is used by
  other error checks in same function
- [Phase 6] Code exists in 7.0.y (run_tool in common.c); older trees
  have equivalent code in different files
- [Phase 8] Failure mode: tool produces results without user-space
  threads, severity MEDIUM
- UNVERIFIED: Could not read full mailing list thread due to lore anti-
  bot protection

**YES**

 tools/tracing/rtla/src/common.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/tracing/rtla/src/common.c b/tools/tracing/rtla/src/common.c
index ceff76a62a30b..68426ce6f9971 100644
--- a/tools/tracing/rtla/src/common.c
+++ b/tools/tracing/rtla/src/common.c
@@ -271,8 +271,10 @@ int run_tool(struct tool_ops *ops, int argc, char *argv[])
 		params->user.cgroup_name = params->cgroup_name;
 
 		retval = pthread_create(&user_thread, NULL, timerlat_u_dispatcher, &params->user);
-		if (retval)
+		if (retval) {
 			err_msg("Error creating timerlat user-space threads\n");
+			goto out_trace;
+		}
 	}
 
 	retval = ops->enable(tool);
-- 
2.53.0


^ permalink raw reply related

* [PATCH] tracing: branch: Fix inverted check on stat tracer registration
From: Breno Leitao @ 2026-04-20 13:25 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Ingo Molnar,
	Frederic Weisbecker
  Cc: Steven Rostedt, linux-kernel, linux-trace-kernel, paulmck,
	kernel-team, Breno Leitao

init_annotated_branch_stats() and all_annotated_branch_stats() check the
return value of register_stat_tracer() with "if (!ret)", but
register_stat_tracer() returns 0 on success and a negative errno on
failure. The inverted check causes the warning to be printed on every
successful registration, e.g.:

  Warning: could not register annotated branches stats

while leaving real failures silent. The initcall also returned a
hard-coded 1 instead of the actual error.

Invert the check and propagate ret so that the warning fires on real
errors and the initcall reports the correct status.

Fixes: 002bb86d8d42 ("tracing/ftrace: separate events tracing and stats tracing engine")
Signed-off-by: Breno Leitao <leitao@debian.org>
---
 kernel/trace/trace_branch.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6809b370e991d..d1564db95a8f5 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -373,10 +373,10 @@ __init static int init_annotated_branch_stats(void)
 	int ret;
 
 	ret = register_stat_tracer(&annotated_branch_stats);
-	if (!ret) {
+	if (ret) {
 		printk(KERN_WARNING "Warning: could not register "
 				    "annotated branches stats\n");
-		return 1;
+		return ret;
 	}
 	return 0;
 }
@@ -438,10 +438,10 @@ __init static int all_annotated_branch_stats(void)
 	int ret;
 
 	ret = register_stat_tracer(&all_branch_stats);
-	if (!ret) {
+	if (ret) {
 		printk(KERN_WARNING "Warning: could not register "
 				    "all branches stats\n");
-		return 1;
+		return ret;
 	}
 	return 0;
 }

---
base-commit: c7275b05bc428c7373d97aa2da02d3a7fa6b9f66
change-id: 20260420-tracing-3f1367ee4b93

Best regards,
--  
Breno Leitao <leitao@debian.org>


^ permalink raw reply related

* [PATCH AUTOSEL 6.18] btrfs: tracepoints: get correct superblock from dentry in event btrfs_sync_file()
From: Sasha Levin @ 2026-04-20 13:08 UTC (permalink / raw)
  To: patches, stable
  Cc: Goldwyn Rodrigues, Boris Burkov, Goldwyn Rodrigues, David Sterba,
	Sasha Levin, clm, rostedt, mhiramat, linux-btrfs, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260420131539.986432-1-sashal@kernel.org>

From: Goldwyn Rodrigues <rgoldwyn@suse.de>

[ Upstream commit a85b46db143fda5869e7d8df8f258ccef5fa1719 ]

If overlay is used on top of btrfs, dentry->d_sb translates to overlay's
super block and fsid assignment will lead to a crash.

Use file_inode(file)->i_sb to always get btrfs_sb.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---

LLM Generated explanations, may be completely bogus:

Error: Failed to generate final synthesis

 include/trace/events/btrfs.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 125bdc166bfed..0864700f76e0a 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -769,12 +769,15 @@ TRACE_EVENT(btrfs_sync_file,
 	),
 
 	TP_fast_assign(
-		const struct dentry *dentry = file->f_path.dentry;
-		const struct inode *inode = d_inode(dentry);
+		struct dentry *dentry = file_dentry(file);
+		struct inode *inode = file_inode(file);
+		struct dentry *parent = dget_parent(dentry);
+		struct inode *parent_inode = d_inode(parent);
 
-		TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb));
+		dput(parent);
+		TP_fast_assign_fsid(btrfs_sb(inode->i_sb));
 		__entry->ino		= btrfs_ino(BTRFS_I(inode));
-		__entry->parent		= btrfs_ino(BTRFS_I(d_inode(dentry->d_parent)));
+		__entry->parent		= btrfs_ino(BTRFS_I(parent_inode));
 		__entry->datasync	= datasync;
 		__entry->root_objectid	= btrfs_root_id(BTRFS_I(inode)->root);
 	),
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH 7.2 v16 03/13] mm/khugepaged: rework max_ptes_* handling with helper functions
From: Usama Arif @ 2026-04-20 13:15 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-4-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:40 -0600 Nico Pache <npache@redhat.com> wrote:

> The following cleanup reworks all the max_ptes_* handling into helper
> functions. This increases the code readability and will later be used to
> implement the mTHP handling of these variables.
> 
> With these changes we abstract all the madvise_collapse() special casing
> (dont respect the sysctls) away from the functions that utilize them. And
> will later in this series to cleanly restrict mTHP collapses behaviors.
> 
> Suggested-by: David Hildenbrand <david@kernel.org>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 114 +++++++++++++++++++++++++++++++++---------------
>  1 file changed, 78 insertions(+), 36 deletions(-)
> 

The old code re-read khugepaged_max_ptes_* on every loop iteration; the new
code snapshots them once per scan call. If userspace writes the sysctl
mid-scan, old behavior reacted within the scan, new behavior uses the value
sampled at entry. This is completely ok IMO, but might be good to call out.

Also might be good to write no functional change intended apart from
above in the commit message?

Acked-by: Usama Arif <usama.arif@linux.dev>


^ permalink raw reply

* [PATCH AUTOSEL 6.18] tracing/probe: reject non-closed empty immediate strings
From: Sasha Levin @ 2026-04-20 13:08 UTC (permalink / raw)
  To: patches, stable
  Cc: Pengpeng Hou, Steven Rostedt (Google), Masami Hiramatsu (Google),
	Sasha Levin, linux-kernel, linux-trace-kernel
In-Reply-To: <20260420131539.986432-1-sashal@kernel.org>

From: Pengpeng Hou <pengpeng@iscas.ac.cn>

[ Upstream commit 4346be6577aaa04586167402ae87bbdbe32484a4 ]

parse_probe_arg() accepts quoted immediate strings and passes the body
after the opening quote to __parse_imm_string(). That helper currently
computes strlen(str) and immediately dereferences str[len - 1], which
underflows when the body is empty and not closed with double-quotation.

Reject empty non-closed immediate strings before checking for the closing quote.

Link: https://lore.kernel.org/all/20260401160315.88518-1-pengpeng@iscas.ac.cn/

Fixes: a42e3c4de964 ("tracing/probe: Add immediate string parameter support")
Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---

LLM Generated explanations, may be completely bogus:

Error: Failed to generate final synthesis

 kernel/trace/trace_probe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5cbdc423afebc..d7adbf1536c8b 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1068,7 +1068,7 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
 {
 	size_t len = strlen(str);
 
-	if (str[len - 1] != '"') {
+	if (!len || str[len - 1] != '"') {
 		trace_probe_log_err(offs + len, IMMSTR_NO_CLOSE);
 		return -EINVAL;
 	}
-- 
2.53.0


^ permalink raw reply related

* [PATCH AUTOSEL 7.0-6.12] ring-buffer: Enforce read ordering of trace_buffer cpumask and buffers
From: Sasha Levin @ 2026-04-20 13:07 UTC (permalink / raw)
  To: patches, stable
  Cc: Vincent Donnefort, Steven Rostedt (Google), Sasha Levin, mhiramat,
	linux-kernel, linux-trace-kernel
In-Reply-To: <20260420131539.986432-1-sashal@kernel.org>

From: Vincent Donnefort <vdonnefort@google.com>

[ Upstream commit 20ad8b0888be392eb2c4c3654805eb8594952373 ]

On CPU hotplug, if it is the first time a trace_buffer sees a CPU, a
ring_buffer_per_cpu will be allocated and its corresponding bit toggled
in the cpumask. Many readers check this cpumask to know if they can
safely read the ring_buffer_per_cpu but they are doing so without memory
ordering and may observe the cpumask bit set while having NULL buffer
pointer.

Enforce the memory read ordering by sending an IPI to all online CPUs.
The hotplug path is a slow-path anyway and it saves us from adding read
barriers in numerous call sites.

Link: https://patch.msgid.link/20260401053659.3458961-1-vdonnefort@google.com
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Suggested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---

LLM Generated explanations, may be completely bogus:

Based on my comprehensive analysis, I have sufficient information to
make a decision. Let me now lay out the full analysis following the
9-phase workflow.

---

## PHASE 1: COMMIT MESSAGE FORENSICS

**Step 1.1 - Subject Line Parse:**
Record: Subsystem=`ring-buffer`, action verb=`Enforce`, summary=`Enforce
read ordering of trace_buffer cpumask and buffers`. "Enforce" implies
fixing a missing ordering guarantee = bug fix.

**Step 1.2 - Tags:**
Record:
- Link: https://patch.msgid.link/20260401053659.3458961-1-
  vdonnefort@google.com (lore discussion)
- Signed-off-by: Vincent Donnefort (author)
- Suggested-by: Steven Rostedt (Google) - tracing maintainer suggested
  the fix approach
- Signed-off-by: Steven Rostedt - applied via maintainer tree
- No Fixes: tag
- No Cc: stable tag
- No Reported-by tag

**Step 1.3 - Body Analysis:**
Record: Bug described is a memory ordering issue. On CPU hotplug,
`trace_rb_cpu_prepare()` allocates a `ring_buffer_per_cpu` and sets the
corresponding cpumask bit. Readers on other CPUs check the cpumask
first, then access `buffer->buffers[cpu]`. Without read ordering, on
weakly-ordered architectures a reader may observe the cpumask bit set
while still seeing NULL for `buffer->buffers[cpu]`, causing NULL pointer
dereference. Fix uses IPI + barrier trick to force ordering across all
CPUs. Language "may observe" indicates defensive/analytical fix rather
than reported crash.

**Step 1.4 - Hidden Fix Detection:**
Record: This IS clearly a bug fix (memory ordering/race fix), though
without explicit "fix" in subject. The word "Enforce" indicates adding a
missing ordering guarantee - a classic race fix pattern.

---

## PHASE 2: DIFF ANALYSIS

**Step 2.1 - Inventory:**
Record: Single file `kernel/trace/ring_buffer.c`, +18 -1 lines. New
function `rb_cpu_sync()` (6 lines). Modified `trace_rb_cpu_prepare()`
(replace single `smp_wmb()` with conditional IPI + wmb). Surgical fix.

**Step 2.2 - Code Flow:**
Record:
- Before: `rb_allocate_cpu_buffer()` -> `smp_wmb()` ->
  `cpumask_set_cpu()`. Writer-side ordering only.
- After: `rb_allocate_cpu_buffer()` -> `on_each_cpu(rb_cpu_sync, ...)`
  (IPI to all online CPUs, each executes `smp_rmb()`) -> `smp_wmb()` ->
  `cpumask_set_cpu()`. Forces synchronization on reader CPUs.
- `if (unlikely(system_state == SYSTEM_RUNNING))` guard avoids IPI
  during early boot when IPI infrastructure may not be ready.

**Step 2.3 - Bug Mechanism:**
Record: Category (b) Synchronization/race condition fix. Specifically:
data race between writer (trace_rb_cpu_prepare) and many readers/writers
on other CPUs. On weakly-ordered CPUs (ARM, ARM64, PowerPC), readers can
observe the cpumask bit set before seeing the newly-allocated buffer
pointer, leading to NULL dereference. The IPI mechanism acts as a cross-
CPU barrier: after `on_each_cpu()` returns synchronously, every online
CPU has executed `smp_rmb()` via the IPI handler. Any subsequent
cpumask_set_cpu write becomes visible only after that sync, ensuring any
reader observing the bit also observes the buffer pointer.

**Step 2.4 - Fix Quality:**
Record: Fix is conceptually elegant and correct. Uses a well-known
kernel pattern (IPI-as-barrier). Only affects the slow CPU hotplug path.
Low regression risk - the IPI is guarded against early-boot execution,
and CPU hotplug is inherently rare. The alternative (adding `smp_rmb()`
to 30+ callsites) would be much more invasive and error-prone.

---

## PHASE 3: GIT HISTORY

**Step 3.1 - Blame:**
Record: The `smp_wmb()` + `cpumask_set_cpu` pattern was introduced by
commit `b32614c03413f8` "tracing/rb: Convert to hotplug state machine"
by Sebastian Andrzej Siewior, dated Nov 27, 2016 (v4.10). This code has
been present in all stable trees since v4.10.

**Step 3.2 - Fixes: Tag Follow-up:**
Record: No Fixes: tag. The buggy code (the missing read barrier on
reader side) has been present since 2016. The root cause is systemic -
many reader callsites were added over the years without matching
smp_rmb().

**Step 3.3 - File History:**
Record: `kernel/trace/ring_buffer.c` is actively maintained (578
commits, recent activity). No prerequisite patches needed. Standalone
fix (v1 only).

**Step 3.4 - Author Context:**
Record: Vincent Donnefort is a regular ring-buffer contributor (6+
commits to ring_buffer.c in 2024). Steven Rostedt (tracing maintainer)
suggested the approach. Both are highly credentialed.

**Step 3.5 - Dependencies:**
Record: None. Self-contained fix. `on_each_cpu`, `smp_rmb`, `smp_wmb`,
`system_state`/`SYSTEM_RUNNING` are all long-standing kernel primitives
available in all stable trees.

---

## PHASE 4: MAILING LIST

**Step 4.1 - Patch Discussion:**
Record: `b4 dig -c 20ad8b0888be3` returned lore URL https://lore.kernel.
org/all/20260401053659.3458961-1-vdonnefort@google.com/. Thread contains
only the single patch submission (v1). No review comments, no NAKs, no
"Cc: stable" suggestions. Thread mbox has just the submission message.

**Step 4.2 - Reviewers:**
Record: To: rostedt@goodmis.org, mhiramat@kernel.org,
mathieu.desnoyers@efficios.com, linux-trace-kernel@vger.kernel.org. Cc:
kernel-team@android.com, linux-kernel@vger.kernel.org. All relevant
maintainers CC'd.

**Step 4.3 - Bug Report:**
Record: No bug report linked. No Reported-by. This is a defensive fix
based on code analysis. Web fetch of lore was blocked by Anubis bot
protection.

**Step 4.4 - Related Patches:**
Record: `b4 dig -c -a` confirmed only v1 exists. Standalone single
patch, not part of a series.

**Step 4.5 - Stable Discussion:**
Record: Not searched separately; no Cc: stable in the submission
indicates the author/maintainer did not explicitly nominate it for
stable.

---

## PHASE 5: CODE SEMANTIC ANALYSIS

**Step 5.1 - Key Functions:**
Record: `trace_rb_cpu_prepare()` (modified), `rb_cpu_sync()` (new).

**Step 5.2 - Callers of `trace_rb_cpu_prepare`:**
Record: Registered as CPU hotplug callback via
`cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, ...)`. Called on CPU
online transition for each `trace_buffer` registered. Runs in kernel
context on the control CPU (not the target CPU).

**Step 5.3 - Readers Affected (impact surface):**
Record: `grep cpumask_test_cpu.*buffer->cpumask` in ring_buffer.c shows
30+ callsites that use the pattern `if (!cpumask_test_cpu(cpu,
buffer->cpumask)) return; cpu_buffer = buffer->buffers[cpu];`. Key hot
paths include:
- `ring_buffer_lock_reserve()` line 4716 - the main WRITE path, called
  from every trace event
- `ring_buffer_peek()` line 5874 (reader)
- `ring_buffer_record_disable_cpu/enable_cpu` - userspace-reachable via
  tracefs
- `rb_wait()` via userspace read of per-CPU trace_pipe_raw

These are user-reachable from tracefs and syscall paths.

**Step 5.4 - Bug Reachability:**
Record: User-reachable. CPU hotplug is triggered by:
- Boot (initial CPU online)
- Suspend/resume cycle (all CPUs brought down/up)
- `/sys/devices/system/cpu/cpuX/online` toggling by root
- VM CPU hotplug on cloud instances

Concurrent readers on other CPUs during hotplug could race with the
buffer allocation. On weakly-ordered archs (ARM64 very common), the race
is realistic.

**Step 5.5 - Similar Patterns:**
Record: The IPI-as-barrier pattern is used elsewhere in the kernel
(e.g., `kernel/time/hrtimer.c:958` `on_each_cpu(retrigger_next_event,
NULL, 1)`). It's a well-established idiom.

---

## PHASE 6: STABLE TREE ANALYSIS

**Step 6.1 - Buggy Code in Stable:**
Record: Confirmed by inspecting archived versions - the exact
`smp_wmb(); cpumask_set_cpu(cpu, buffer->cpumask);` pattern exists in
v6.1, v6.6, v6.12, v6.18 (all active stable trees) and has been there
since v4.10. Many readers lacking `smp_rmb()` are also present across
all those versions.

**Step 6.2 - Backport Complications:**
Record: The patch context is simple and stable across versions. The line
numbers differ but the exact 2-line context
(`smp_wmb();\n\tcpumask_set_cpu(cpu, buffer->cpumask);`) is unchanged
across all examined stable trees. Should apply cleanly with fuzz.

**Step 6.3 - Related Fixes in Stable:**
Record: No prior related fix found. Searched for memory ordering/barrier
commits in kernel/trace/ - no matches targeting this specific ordering
issue.

---

## PHASE 7: SUBSYSTEM CONTEXT

**Step 7.1 - Subsystem/Criticality:**
Record: `kernel/trace/ring_buffer.c` is the core ring buffer backing
tracefs/ftrace/perf/BPF. Criticality = IMPORTANT (affects all users of
tracing - kernel developers, security/observability tools, distros using
ftrace).

**Step 7.2 - Subsystem Activity:**
Record: Very active - ~578 commits to ring_buffer.c, recent activity in
2024-2026. Well-maintained with careful attention to correctness.

---

## PHASE 8: IMPACT AND RISK

**Step 8.1 - Affected Population:**
Record: Users of tracing (ftrace, perf, BPF) on systems with CPU
hotplug, especially on weakly-ordered architectures (ARM64 - ubiquitous
in cloud, mobile, embedded; PowerPC - servers). Any Linux system where
tracing is active during suspend/resume is potentially affected.

**Step 8.2 - Trigger Conditions:**
Record: Requires:
1. CPU hotplug event bringing online a CPU not previously seen by a
   trace_buffer
2. Concurrent tracing activity on other CPUs
3. Weakly-ordered CPU architecture (or unlucky timing on x86)

Trigger is realistic (suspend/resume of any laptop, VM CPU hotplug) but
timing-sensitive. Unprivileged users cannot directly trigger CPU
hotplug, but tracing (if accessible) can be used to exercise the hot
path. System_state transitions occur normally at boot.

**Step 8.3 - Failure Severity:**
Record: NULL pointer dereference -> kernel oops/panic. CRITICAL severity
(system crash). No data corruption but immediate loss of system
availability.

**Step 8.4 - Risk-Benefit:**
Record:
- Benefit: MEDIUM-HIGH. Prevents a real kernel crash in a widely-used
  subsystem on common architectures.
- Risk: LOW. 18-line change. IPI only on slow-path (CPU hotplug).
  SYSTEM_RUNNING guard prevents early-boot issues. Standard kernel
  idiom. Reviewed by subsystem maintainer.
- Ratio: Favorable for backport.

---

## PHASE 9: SYNTHESIS

**Step 9.1 - Evidence Summary:**

FOR backport:
- Real memory-ordering race that can cause NULL dereference (kernel
  crash)
- Small, surgical fix (18 lines, single file)
- Buggy pattern present in all active stable trees since v4.10
- Reviewed and authored by the tracing maintainer (Steven Rostedt)
- Applies cleanly (context unchanged across stable versions)
- Affects commonly-used architecture (ARM64)
- Impact path includes hot tracing code called from many paths

AGAINST backport:
- No Fixes: tag
- No Cc: stable tag
- No Reported-by - defensive fix based on code analysis, not observed
  crash
- Bug was latent for 10 years without mainstream report
- Commit message uses conditional "may observe"

**Step 9.2 - Stable Rules Checklist:**
1. Obviously correct and tested? Yes - uses standard IPI-as-barrier
   idiom, accepted by maintainer
2. Fixes a real bug? Yes - verifiable memory-ordering race
3. Important (crash)? Yes - NULL pointer dereference
4. Small and contained? Yes - 18 lines, single file
5. No new features? Correct - pure ordering fix
6. Can apply to stable? Yes - simple context, unchanged code structure

**Step 9.3 - Exception Category:**
Not a device-ID/quirk/build/doc/DT fix. Standard bug fix category.

**Step 9.4 - Decision:**

This is a borderline-positive case. The fix addresses a genuine memory-
ordering bug that can cause a kernel crash (NULL dereference) on weakly-
ordered architectures. The fix is small, well-reasoned, reviewed by the
subsystem maintainer, and very low regression risk. While the absence of
Fixes:/Cc:stable tags and concrete reports suggests lower urgency,
stable rules explicitly include theoretical races with severe
consequences. The tracing subsystem is widely used, and ARM64 (where
this bug most readily manifests) is prevalent. The fix applies cleanly
to stable trees.

---

## Verification

- [Phase 1] Parsed commit message: confirmed no Fixes:, no Cc: stable,
  no Reported-by; Suggested-by: Steven Rostedt (tracing maintainer)
- [Phase 1] Language analysis: "may observe" and "Enforce" indicate
  defensive race fix based on code analysis
- [Phase 2] `git show 20ad8b0888be3 --stat`: verified 18 insertions, 1
  deletion, single file
- [Phase 2] Diff inspection: verified replacement of `smp_wmb()` with
  conditional `on_each_cpu + smp_wmb` block; new `rb_cpu_sync` helper
  with `smp_rmb()`
- [Phase 3] `git blame -L 7508,7512 kernel/trace/ring_buffer.c`:
  confirmed `smp_wmb();/cpumask_set_cpu` pattern introduced by
  b32614c03413f8 in v4.10 (Nov 2016)
- [Phase 3] `git show b32614c03413f8`: confirmed original commit by
  Sebastian Andrzej Siewior, "tracing/rb: Convert to hotplug state
  machine"
- [Phase 3] `git log --author="Vincent Donnefort" --
  kernel/trace/ring_buffer.c`: author has 6+ ring_buffer.c commits,
  regular contributor
- [Phase 4] `b4 dig -c 20ad8b0888be3`: resolved to lore URL, confirmed
  submission thread
- [Phase 4] `b4 dig -c -a`: confirmed only v1 exists, no revisions
- [Phase 4] `/tmp/rb_sync_thread.mbox` read: thread has only the single
  patch submission, no review replies, no stable nomination in
  discussion
- [Phase 5] `grep cpumask_test_cpu.*buffer->cpumask`: confirmed 30+
  reader callsites in ring_buffer.c
- [Phase 5] Verified `ring_buffer_lock_reserve` (line 4716) uses the
  pattern - hot write path
- [Phase 5] Verified `on_each_cpu(x, NULL, 1)` idiom used elsewhere
  (kernel/time/hrtimer.c:958)
- [Phase 6] `git show v6.6/v6.12/v6.18:kernel/trace/ring_buffer.c`:
  confirmed identical 2-line context `smp_wmb();\ncpumask_set_cpu(cpu,
  buffer->cpumask);` present in all major stable trees -> patch will
  apply cleanly
- [Phase 6] `git show v4.10:kernel/trace/ring_buffer.c`: confirmed
  pattern present at trace_rb_cpu_prepare since v4.10
- [Phase 7] `git log --oneline --since=2024 --
  kernel/trace/ring_buffer.c`: confirmed active subsystem with many
  recent commits
- [Phase 8] Confirmed reachability: cpumask checks precede
  `buffer->buffers[cpu]` dereference in hot write path
  (`ring_buffer_lock_reserve`) and reader paths - NULL deref is possible
  if race occurs
- UNVERIFIED: Cannot confirm whether this race has actually been
  observed in production (no Reported-by, no Link to bug tracker, Lore
  WebFetch blocked by bot protection). Assessment is based on code
  analysis and consequences of the race, which are severe when it does
  trigger.

**YES**

 kernel/trace/ring_buffer.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 170170bd83bd9..10d2d0404434d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7468,6 +7468,12 @@ int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
 	return 0;
 }
 
+static void rb_cpu_sync(void *data)
+{
+	/* Not really needed, but documents what is happening */
+	smp_rmb();
+}
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was in
@@ -7506,7 +7512,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
 		     cpu);
 		return -ENOMEM;
 	}
-	smp_wmb();
+
+	/*
+	 * Ensure trace_buffer readers observe the newly allocated
+	 * ring_buffer_per_cpu before they check the cpumask. Instead of using a
+	 * read barrier for all readers, send an IPI.
+	 */
+	if (unlikely(system_state == SYSTEM_RUNNING)) {
+		on_each_cpu(rb_cpu_sync, NULL, 1);
+		/* Not really needed, but documents what is happening */
+		smp_wmb();
+	}
+
 	cpumask_set_cpu(cpu, buffer->cpumask);
 	return 0;
 }
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH 7.2 v16 02/13] mm/khugepaged: generalize alloc_charge_folio()
From: Usama Arif @ 2026-04-20 13:05 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-3-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:39 -0600 Nico Pache <npache@redhat.com> wrote:

> From: Dev Jain <dev.jain@arm.com>
> 
> Pass order to alloc_charge_folio() and update mTHP statistics.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Co-developed-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst |  8 ++++++++
>  include/linux/huge_mm.h                    |  2 ++
>  mm/huge_memory.c                           |  4 ++++
>  mm/khugepaged.c                            | 17 +++++++++++------
>  4 files changed, 25 insertions(+), 6 deletions(-)
> 

Acked-by: Usama Arif <usama.arif@linux.dev>

^ permalink raw reply

* Re: [PATCH 7.2 v16 01/13] mm/khugepaged: generalize hugepage_vma_revalidate for mTHP support
From: Usama Arif @ 2026-04-20 12:59 UTC (permalink / raw)
  To: Nico Pache
  Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260419185750.260784-2-npache@redhat.com>

On Sun, 19 Apr 2026 12:57:38 -0600 Nico Pache <npache@redhat.com> wrote:

> For khugepaged to support different mTHP orders, we must generalize this
> to check if the PMD is not shared by another VMA and that the order is
> enabled.
> 
> No functional change in this patch. Also correct a comment about the
> functionality of the revalidation and fix a double space issues.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 20 ++++++++++++--------
>  1 file changed, 12 insertions(+), 8 deletions(-)
> 

Acked-by: Usama Arif <usama.arif@linux.dev>

^ permalink raw reply

* Re: [PATCH v13 17/18] unwind_user/sframe/x86: Enable sframe unwinding on x86
From: Jens Remus @ 2026-04-20 12:35 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel, bpf, x86, linux-mm,
	Steven Rostedt
  Cc: Josh Poimboeuf, Masami Hiramatsu, Mathieu Desnoyers,
	Peter Zijlstra, Ingo Molnar, Jiri Olsa, Arnaldo Carvalho de Melo,
	Namhyung Kim, Thomas Gleixner, Andrii Nakryiko, Indu Bhagat,
	Jose E. Marchesi, Beau Belgrave, Linus Torvalds, Andrew Morton,
	Florian Weimer, Kees Cook, Carlos O'Donell, Sam James,
	Dylan Hatch, Borislav Petkov, Dave Hansen, David Hildenbrand,
	H. Peter Anvin, Liam R. Howlett, Lorenzo Stoakes, Michal Hocko,
	Mike Rapoport, Suren Baghdasaryan, Vlastimil Babka,
	Heiko Carstens, Vasily Gorbik, Steven Rostedt (Google)
In-Reply-To: <20260127150554.2760964-18-jremus@linux.ibm.com>

On 1/27/2026 4:05 PM, Jens Remus wrote:

> diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h

> @@ -15,6 +15,40 @@ static inline int unwind_user_word_size(struct pt_regs *regs)
>  	return user_64bit_mode(regs) ? 8 : 4;
>  }
>  
> +static inline int unwind_user_get_reg(unsigned long *val, unsigned int regnum)
> +{
> +#ifdef CONFIG_X86_64
> +	const struct pt_regs *regs = task_pt_regs(current);
> +
> +	switch (regnum) {
> +	/* DWARF register numbers 0..15 */
> +	case  0: *val = regs->ax; break;
> +	case  1: *val = regs->dx; break;
> +	case  2: *val = regs->cx; break;
> +	case  3: *val = regs->bx; break;
> +	case  4: *val = regs->si; break;
> +	case  5: *val = regs->di; break;
> +	case  6: *val = regs->bp; break;
> +	case  7: *val = regs->sp; break;
> +	case  8: *val = regs->r8; break;
> +	case  9: *val = regs->r9; break;
> +	case 10: *val = regs->r10; break;
> +	case 11: *val = regs->r11; break;
> +	case 12: *val = regs->r12; break;
> +	case 13: *val = regs->r13; break;
> +	case 14: *val = regs->r14; break;
> +	case 15: *val = regs->r15; break;
> +	default:
> +		return -EINVAL;
> +	}
> +	return 0;
> +#else /* !CONFIG_X86_64 */
> +	return -EINVAL;
> +#endif /* !CONFIG_X86_64 */
> +

Nit: Superfluous empty line.

> +}
> +#define unwind_user_get_reg unwind_user_get_reg
> +
>  #endif /* CONFIG_UNWIND_USER */
>  
>  #ifdef CONFIG_HAVE_UNWIND_USER_FP
Regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303)
jremus@de.ibm.com / jremus@linux.ibm.com

IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Ehningen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/


^ permalink raw reply

* Re: [PATCH net v1] net: validate skb->napi_id in RX tracepoints
From: Kohei Enju @ 2026-04-20 11:54 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: netdev, linux-trace-kernel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <b943ec1e-417c-4157-ab19-b34aa6d63688@linux.dev>

On 04/20 19:27, Jiayuan Chen wrote:
> 
> On 4/20/26 6:54 PM, Kohei Enju wrote:
> > Since commit 2bd82484bb4c ("xps: fix xps for stacked devices"),
> > skb->napi_id shares storage with sender_cpu. RX tracepoints using
> > net_dev_rx_verbose_template read skb->napi_id directly and can therefore
> > report sender_cpu values as if they were NAPI IDs.
> > 
> > For example, on the loopback path this can report 1 as napi_id, where 1
> So I think veth_forward_skb->__netif_rx could be affected as well?

Yes. Just in case, I've confirmed the same behavior in the veth path.
The mentioned loopback path is just a single example of possibly
affected paths.

Thanks,
Kohei

> > comes from raw_smp_processor_id() + 1 in the XPS path:
> > 
> >    # bpftrace -e 'tracepoint:net:netif_rx_entry{ print(args->napi_id); }'
> >    # taskset -c 0 ping -c 1 ::1
> > 
> > Report only valid NAPI IDs in these tracepoints and use 0 otherwise.
> > 
> > Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices")
> > Signed-off-by: Kohei Enju <kohei@enjuk.jp>
> > ---
> >   include/trace/events/net.h | 4 +++-
> >   1 file changed, 3 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/trace/events/net.h b/include/trace/events/net.h
> > index fdd9ad474ce3..dbc2c5598e35 100644
> > --- a/include/trace/events/net.h
> > +++ b/include/trace/events/net.h
> > @@ -10,6 +10,7 @@
> >   #include <linux/if_vlan.h>
> >   #include <linux/ip.h>
> >   #include <linux/tracepoint.h>
> > +#include <net/busy_poll.h>
> >   TRACE_EVENT(net_dev_start_xmit,
> > @@ -208,7 +209,8 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
> >   	TP_fast_assign(
> >   		__assign_str(name);
> >   #ifdef CONFIG_NET_RX_BUSY_POLL
> > -		__entry->napi_id = skb->napi_id;
> > +		__entry->napi_id = napi_id_valid(skb->napi_id) ?
> > +				   skb->napi_id : 0;
> >   #else
> >   		__entry->napi_id = 0;
> >   #endif

^ permalink raw reply

* Re: [PATCH net v1] net: validate skb->napi_id in RX tracepoints
From: Jiayuan Chen @ 2026-04-20 11:27 UTC (permalink / raw)
  To: Kohei Enju, netdev, linux-trace-kernel
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <20260420105427.162816-1-kohei@enjuk.jp>


On 4/20/26 6:54 PM, Kohei Enju wrote:
> Since commit 2bd82484bb4c ("xps: fix xps for stacked devices"),
> skb->napi_id shares storage with sender_cpu. RX tracepoints using
> net_dev_rx_verbose_template read skb->napi_id directly and can therefore
> report sender_cpu values as if they were NAPI IDs.
>
> For example, on the loopback path this can report 1 as napi_id, where 1
So I think veth_forward_skb->__netif_rx could be affected as well?
> comes from raw_smp_processor_id() + 1 in the XPS path:
>
>    # bpftrace -e 'tracepoint:net:netif_rx_entry{ print(args->napi_id); }'
>    # taskset -c 0 ping -c 1 ::1
>
> Report only valid NAPI IDs in these tracepoints and use 0 otherwise.
>
> Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices")
> Signed-off-by: Kohei Enju <kohei@enjuk.jp>
> ---
>   include/trace/events/net.h | 4 +++-
>   1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/include/trace/events/net.h b/include/trace/events/net.h
> index fdd9ad474ce3..dbc2c5598e35 100644
> --- a/include/trace/events/net.h
> +++ b/include/trace/events/net.h
> @@ -10,6 +10,7 @@
>   #include <linux/if_vlan.h>
>   #include <linux/ip.h>
>   #include <linux/tracepoint.h>
> +#include <net/busy_poll.h>
>   
>   TRACE_EVENT(net_dev_start_xmit,
>   
> @@ -208,7 +209,8 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
>   	TP_fast_assign(
>   		__assign_str(name);
>   #ifdef CONFIG_NET_RX_BUSY_POLL
> -		__entry->napi_id = skb->napi_id;
> +		__entry->napi_id = napi_id_valid(skb->napi_id) ?
> +				   skb->napi_id : 0;
>   #else
>   		__entry->napi_id = 0;
>   #endif

^ permalink raw reply

* [PATCH net v1] net: validate skb->napi_id in RX tracepoints
From: Kohei Enju @ 2026-04-20 10:54 UTC (permalink / raw)
  To: netdev, linux-trace-kernel
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Kohei Enju

Since commit 2bd82484bb4c ("xps: fix xps for stacked devices"),
skb->napi_id shares storage with sender_cpu. RX tracepoints using
net_dev_rx_verbose_template read skb->napi_id directly and can therefore
report sender_cpu values as if they were NAPI IDs.

For example, on the loopback path this can report 1 as napi_id, where 1
comes from raw_smp_processor_id() + 1 in the XPS path:

  # bpftrace -e 'tracepoint:net:netif_rx_entry{ print(args->napi_id); }'
  # taskset -c 0 ping -c 1 ::1

Report only valid NAPI IDs in these tracepoints and use 0 otherwise.

Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices")
Signed-off-by: Kohei Enju <kohei@enjuk.jp>
---
 include/trace/events/net.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index fdd9ad474ce3..dbc2c5598e35 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -10,6 +10,7 @@
 #include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <linux/tracepoint.h>
+#include <net/busy_poll.h>
 
 TRACE_EVENT(net_dev_start_xmit,
 
@@ -208,7 +209,8 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
 	TP_fast_assign(
 		__assign_str(name);
 #ifdef CONFIG_NET_RX_BUSY_POLL
-		__entry->napi_id = skb->napi_id;
+		__entry->napi_id = napi_id_valid(skb->napi_id) ?
+				   skb->napi_id : 0;
 #else
 		__entry->napi_id = 0;
 #endif
-- 
2.51.0


^ permalink raw reply related

* [PATCH] trace: remove the dead IS_ERR() check in trace_pipe_open()
From: Yash Suthar @ 2026-04-20 10:12 UTC (permalink / raw)
  To: rostedt, mhiramat
  Cc: mathieu.desnoyers, linux-kernel, linux-trace-kernel, Yash Suthar

in trace_pipe_open() already check the IS_ERR(iter) and
return early on error,so iter after will be valid and
it is safe to return 0 at end.

Signed-off-by: Yash Suthar <yashsuthar983@gmail.com>
---
 kernel/trace/trace_remote.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c
index d6c3f94d67cd..2a6cc000ec98 100644
--- a/kernel/trace/trace_remote.c
+++ b/kernel/trace/trace_remote.c
@@ -602,7 +602,7 @@ static int trace_pipe_open(struct inode *inode, struct file *filp)
 
 	filp->private_data = iter;
 
-	return IS_ERR(iter) ? PTR_ERR(iter) : 0;
+	return 0;
 }
 
 static int trace_pipe_release(struct inode *inode, struct file *filp)
-- 
2.43.0


^ permalink raw reply related

* Re: [RFC v4 0/7] ext4: fast commit: snapshot inode state for FC log
From: Li Chen @ 2026-04-20  9:37 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Zhang Yi, Andreas Dilger, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-trace-kernel, linux-kernel
In-Reply-To: <20260413131244.GB20496@macsyma-wired.lan>

Hi Theodore,

 ---- On Mon, 13 Apr 2026 21:12:44 +0800  Theodore Tso <tytso@mit.edu> wrote --- 
 > On Mon, Apr 13, 2026 at 09:01:28PM +0800, Li Chen wrote:
 > > Absolutely! It's great to learn about the Sashiko development site.
 > > I will address the real issues in the next version.
 > 
 > Note that Sashiko will sometimes report a pre-existing issue as if it
 > were a problem with the commit.  If that happens, feel free to ignore
 > its complaint; what I consider best practice is to either (a) fix it
 > in the a subsequent patch or patch series, or (b) leave a TODO in the
 > code.
 > 
 > I've asked the Sashiko folks to add way for URI's for each issue that
 > are identified by Sashiko, so we can put a URL in the TODO comment for
 > someone who wants to fix it later, and to make it easier for Sashiko
 > to identified pre-existing issues so it doesn't comment on the same
 > issue across multiple commit reviews (and perhaps save on the some LLM
 > token budget :-).
 > 
 > In the next few days, for patches sent to linux-ext4, Sashiko will
 > start e-mailing its reviews to the patch submitter and to me as the
 > maintainer.  Once we can reduce the false positive rate, I'll ask that
 > the reviews be cc'ed to the linux-ext4 mailing list.  But it seems
 > good enough that to send e-mails to the patch submitter and the
 > maintainer --- but that's a decision that each subsystem maintainer
 > will be making on their own.

Got it, thanks. I'll treat Sashiko as a review aid, fix the real issues in the next version, 
and leave unrelated pre-existing issues for follow-up or a TODO.

Regards,
Li​


^ permalink raw reply

* [PATCH v2] tracing: export live module tracepoint strings in printk_formats
From: Cao Ruichuang @ 2026-04-20  6:19 UTC (permalink / raw)
  To: rostedt; +Cc: petr.pavlu, linux-trace-kernel, linux-kernel, Cao Ruichuang
In-Reply-To: <20260413123359.32517-1-create0818@163.com>

tracepoint_string() documents that its strings are exported through
printk_formats so that user space can decode pointer fields recorded in
trace buffers.

That already works for built-in __tracepoint_str entries, but module
__tracepoint_str sections are not collected or exported today. As a
result, module tracepoint_string() users still show raw pointer values
in printk_formats consumers such as trace.dat decoders.

Record module __tracepoint_str sections when modules are loaded, expose
their live ranges through printk_formats, and teach
trace_is_tracepoint_string() to accept those live module strings too.

Keep the lifetime semantics tied to the module itself. This does not
copy strings into tracing-owned storage and does not preserve the
mappings after module unload.

On MODULE_STATE_GOING, the live module string ranges are removed again.
This relies on the existing tracing module notifier ordering: trace
event teardown runs first and resets module event buffers before these
auxiliary string mappings are dropped.

If the small auxiliary registry allocation fails, warn and continue
loading the module. printk_formats exposure is degraded in that case,
but tracing should not fail module load for missing debug metadata.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=217196
Assisted-by: Codex:GPT-5.4
Signed-off-by: Cao Ruichuang <create0818@163.com>
---
v2:
- replace the previous copied-string approach with live module section ranges
- record module __tracepoint_str ranges in struct module
- export only live module tracepoint strings in printk_formats
- remove module mappings on MODULE_STATE_GOING
- keep auxiliary registry allocation failure non-fatal and warn instead
- add explicit notifier priority and document the teardown ordering dependency

Tested in QEMU:
- basic repro showing module tracepoint_string() entries in printk_formats
- load/unload validation confirming mappings are removed after rmmod
- failed module init after MODULE_STATE_COMING with no stale mapping left
- targeted failslab injection on the notifier-time auxiliary allocation,
  confirming module load still succeeds, a warning is emitted, and the
  module mapping is not exported

 include/linux/module.h      |   2 +
 kernel/module/main.c        |   4 +
 kernel/trace/trace_printk.c | 153 ++++++++++++++++++++++++++++++++++--
 3 files changed, 152 insertions(+), 7 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 14f391b186c..e475466a785 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -515,6 +515,8 @@ struct module {
 #ifdef CONFIG_TRACING
 	unsigned int num_trace_bprintk_fmt;
 	const char **trace_bprintk_fmt_start;
+	unsigned int num_tracepoint_strings;
+	const char **tracepoint_strings_start;
 #endif
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call **trace_events;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index c3ce106c70a..d7d890138ac 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2672,6 +2672,10 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 	mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
 					 sizeof(*mod->trace_bprintk_fmt_start),
 					 &mod->num_trace_bprintk_fmt);
+	mod->tracepoint_strings_start =
+		section_objs(info, "__tracepoint_str",
+			     sizeof(*mod->tracepoint_strings_start),
+			     &mod->num_tracepoint_strings);
 #endif
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/* sechdrs[0].sh_size is always zero */
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 5ea5e0d76f0..2d41b0a63b3 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -13,6 +13,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/rcupdate.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -24,10 +25,15 @@
 /*
  * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
  * which are queued on trace_bprintk_fmt_list.
+ *
+ * modules tracepoint_string() entries are kept as ranges into the owning
+ * module's __tracepoint_str section and are removed again when the module
+ * goes away.
  */
 static LIST_HEAD(trace_bprintk_fmt_list);
+static LIST_HEAD(tracepoint_str_list);
 
-/* serialize accesses to trace_bprintk_fmt_list */
+/* serialize accesses to module trace printk and tracepoint string lists */
 static DEFINE_MUTEX(btrace_mutex);
 
 struct trace_bprintk_fmt {
@@ -35,6 +41,13 @@ struct trace_bprintk_fmt {
 	const char *fmt;
 };
 
+struct tracepoint_mod_str {
+	struct list_head list;
+	struct module *mod;
+	const char **start;
+	unsigned int num;
+};
+
 static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
 {
 	struct trace_bprintk_fmt *pos;
@@ -85,16 +98,70 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
 	mutex_unlock(&btrace_mutex);
 }
 
+static void hold_module_tracepoint_strings(struct module *mod)
+{
+	struct tracepoint_mod_str *tp_str;
+
+	if (!mod->num_tracepoint_strings)
+		return;
+
+	tp_str = kmalloc_obj(*tp_str);
+	if (!tp_str) {
+		pr_warn("tracing: Failed to expose module tracepoint strings for %s\n",
+			mod->name);
+		return;
+	}
+
+	tp_str->mod = mod;
+	tp_str->start = mod->tracepoint_strings_start;
+	tp_str->num = mod->num_tracepoint_strings;
+
+	mutex_lock(&btrace_mutex);
+	list_add_tail_rcu(&tp_str->list, &tracepoint_str_list);
+	mutex_unlock(&btrace_mutex);
+}
+
+static void release_module_tracepoint_strings(struct module *mod)
+{
+	struct tracepoint_mod_str *tp_str, *next;
+	struct tracepoint_mod_str *found = NULL;
+
+	mutex_lock(&btrace_mutex);
+	list_for_each_entry_safe(tp_str, next, &tracepoint_str_list, list) {
+		if (tp_str->mod != mod)
+			continue;
+
+		list_del_rcu(&tp_str->list);
+		found = tp_str;
+		break;
+	}
+	mutex_unlock(&btrace_mutex);
+
+	if (found) {
+		synchronize_rcu();
+		kfree(found);
+	}
+}
+
 static int module_trace_bprintk_format_notify(struct notifier_block *self,
 		unsigned long val, void *data)
 {
 	struct module *mod = data;
-	if (mod->num_trace_bprintk_fmt) {
-		const char **start = mod->trace_bprintk_fmt_start;
-		const char **end = start + mod->num_trace_bprintk_fmt;
 
-		if (val == MODULE_STATE_COMING)
+	switch (val) {
+	case MODULE_STATE_COMING:
+		if (mod->num_trace_bprintk_fmt) {
+			const char **start = mod->trace_bprintk_fmt_start;
+			const char **end = start + mod->num_trace_bprintk_fmt;
+
 			hold_module_trace_bprintk_format(start, end);
+		}
+		hold_module_tracepoint_strings(mod);
+		break;
+	case MODULE_STATE_GOING:
+		/* trace event teardown runs first and clears module event buffers. */
+		release_module_tracepoint_strings(mod);
+		break;
 	}
 	return NOTIFY_OK;
 }
@@ -159,6 +226,55 @@ find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
 	return &mod_fmt->fmt;
 }
 
+static int count_mod_formats(void)
+{
+	struct trace_bprintk_fmt *p;
+	int count = 0;
+
+	list_for_each_entry(p, &trace_bprintk_fmt_list, list)
+		count++;
+
+	return count;
+}
+
+static const char **
+find_next_mod_tracepoint_str(int start_index, loff_t *pos)
+{
+	struct tracepoint_mod_str *tp_str;
+	int index = start_index;
+	unsigned int i;
+
+	list_for_each_entry(tp_str, &tracepoint_str_list, list) {
+		for (i = 0; i < tp_str->num; i++) {
+			if (index == *pos)
+				return tp_str->start + i;
+			index++;
+		}
+	}
+
+	return NULL;
+}
+
+static bool is_module_tracepoint_string(const char *str)
+{
+	struct tracepoint_mod_str *tp_str;
+	unsigned int i;
+	bool found = false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tp_str, &tracepoint_str_list, list) {
+		for (i = 0; i < tp_str->num; i++) {
+			if (str == tp_str->start[i]) {
+				found = true;
+				goto out;
+			}
+		}
+	}
+out:
+	rcu_read_unlock();
+	return found;
+}
+
 static void format_mod_start(void)
 {
 	mutex_lock(&btrace_mutex);
@@ -181,6 +297,22 @@ find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
 {
 	return NULL;
 }
+
+static inline int count_mod_formats(void)
+{
+	return 0;
+}
+
+static inline const char **
+find_next_mod_tracepoint_str(int start_index, loff_t *pos)
+{
+	return NULL;
+}
+
+static inline bool is_module_tracepoint_string(const char *str)
+{
+	return false;
+}
 static inline void format_mod_start(void) { }
 static inline void format_mod_stop(void) { }
 #endif /* CONFIG_MODULES */
@@ -195,6 +327,7 @@ void trace_printk_control(bool enabled)
 __initdata_or_module static
 struct notifier_block module_trace_bprintk_format_nb = {
 	.notifier_call = module_trace_bprintk_format_notify,
+	.priority = 0,
 };
 
 int __trace_bprintk(unsigned long ip, const char *fmt, ...)
@@ -259,12 +392,13 @@ bool trace_is_tracepoint_string(const char *str)
 		if (str == *ptr)
 			return true;
 	}
-	return false;
+	return is_module_tracepoint_string(str);
 }
 
 static const char **find_next(void *v, loff_t *pos)
 {
 	const char **fmt = v;
+	int mod_formats;
 	int start_index;
 	int last_index;
 
@@ -292,7 +426,12 @@ static const char **find_next(void *v, loff_t *pos)
 		return __start___tracepoint_str + (*pos - last_index);
 
 	start_index += last_index;
-	return find_next_mod_format(start_index, v, fmt, pos);
+	mod_formats = count_mod_formats();
+	if (*pos < start_index + mod_formats)
+		return find_next_mod_format(start_index, v, fmt, pos);
+
+	start_index += mod_formats;
+	return find_next_mod_tracepoint_str(start_index, pos);
 }
 
 static void *
-- 
2.39.5 (Apple Git-154)


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox