Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v9 4/8] tracing/fprobe: Avoid kcalloc() in rcu_read_lock section
From: Masami Hiramatsu (Google) @ 2026-04-17 16:18 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177644266147.584467.8179035927318998910.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

fprobe_remove_node_in_module() is called under RCU read locked, but
this invokes kcalloc() if there are more than 8 fprobes installed
on the module. Sashiko warns it because kcalloc() can sleep [1].

 [1] https://sashiko.dev/#/patchset/177552432201.853249.5125045538812833325.stgit%40mhiramat.tok.corp.google.com

To fix this issue, expand the batch size to 128 and do not expand
the fprobe_addr_list, but just cancel walking on fprobe_ip_table,
update fgraph/ftrace_ops and retry the loop again.

Fixes: 0de4c70d04a4 ("tracing: fprobe: use rhltable for fprobe_ip_table")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v6:
  - Retry outside rhltable_walk_enter/exit() again.
 Changes in v5:
  - Skip updating ftrace_ops when fails to allocate memory in module
    unloading.
 Changes in v4:
  - fix a build error typo in case of CONFIG_DYNAMIC_FTRACE=n.
 Changes in v3:
  - Retry inside rhltable_walk_enter/exit().
  - Rename fprobe_set_ips() to fprobe_remove_ips().
  - Rename 'retry' label to 'again'.
---
 kernel/trace/fprobe.c |   92 ++++++++++++++++++++++++-------------------------
 1 file changed, 45 insertions(+), 47 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 2e232342cbd4..49016c3e7cd9 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -344,11 +344,10 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 }
 
 #ifdef CONFIG_MODULES
-static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
-			   int reset)
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
-	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
-	ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, remove, reset);
+	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
+	ftrace_set_filter_ips(&fprobe_ftrace_ops, ips, cnt, 1, 0);
 }
 #endif
 #else
@@ -367,10 +366,9 @@ static bool fprobe_is_ftrace(struct fprobe *fp)
 }
 
 #ifdef CONFIG_MODULES
-static void fprobe_set_ips(unsigned long *ips, unsigned int cnt, int remove,
-			   int reset)
+static void fprobe_remove_ips(unsigned long *ips, unsigned int cnt)
 {
-	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, remove, reset);
+	ftrace_set_filter_ips(&fprobe_graph_ops.ops, ips, cnt, 1, 0);
 }
 #endif
 #endif /* !CONFIG_DYNAMIC_FTRACE_WITH_ARGS && !CONFIG_DYNAMIC_FTRACE_WITH_REGS */
@@ -542,7 +540,7 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
 
 #ifdef CONFIG_MODULES
 
-#define FPROBE_IPS_BATCH_INIT 8
+#define FPROBE_IPS_BATCH_INIT 128
 /* instruction pointer address list */
 struct fprobe_addr_list {
 	int index;
@@ -550,45 +548,24 @@ struct fprobe_addr_list {
 	unsigned long *addrs;
 };
 
-static int fprobe_addr_list_add(struct fprobe_addr_list *alist, unsigned long addr)
+static int fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
+					 struct fprobe_addr_list *alist)
 {
-	unsigned long *addrs;
-
-	/* Previously we failed to expand the list. */
-	if (alist->index == alist->size)
-		return -ENOSPC;
-
-	alist->addrs[alist->index++] = addr;
-	if (alist->index < alist->size)
+	if (!within_module(node->addr, mod))
 		return 0;
 
-	/* Expand the address list */
-	addrs = kcalloc(alist->size * 2, sizeof(*addrs), GFP_KERNEL);
-	if (!addrs)
-		return -ENOMEM;
-
-	memcpy(addrs, alist->addrs, alist->size * sizeof(*addrs));
-	alist->size *= 2;
-	kfree(alist->addrs);
-	alist->addrs = addrs;
+	if (delete_fprobe_node(node))
+		return 0;
+	/* If no address list is available, we can't track this address. */
+	if (!alist->addrs)
+		return 0;
 
+	alist->addrs[alist->index++] = node->addr;
+	if (alist->index == alist->size)
+		return -ENOSPC;
 	return 0;
 }
 
-static void fprobe_remove_node_in_module(struct module *mod, struct fprobe_hlist_node *node,
-					 struct fprobe_addr_list *alist)
-{
-	if (!within_module(node->addr, mod))
-		return;
-	if (delete_fprobe_node(node))
-		return;
-	/*
-	 * If failed to update alist, just continue to update hlist.
-	 * Therefore, at list user handler will not hit anymore.
-	 */
-	fprobe_addr_list_add(alist, node->addr);
-}
-
 /* Handle module unloading to manage fprobe_ip_table. */
 static int fprobe_module_callback(struct notifier_block *nb,
 				  unsigned long val, void *data)
@@ -597,29 +574,50 @@ static int fprobe_module_callback(struct notifier_block *nb,
 	struct fprobe_hlist_node *node;
 	struct rhashtable_iter iter;
 	struct module *mod = data;
+	bool retry;
 
 	if (val != MODULE_STATE_GOING)
 		return NOTIFY_DONE;
 
 	alist.addrs = kcalloc(alist.size, sizeof(*alist.addrs), GFP_KERNEL);
-	/* If failed to alloc memory, we can not remove ips from hash. */
-	if (!alist.addrs)
-		return NOTIFY_DONE;
+	/*
+	 * If failed to alloc memory, ftrace_ops will not be able to remove ips from
+	 * hash, but we can still remove nodes from fprobe_ip_table, so we can avoid
+	 * the potential wrong callback. So just print a warning here and try to
+	 * continue without address list.
+	 */
+	WARN_ONCE(!alist.addrs,
+		"Failed to allocate memory for fprobe_addr_list, ftrace_ops will not be updated");
 
 	mutex_lock(&fprobe_mutex);
+again:
+	retry = false;
+	alist.index = 0;
 	rhltable_walk_enter(&fprobe_ip_table, &iter);
 	do {
 		rhashtable_walk_start(&iter);
 
 		while ((node = rhashtable_walk_next(&iter)) && !IS_ERR(node))
-			fprobe_remove_node_in_module(mod, node, &alist);
+			if (fprobe_remove_node_in_module(mod, node, &alist) < 0) {
+				retry = true;
+				break;
+			}
 
 		rhashtable_walk_stop(&iter);
-	} while (node == ERR_PTR(-EAGAIN));
+	} while (node == ERR_PTR(-EAGAIN) && !retry);
 	rhashtable_walk_exit(&iter);
+	/* Remove any ips from hash table(s) */
+	if (alist.index > 0) {
+		fprobe_remove_ips(alist.addrs, alist.index);
+		/*
+		 * If we break rhashtable walk loop except for -EAGAIN, we need
+		 * to restart looping from start for safety. Anyway, this is
+		 * not a hotpath.
+		 */
+		if (retry)
+			goto again;
+	}
 
-	if (alist.index > 0)
-		fprobe_set_ips(alist.addrs, alist.index, 1, 0);
 	mutex_unlock(&fprobe_mutex);
 
 	kfree(alist.addrs);


^ permalink raw reply related

* [PATCH v9 3/8] tracing/fprobe: Remove fprobe from hash in failure path
From: Masami Hiramatsu (Google) @ 2026-04-17 16:18 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177644266147.584467.8179035927318998910.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

When register_fprobe_ips() fails, it tries to remove a list of
fprobe_hash_node from fprobe_ip_table, but it missed to remove
fprobe itself from fprobe_table. Moreover, when removing
the fprobe_hash_node which is added to rhltable once, it must
use kfree_rcu() after removing from rhltable.

To fix these issues, this reuses unregister_fprobe() internal
code to rollback the half-way registered fprobe.

Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v8:
  - Fix to check return value of add_fprobe_hash() and break
    loop if insert_fprobe_node() is failed.
 Changes in v7:
  - Remove RCU grace period wait, since fprobe itself is not
    that is not needed.
 Changes in v6:
  - Wait for an RCU grace period before returning error in
    unregister_fprobe_nolock().
 Changes in v5:
  - When rolling back an fprobe that failed to register, the
    fprobe_hash_node are forcibly removed and warn if failure.
 Changes in v4:
  - Remove short-cut case because we always need to upadte ftrace_ops.
  - Use guard(mutex) in register_fprobe_ips() to unlock it correctly.
  - Remove redundant !ret check in register_fprobe_ips().
  - Do not set hlist_array->size in failure case, instead,
    hlist_array->array[i].fp is set only when insertion is succeeded.
  Changes in v3:
  - Newly added.
---
 kernel/trace/fprobe.c |   85 +++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 42 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index a2b659006e0e..2e232342cbd4 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -79,20 +79,27 @@ static const struct rhashtable_params fprobe_rht_params = {
 };
 
 /* Node insertion and deletion requires the fprobe_mutex */
-static int insert_fprobe_node(struct fprobe_hlist_node *node)
+static int insert_fprobe_node(struct fprobe_hlist_node *node, struct fprobe *fp)
 {
+	int ret;
+
 	lockdep_assert_held(&fprobe_mutex);
 
-	return rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+	ret = rhltable_insert(&fprobe_ip_table, &node->hlist, fprobe_rht_params);
+	/* Set the fprobe pointer if insertion was successful. */
+	if (!ret)
+		WRITE_ONCE(node->fp, fp);
+	return ret;
 }
 
 /* Return true if there are synonims */
 static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 {
-	lockdep_assert_held(&fprobe_mutex);
 	bool ret;
 
-	/* Avoid double deleting */
+	lockdep_assert_held(&fprobe_mutex);
+
+	/* Avoid double deleting and non-inserted nodes */
 	if (READ_ONCE(node->fp) != NULL) {
 		WRITE_ONCE(node->fp, NULL);
 		rhltable_remove(&fprobe_ip_table, &node->hlist,
@@ -756,7 +763,6 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
 	fp->hlist_array = hlist_array;
 	hlist_array->fp = fp;
 	for (i = 0; i < num; i++) {
-		hlist_array->array[i].fp = fp;
 		addr = ftrace_location(addrs[i]);
 		if (!addr) {
 			fprobe_fail_cleanup(fp);
@@ -820,6 +826,8 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter
 }
 EXPORT_SYMBOL_GPL(register_fprobe);
 
+static int unregister_fprobe_nolock(struct fprobe *fp);
+
 /**
  * register_fprobe_ips() - Register fprobe to ftrace by address.
  * @fp: A fprobe data structure to be registered.
@@ -846,28 +854,22 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 	if (ret)
 		return ret;
 
-	hlist_array = fp->hlist_array;
 	if (fprobe_is_ftrace(fp))
 		ret = fprobe_ftrace_add_ips(addrs, num);
 	else
 		ret = fprobe_graph_add_ips(addrs, num);
-
-	if (!ret) {
-		add_fprobe_hash(fp);
-		for (i = 0; i < hlist_array->size; i++) {
-			ret = insert_fprobe_node(&hlist_array->array[i]);
-			if (ret)
-				break;
-		}
-		/* fallback on insert error */
-		if (ret) {
-			for (i--; i >= 0; i--)
-				delete_fprobe_node(&hlist_array->array[i]);
-		}
+	if (ret) {
+		fprobe_fail_cleanup(fp);
+		return ret;
 	}
 
+	hlist_array = fp->hlist_array;
+	ret = add_fprobe_hash(fp);
+	for (i = 0; i < hlist_array->size && !ret; i++)
+		ret = insert_fprobe_node(&hlist_array->array[i], fp);
+
 	if (ret)
-		fprobe_fail_cleanup(fp);
+		unregister_fprobe_nolock(fp);
 
 	return ret;
 }
@@ -911,27 +913,12 @@ bool fprobe_is_registered(struct fprobe *fp)
 	return true;
 }
 
-/**
- * unregister_fprobe() - Unregister fprobe.
- * @fp: A fprobe data structure to be unregistered.
- *
- * Unregister fprobe (and remove ftrace hooks from the function entries).
- *
- * Return 0 if @fp is unregistered successfully, -errno if not.
- */
-int unregister_fprobe(struct fprobe *fp)
+static int unregister_fprobe_nolock(struct fprobe *fp)
 {
-	struct fprobe_hlist *hlist_array;
+	struct fprobe_hlist *hlist_array = fp->hlist_array;
 	unsigned long *addrs = NULL;
-	int ret = 0, i, count;
+	int i, count;
 
-	mutex_lock(&fprobe_mutex);
-	if (!fp || !fprobe_registered(fp)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	hlist_array = fp->hlist_array;
 	addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
 	/*
 	 * This will remove fprobe_hash_node from the hash table even if
@@ -957,12 +944,26 @@ int unregister_fprobe(struct fprobe *fp)
 
 	kfree_rcu(hlist_array, rcu);
 	fp->hlist_array = NULL;
+	kfree(addrs);
 
-out:
-	mutex_unlock(&fprobe_mutex);
+	return 0;
+}
 
-	kfree(addrs);
-	return ret;
+/**
+ * unregister_fprobe() - Unregister fprobe.
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+	guard(mutex)(&fprobe_mutex);
+	if (!fp || !fprobe_registered(fp))
+		return -EINVAL;
+
+	return unregister_fprobe_nolock(fp);
 }
 EXPORT_SYMBOL_GPL(unregister_fprobe);
 


^ permalink raw reply related

* [PATCH v9 2/8] tracing/fprobe: Unregister fprobe even if memory allocation fails
From: Masami Hiramatsu (Google) @ 2026-04-17 16:18 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177644266147.584467.8179035927318998910.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

unregister_fprobe() can fail under memory pressure because of memory
allocation failure, but this maybe called from module unloading, and
usually there is no way to retry it. Moreover. trace_fprobe does not
check the return value.

To fix this problem, unregister fprobe and fprobe_hash_node even if
working memory allocation fails.
Anyway, if the last fprobe is removed, the filter will be freed.

Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v9:
  - Clear ftrace_ops filter when unregister it.
 Changes in v7:
  - Newly added.
---
 kernel/trace/fprobe.c |   25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index af9ba7250874..a2b659006e0e 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -324,9 +324,10 @@ static void fprobe_ftrace_remove_ips(unsigned long *addrs, int num)
 	lockdep_assert_held(&fprobe_mutex);
 
 	fprobe_ftrace_active--;
-	if (!fprobe_ftrace_active)
+	if (!fprobe_ftrace_active) {
 		unregister_ftrace_function(&fprobe_ftrace_ops);
-	if (num)
+		ftrace_free_filter(&fprobe_ftrace_ops);
+	} else if (num)
 		ftrace_set_filter_ips(&fprobe_ftrace_ops, addrs, num, 1, 0);
 }
 
@@ -525,10 +526,10 @@ static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
 
 	fprobe_graph_active--;
 	/* Q: should we unregister it ? */
-	if (!fprobe_graph_active)
+	if (!fprobe_graph_active) {
 		unregister_ftrace_graph(&fprobe_graph_ops);
-
-	if (num)
+		ftrace_free_filter(&fprobe_graph_ops.ops);
+	} else if (num)
 		ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
 }
 
@@ -932,15 +933,19 @@ int unregister_fprobe(struct fprobe *fp)
 
 	hlist_array = fp->hlist_array;
 	addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
-	if (!addrs) {
-		ret = -ENOMEM;	/* TODO: Fallback to one-by-one loop */
-		goto out;
-	}
+	/*
+	 * This will remove fprobe_hash_node from the hash table even if
+	 * memory allocation fails. However, ftrace_ops will not be updated.
+	 * Anyway, when the last fprobe is unregistered, ftrace_ops is also
+	 * unregistered.
+	 */
+	if (!addrs)
+		pr_warn("Failed to allocate working array. ftrace_ops may not sync.\n");
 
 	/* Remove non-synonim ips from table and hash */
 	count = 0;
 	for (i = 0; i < hlist_array->size; i++) {
-		if (!delete_fprobe_node(&hlist_array->array[i]))
+		if (!delete_fprobe_node(&hlist_array->array[i]) && addrs)
 			addrs[count++] = hlist_array->array[i].addr;
 	}
 	del_fprobe_hash(fp);


^ permalink raw reply related

* [PATCH v9 1/8] tracing/fprobe: Reject registration of a registered fprobe before init
From: Masami Hiramatsu (Google) @ 2026-04-17 16:17 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177644266147.584467.8179035927318998910.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Reject registration of a registered fprobe which is on the fprobe
hash table before initializing fprobe.
The add_fprobe_hash() checks this re-register fprobe, but since
fprobe_init() clears hlist_array field, it is too late to check it.
It has to check the re-registration before touncing fprobe.

Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer")
Cc: stable@vger.kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v6:
  - Newly added.
---
 kernel/trace/fprobe.c |   21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 56d145017902..af9ba7250874 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -4,6 +4,7 @@
  */
 #define pr_fmt(fmt) "fprobe: " fmt
 
+#include <linux/cleanup.h>
 #include <linux/err.h>
 #include <linux/fprobe.h>
 #include <linux/kallsyms.h>
@@ -107,7 +108,7 @@ static bool delete_fprobe_node(struct fprobe_hlist_node *node)
 }
 
 /* Check existence of the fprobe */
-static bool is_fprobe_still_exist(struct fprobe *fp)
+static bool fprobe_registered(struct fprobe *fp)
 {
 	struct hlist_head *head;
 	struct fprobe_hlist *fph;
@@ -120,7 +121,7 @@ static bool is_fprobe_still_exist(struct fprobe *fp)
 	}
 	return false;
 }
-NOKPROBE_SYMBOL(is_fprobe_still_exist);
+NOKPROBE_SYMBOL(fprobe_registered);
 
 static int add_fprobe_hash(struct fprobe *fp)
 {
@@ -132,9 +133,6 @@ static int add_fprobe_hash(struct fprobe *fp)
 	if (WARN_ON_ONCE(!fph))
 		return -EINVAL;
 
-	if (is_fprobe_still_exist(fp))
-		return -EEXIST;
-
 	head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
 	hlist_add_head_rcu(&fp->hlist_array->hlist, head);
 	return 0;
@@ -149,7 +147,7 @@ static int del_fprobe_hash(struct fprobe *fp)
 	if (WARN_ON_ONCE(!fph))
 		return -EINVAL;
 
-	if (!is_fprobe_still_exist(fp))
+	if (!fprobe_registered(fp))
 		return -ENOENT;
 
 	fph->fp = NULL;
@@ -480,7 +478,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
 		if (!fp)
 			break;
 		curr += FPROBE_HEADER_SIZE_IN_LONG;
-		if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
+		if (fprobe_registered(fp) && !fprobe_disabled(fp)) {
 			if (WARN_ON_ONCE(curr + size > size_words))
 				break;
 			fp->exit_handler(fp, trace->func, ret_ip, fregs,
@@ -839,12 +837,14 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 	struct fprobe_hlist *hlist_array;
 	int ret, i;
 
+	guard(mutex)(&fprobe_mutex);
+	if (fprobe_registered(fp))
+		return -EEXIST;
+
 	ret = fprobe_init(fp, addrs, num);
 	if (ret)
 		return ret;
 
-	mutex_lock(&fprobe_mutex);
-
 	hlist_array = fp->hlist_array;
 	if (fprobe_is_ftrace(fp))
 		ret = fprobe_ftrace_add_ips(addrs, num);
@@ -864,7 +864,6 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
 				delete_fprobe_node(&hlist_array->array[i]);
 		}
 	}
-	mutex_unlock(&fprobe_mutex);
 
 	if (ret)
 		fprobe_fail_cleanup(fp);
@@ -926,7 +925,7 @@ int unregister_fprobe(struct fprobe *fp)
 	int ret = 0, i, count;
 
 	mutex_lock(&fprobe_mutex);
-	if (!fp || !is_fprobe_still_exist(fp)) {
+	if (!fp || !fprobe_registered(fp)) {
 		ret = -EINVAL;
 		goto out;
 	}


^ permalink raw reply related

* [PATCH v9 0/8] tracing/fprobe: Fix fprobe_ip_table related bugs
From: Masami Hiramatsu (Google) @ 2026-04-17 16:17 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Menglong Dong, Mathieu Desnoyers, jiang.biao, linux-kernel,
	linux-trace-kernel

Hi,

Here is the 9th version of fprobe bugfix series.
The previous version is here.

https://lore.kernel.org/all/177633460058.3479617.11868368413034643565.stgit@mhiramat.tok.corp.google.com/

This version ensures the ftrace_ops filter is cleared when unregistering
an fprobe, even if memory allocation fails during the unregistration
process[2/8] and fixed module unloading issues by removing
fprobe_graph_active and fprobe_ftrace_active to handle cases where
fprobes are removed after a module is unloaded[6/8], and update
selftests/ftrace to updated the module test to use "trace-events-sample"
instead of "trace_events_sample", added checks for module unloading and
removed the core-kernel event case, and ensures the test module exists
when unloading it in the EXIT handler.[7/8]
(Also, this rebased on probes-v7.1)

There seems RCU sync issue repeatedly reported but it is in fprobe API
level issue. Unlike kprobes, fprobe users should use kfree_rcu() or
call synchronize_rcu() explicitly by themselves, because it will
use some common resource in its handler. Tracing subsystem does that.

Thank you!

Masami Hiramatsu (Google) (8):
      tracing/fprobe: Reject registration of a registered fprobe before init
      tracing/fprobe: Unregister fprobe even if memory allocation fails
      tracing/fprobe: Remove fprobe from hash in failure path
      tracing/fprobe: Avoid kcalloc() in rcu_read_lock section
      tracing/fprobe: Check the same type fprobe on table as the unregistered one
      tracing/fprobe: Fix to unregister ftrace_ops if it is empty on module unloading
      selftests/ftrace: Add a testcase for fprobe events on module
      selftests/ftrace: Add a testcase for multiple fprobe events


 kernel/trace/fprobe.c                              |  469 +++++++++++++-------
 .../test.d/dynevent/add_remove_fprobe_module.tc    |   87 ++++
 .../test.d/dynevent/add_remove_multiple_fprobe.tc  |   69 +++
 3 files changed, 464 insertions(+), 161 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_fprobe_module.tc
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/add_remove_multiple_fprobe.tc


base-commit: e0a384434ae1bdfb03954c46c464e3dbd3223ad6
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v4] tracing: Bound synthetic-field strings with seq_buf
From: Steven Rostedt @ 2026-04-17 16:16 UTC (permalink / raw)
  To: Pengpeng Hou
  Cc: Masami Hiramatsu, Tom Zanussi, Mathieu Desnoyers,
	linux-trace-kernel, linux-kernel
In-Reply-To: <20260417223001.1-tracing-synth-v4-pengpeng@iscas.ac.cn>

On Fri, 17 Apr 2026 20:20:00 +0800
Pengpeng Hou <pengpeng@iscas.ac.cn> wrote:

> @ -2962,14 +2963,22 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
>  			 char *system, char *event_name, char *field_name)
>  {
>  	struct hist_field *event_var;
> +	struct seq_buf s;
>  	char *synthetic_name;
>  
>  	synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
>  	if (!synthetic_name)
>  		return ERR_PTR(-ENOMEM);
>  
> -	strcpy(synthetic_name, "synthetic_");
> -	strcat(synthetic_name, field_name);
> +	seq_buf_init(&s, synthetic_name, MAX_FILTER_STR_VAL);
> +	seq_buf_puts(&s, "synthetic_");
> +	seq_buf_puts(&s, field_name);

newline

> +	/* Terminate synthetic_name with a NUL. */
> +	seq_buf_str(&s);

newline

> +	if (seq_buf_has_overflowed(&s)) {
> +		kfree(synthetic_name);
> +		return ERR_PTR(-E2BIG);
> +	}
>  
>  	event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
>  
> @@ -3014,7 +3023,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
>  	struct trace_event_file *file;
>  	struct hist_field *key_field;
>  	struct hist_field *event_var;
> -	char *saved_filter;
> +	struct seq_buf s;
>  	char *cmd;
>  	int ret;
>  
> @@ -3059,28 +3068,35 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
>  		return ERR_PTR(-ENOMEM);
>  	}
>  
> +	seq_buf_init(&s, cmd, MAX_FILTER_STR_VAL);
> +
>  	/* Use the same keys as the compatible histogram */
> -	strcat(cmd, "keys=");
> +	seq_buf_puts(&s, "keys=");
>  
>  	for_each_hist_key_field(i, hist_data) {
>  		key_field = hist_data->fields[i];
>  		if (!first)
> -			strcat(cmd, ",");
> -		strcat(cmd, key_field->field->name);
> +			seq_buf_putc(&s, ',');
> +		seq_buf_puts(&s, key_field->field->name);
>  		first = false;
>  	}
>  
>  	/* Create the synthetic field variable specification */
> -	strcat(cmd, ":synthetic_");
> -	strcat(cmd, field_name);
> -	strcat(cmd, "=");
> -	strcat(cmd, field_name);
> +	seq_buf_printf(&s, ":synthetic_%s=%s", field_name, field_name);
>  
>  	/* Use the same filter as the compatible histogram */
> -	saved_filter = find_trigger_filter(hist_data, file);
> -	if (saved_filter) {
> -		strcat(cmd, " if ");
> -		strcat(cmd, saved_filter);
> +	{
> +		char *saved_filter = find_trigger_filter(hist_data, file);
> +
> +		if (saved_filter)
> +			seq_buf_printf(&s, " if %s", saved_filter);
> +	}
> +

Different function. Should have the comment about adding nul here too.

> +	seq_buf_str(&s);

newline

> +	if (seq_buf_has_overflowed(&s)) {
> +		kfree(cmd);
> +		kfree(var_hist);
> +		return ERR_PTR(-E2BIG);
>  	}

-- Steve

^ permalink raw reply

* Re: [PATCH] trace: propagate registration failure from tracing_start_*_record()
From: Steven Rostedt @ 2026-04-17 15:52 UTC (permalink / raw)
  To: Yash Suthar
  Cc: mhiramat, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
	skhan, me, syzbot+a1d25e53cd4a10f7f2d3
In-Reply-To: <20260417063827.84146-1-yashsuthar983@gmail.com>

On Fri, 17 Apr 2026 12:08:27 +0530
Yash Suthar <yashsuthar983@gmail.com> wrote:

> syzbot reported a WARN in tracepoint_probe_unregister():
> 
> tracing_start_sched_switch() increments sched_cmdline_ref /
> sched_tgid_ref before calling tracing_sched_register(), and its
> return value is discarded because the API is void. When the first
> register_trace_sched_*() fails (e.g. kmalloc under memory pressure
> or failslab), the function's fail_deprobe* labels roll back any
> partial probe registration, but the caller's refcount has already
> been bumped. The state is now desynced: refs > 0 but no probes in
> tp->funcs.
> 
> Later, when the caller pairs the start with a stop, the refcount
> walks back to 0 and tracing_sched_unregister() calls
> unregister_trace_sched_*() against an empty tp->funcs.
> func_remove() returns -ENOENT and the
> WARN_ON_ONCE(IS_ERR(old)) in tracepoint_remove_func() fires.
> 
> Fix: make tracing_start_sched_switch() and the two exported
> wrappers, tracing_start_cmdline_record() and
> tracing_start_tgid_record(), return int; register the probes
> before bumping the refcount; and propagate the error to callers
> so refs are only held on behalf of a caller whose registration
> actually succeeded.
> 
> Fixes: d914ba37d714 ("tracing: Add support for recording tgid of tasks")
> Reported-by: syzbot+a1d25e53cd4a10f7f2d3@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?id=f93e97cd824071a2577a40cde9ecd957f59f87eb

Did you use AI to create any of this? If so you must disclose it. This
reads very much like an AI patch.

> 
> Signed-off-by: Yash Suthar <yashsuthar983@gmail.com>
> ---
>  kernel/trace/trace.c                 |  6 +++---
>  kernel/trace/trace.h                 |  4 ++--
>  kernel/trace/trace_events.c          | 28 +++++++++++++++++++--------
>  kernel/trace/trace_functions.c       |  8 +++++++-
>  kernel/trace/trace_functions_graph.c |  6 +++++-
>  kernel/trace/trace_sched_switch.c    | 29 ++++++++++++++++++----------
>  kernel/trace/trace_selftest.c        |  7 ++++++-
>  7 files changed, 62 insertions(+), 26 deletions(-)

NAK on all this. If you are under severe memory constraints that causes
this to fail, then you'll be hitting a bunch more errors.

> 
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 8bd4ec08fb36..e936eed99b27 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -3320,7 +3320,7 @@ void trace_printk_init_buffers(void)
>  	 * allocated here, then this was called by module code.
>  	 */
>  	if (global_trace.array_buffer.buffer)
> -		tracing_start_cmdline_record();
> +		(void)tracing_start_cmdline_record();

WTF??? Why are you adding the typecast of (void) here? Don't do that!


>  }
>  EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
>  
> @@ -3329,7 +3329,7 @@ void trace_printk_start_comm(void)
>  	/* Start tracing comms if trace printk is set */
>  	if (!buffers_allocated)
>  		return;
> -	tracing_start_cmdline_record();
> +	(void)tracing_start_cmdline_record();
>  }
>  
>  static void trace_printk_start_stop_comm(int enabled)
> @@ -3338,7 +3338,7 @@ static void trace_printk_start_stop_comm(int enabled)
>  		return;
>  
>  	if (enabled)
> -		tracing_start_cmdline_record();
> +		(void)tracing_start_cmdline_record();
>  	else
>  		tracing_stop_cmdline_record();
>  }
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index b6d42fe06115..6fe2c8429560 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -751,9 +751,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
>  int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
>  		      struct ftrace_regs *fregs);
>  
> -void tracing_start_cmdline_record(void);
> +int tracing_start_cmdline_record(void);
>  void tracing_stop_cmdline_record(void);
> -void tracing_start_tgid_record(void);
> +int tracing_start_tgid_record(void);
>  void tracing_stop_tgid_record(void);
>  
>  int register_tracer(struct tracer *type);
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 137b4d9bb116..e6713aa80a03 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -734,9 +734,9 @@ void trace_event_enable_cmd_record(bool enable)
>  			continue;
>  
>  		if (enable) {
> -			tracing_start_cmdline_record();
> -			set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
> -		} else {
> +			if (!tracing_start_cmdline_record())
> +				set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
> +		} else if (file->flags & EVENT_FILE_FL_RECORDED_CMD) {
>  			tracing_stop_cmdline_record();
>  			clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
>  		}
> @@ -755,9 +755,9 @@ void trace_event_enable_tgid_record(bool enable)
>  			continue;
>  
>  		if (enable) {
> -			tracing_start_tgid_record();
> -			set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
> -		} else {
> +			if (!tracing_start_tgid_record())
> +				set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
> +		} else if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
>  			tracing_stop_tgid_record();
>  			clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
>  				  &file->flags);
> @@ -847,14 +847,26 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
>  				set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
>  
>  			if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) {
> +				ret = tracing_start_cmdline_record();
> +				if (ret) {
> +					pr_info("event trace: Could not enable event %s\n",
> +						trace_event_name(call));
> +					break;
> +				}
>  				cmd = true;
> -				tracing_start_cmdline_record();
>  				set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
>  			}
>  
>  			if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) {
> +				ret = tracing_start_tgid_record();
> +				if (ret) {
> +					if (cmd)
> +						tracing_stop_cmdline_record();
> +					pr_info("event trace: Could not enable event %s\n",
> +						trace_event_name(call));
> +					break;
> +				}
>  				tgid = true;
> -				tracing_start_tgid_record();
>  				set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
>  			}
>  
> diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
> index c12795c2fb39..14d099734345 100644
> --- a/kernel/trace/trace_functions.c
> +++ b/kernel/trace/trace_functions.c
> @@ -146,6 +146,8 @@ static bool handle_func_repeats(struct trace_array *tr, u32 flags_val)
>  static int function_trace_init(struct trace_array *tr)
>  {
>  	ftrace_func_t func;
> +	int ret;
> +
>  	/*
>  	 * Instance trace_arrays get their ops allocated
>  	 * at instance creation. Unless it failed
> @@ -165,7 +167,11 @@ static int function_trace_init(struct trace_array *tr)
>  
>  	tr->array_buffer.cpu = raw_smp_processor_id();
>  
> -	tracing_start_cmdline_record();
> +	ret = tracing_start_cmdline_record();
> +	if (ret) {
> +		ftrace_reset_array_ops(tr);
> +		return ret;
> +	}
>  	tracing_start_function_trace(tr);
>  	return 0;
>  }
> diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
> index 1de6f1573621..6b27ed62fee8 100644
> --- a/kernel/trace/trace_functions_graph.c
> +++ b/kernel/trace/trace_functions_graph.c
> @@ -487,7 +487,11 @@ static int graph_trace_init(struct trace_array *tr)
>  	ret = register_ftrace_graph(tr->gops);
>  	if (ret)
>  		return ret;
> -	tracing_start_cmdline_record();
> +	ret = tracing_start_cmdline_record();
> +	if (ret) {
> +		unregister_ftrace_graph(tr->gops);
> +		return ret;
> +	}
>  
>  	return 0;
>  }
> diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
> index c46d584ded3b..683ea4ca1498 100644
> --- a/kernel/trace/trace_sched_switch.c
> +++ b/kernel/trace/trace_sched_switch.c
> @@ -89,12 +89,22 @@ static void tracing_sched_unregister(void)
>  	unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
>  }
>  
> -static void tracing_start_sched_switch(int ops)
> +static int tracing_start_sched_switch(int ops)
>  {
> -	bool sched_register;
> +	int ret = 0;
>  
>  	mutex_lock(&sched_register_mutex);
> -	sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
> +
> +	/*
> +	 * If the registration fails, do not bump the reference count : the
> +	 * caller must observe the failure so it can avoid a later matching
> +	 * stop that would otherwise unregister probes that were never added.
> +	 */
> +	if (!sched_cmdline_ref && !sched_tgid_ref) {
> +		ret = tracing_sched_register();
> +		if (ret)
> +			goto out;
> +	}
>  
>  	switch (ops) {
>  	case RECORD_CMDLINE:
> @@ -105,10 +115,9 @@ static void tracing_start_sched_switch(int ops)
>  		sched_tgid_ref++;
>  		break;
>  	}
> -
> -	if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
> -		tracing_sched_register();

The only change that should deal with this would be:

	if (sched_register && (sched_cmdline_ref || sched_tgid_ref)) {
		WARN_ONCE(tracing_sched_register() < 0,
			"Failed to register trace command line caching. Requires reboot to fix");
	}

-- Steve



> +out:
>  	mutex_unlock(&sched_register_mutex);
> +	return ret;
>  }
>  
>  static void tracing_stop_sched_switch(int ops)
> @@ -130,9 +139,9 @@ static void tracing_stop_sched_switch(int ops)
>  	mutex_unlock(&sched_register_mutex);
>  }
>  
> -void tracing_start_cmdline_record(void)
> +int tracing_start_cmdline_record(void)
>  {
> -	tracing_start_sched_switch(RECORD_CMDLINE);
> +	return tracing_start_sched_switch(RECORD_CMDLINE);
>  }
>  
>  void tracing_stop_cmdline_record(void)
> @@ -140,9 +149,9 @@ void tracing_stop_cmdline_record(void)
>  	tracing_stop_sched_switch(RECORD_CMDLINE);
>  }
>  
> -void tracing_start_tgid_record(void)
> +int tracing_start_tgid_record(void)
>  {
> -	tracing_start_sched_switch(RECORD_TGID);
> +	return tracing_start_sched_switch(RECORD_TGID);
>  }
>  
>  void tracing_stop_tgid_record(void)
> diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
> index d88c44f1dfa5..238e7451f8e4 100644
> --- a/kernel/trace/trace_selftest.c
> +++ b/kernel/trace/trace_selftest.c
> @@ -1084,7 +1084,12 @@ trace_selftest_startup_function_graph(struct tracer *trace,
>  		warn_failed_init_tracer(trace, ret);
>  		goto out;
>  	}
> -	tracing_start_cmdline_record();
> +	ret = tracing_start_cmdline_record();
> +	if (ret) {
> +		unregister_ftrace_graph(&fgraph_ops);
> +		warn_failed_init_tracer(trace, ret);
> +		goto out;
> +	}
>  
>  	/* Sleep for a 1/10 of a second */
>  	msleep(100);


^ permalink raw reply

* Re: [PATCH 2/3] init: use static buffers for bootconfig extra command line
From: Breno Leitao @ 2026-04-17 15:38 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Andrew Morton, oss, paulmck, linux-trace-kernel, linux-kernel,
	kernel-team
In-Reply-To: <20260417104436.ece29fd5e2cb7a59c8cf8ac1@kernel.org>


On Fri, Apr 17, 2026 at 10:44:36AM +0900, Masami Hiramatsu wrote:
> On Wed, 15 Apr 2026 03:51:11 -0700
> Breno Leitao <leitao@debian.org> wrote:
>
> But if we can do it, should we continue using bootconfig? I mean
> it is easy to make a tool (or add a feature in tools/bootconfig)
> which converts bootconfig file to command line string and embeds
> it in the kernel. Hmm.

Sure, you are talking about a a tool that embeddeds it in the kernel binary,
something like:


0) Get a kernel and define CONFIG_BOOT_CONFIG_EMBED_FILE=".bootconfig"

1) Add an option in tools/bootconfig to convert bootconfig (.bootconfig)
   to a cmdline string ($ bootconfig -C kernel .bootconfig).
   Something like:
   # tools/bootconfig/bootconfig -C kernel .bootconfig
     mem=2G loglevel=7 debug nokaslr %

2) At kernel build time, run that tool on .bootconfig and embed the
   resulting string into the kernel image as a .init.rodata symbol
   (embedded_kernel_cmdline[]).

   # gdb -batch -ex 'x/s &embedded_kernel_cmdline' vmlinux
   0xffffffff87e108f8:    "mem=2G loglevel=7 debug nokaslr "

3) At boot, the arch's setup_arch() prepends that symbol to
   boot_command_line right before parse_early_param() — so early_param()
   handlers (mem=, earlycon=, loglevel=, ...) actually see kernel.*
   keys from the embedded bootconfig.

   This needs to be architecture by architecture. Something like:

	@@ -924,6 +925,13 @@ void __init setup_arch(char **cmdline_p)
		builtin_cmdline_added = true;
	#endif

	+       /*
	+        * Prepend kernel.* keys from the embedded bootconfig (rendered at
	+        * build time by tools/bootconfig) so parse_early_param() below sees
	+        * them. No-op when CONFIG_BOOT_CONFIG_EMBED=n.
	+        */
	+       xbc_prepend_embedded_cmdline(boot_command_line, COMMAND_LINE_SIZE);
	+
		strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
		*cmdline_p = command_line;

Am I describing your suggestion accordingly?

Thanks!
--breno

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Gregory Price @ 2026-04-17 15:07 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Frank van der Linden, lsf-pc, linux-kernel, linux-cxl, cgroups,
	linux-mm, linux-trace-kernel, damon, kernel-team, gregkh, rafael,
	dakr, dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <6d4f702c-5ad6-4f84-a73e-c9e34965be98@kernel.org>

On Fri, Apr 17, 2026 at 11:50:58AM +0200, David Hildenbrand (Arm) wrote:
> On 4/16/26 03:24, Gregory Price wrote:
> > On Wed, Apr 15, 2026 at 12:47:50PM -0700, Frank van der Linden wrote:
> >>
> > 1GB ZONE_MOVABLE HugeTLBFS Pages is an example weird carve-out, because
> > the memory is in ZONE_MOVABLE to help make 1GB allocations more
> > reliable, but 1GB movable pages were removed from the kernel because
> > they're not easily migrated (and therefore may block hot-unplug).
> > 
> > (Thankfully they're back now, so VMs can live on this memory :P)
> 
> Heh, but longterm-pinning would fail on them (making vfio with VMs
> angry). Similar to CMA hugetlb.
> 

Yeah, depends how you configure things.  As long as you expose those
pages on a separate memfd and online it in ZONE_MOVABLE in the guest
to avoid vfio from touching it - you can have your cake and eat it too.

It's a bit of bodge but it works.

However...

> In the latter case, we should have a way to identify "this allocation is
> actually from the CMA owner, so longterm pinning is perfectly fine".
> Checking the CMA alloc state would be one approach, but that's rather
> nasty. I guess there would be ways to make that work.
> 
> I'd assume that people barely rely on 1GB ZONE_MOVABLE HugeTLBFS Pages
> (iow, mixing kernel-cmdline ZONE_MOVABLE creation with kernel-cmdline
> hugetlb reservation).
> 
> I'll note that there was long long ago a proposal of converting
> ZONE_MOVABLE to "sticky-movable" page blocks. It wouldn't really solve
> this problem, though, where the early boot code just does something
> that's rather stupid.
> 

I have been toying with hotpluggable CMA regions.

Interesting opportunity:

  Hotplug on a private node w/ (RECLAIM | DEMOTION | CMA | HUGETLBFS)

Now you have exactly two enabled consumers:
   1) HugeTLBFS
   2) vmscan.c demotion logic

In this regard, HugeTLBFS is the only one that can reach these pages in
a way that could result in the pages being pinned.

All other pages on the node are - by definition - movable, because they
can only reach the node via migration (demotion).

The system can't do fallback allocations to the node, so it operates a
bit slower as a general purpose memory pool - but if you decide you want
to optimize for that you can unplug/hotplug the memory back to a normal
node in ZONE_MOVABLE - without rebooting.

~Gregory

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Gregory Price @ 2026-04-17 14:45 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <46837cea-5d90-49d8-be67-7306e0e89aa3@kernel.org>

On Fri, Apr 17, 2026 at 11:37:36AM +0200, David Hildenbrand (Arm) wrote:
> > 
> > I'm not married to __GFP_PRIVATE, but it has been reliable for me.
> 
> Yes, we should carefully describe which semantics we want to achieve, to
> then figure out how we could achieve them.
>

Yeah, __GFP_THISNODE does seem similar enough at first look - but its
semantic is actually backwards from the problem we're trying to solve.

__GFP_THISNODE says:  Don't fall back   (restrict access)
__GFP_PRIVATE says:   Enable Allocation (allow access)

But I think there is merit in asking the question whether the problem
is a GFP flag or the current node iterations thoughout the system.

My concern is essentially some driver doing something like:

   for node in possible_nodes:
       alloc_pages_node(..., node, __GFP_THISNODE);

Which, while silly looking, its not hard to imagine such a pattern
accidentally creeping into code in a less obvious form.

I'll take some time to chew on it - maybe the answer is private nodes
should not be in the default node iteration macros either.

I had briefly considered this, but had moved on when I figured out
removing these nodes from the fallback lists.

> >> Again, I am not sure about compaction and khugepaged. All we want to
> >> guarantee is that our memory does not leave the private node.
> >>
> >> That doesn't require any __GFP_PRIVATE magic, just en-lighting these
> >> subsystems that private nodes must use __GFP_THISNODE and must not leak
> >> to other nodes.
> > 
> > This is where specific use-cases matter.
> > 
> > In the compressed memory example - the device doesn't care about memory
> > leaving - but it cares about memory arriving and *and being modified*.
> > (more on this in your next question)
> 
> Right, but naive me would say that that's a memory allocation problem,
> right?
> 

Allocation is only 1 part of the problem - the second is modification.

Putting aside that I don't think this memory should be mempolicy
enabled for the moment - the problem is best described in code:

    /* We have a 512MB compressed memory region */
    buf = malloc(1GB);
    mbind(buf, compressed_node); 

    /* Nothing is faulted yet - our first chance to catch OOM */
    memset(buf, 0x42, 1GB);  /* Allocation - compressed nicely */

    /* Pages are now faulted and have R/W PTEs */
    memcpy(buf, uncompressible, 1GB); 

    /* There is a bear chasing you now, run fast. */

There is nothing an operating system can do to slow down the writer in
this scenario - the memory is faulted and mapped R/W in the page tables.

Another way to think about this is that modification is basically a
"Re-allocation" on the device with the CPU and OS removed from the loop.

So you need both allocation control (private node, dmeotion only) and
modification control (PTE write-protection) to make this reliable.

> khugepaged() wants to allocate a 2M page to collapse. Goes to the buddy
> to allocate it.
> 
> Buddy has to say no if the device cannot support it.
> 
> So there are free pages but we just don't want to hand them out.
>

On the allocation side - I think we can borrow from kernel free page
reporting and/or ballooning to control this aspect.

But on the khugepaged observation... hmm

If we regularly scanned the compressed node, we could soft-protect them
similar to the way numa balancing sets prot_none.

Combined with the node being demotion-only, this might be sufficient
unless you're riding the line pretty hard.

If a write-protect node attribute is a bridge too far, this might be
the best we can do.

Hmmmm. As usual, you have given me something very interesting to chew on
- thank you David.

> > 
> > tl;dr: informative mechanism - but it probably should be dropped,
> > it makes no sense (it's device memory, pinnings mean nothing?).
> 
> What I was thinking: We still have different zone options for this memory.
> 
> Expose memory to ZONE_MOVABLE -> no longterm pinning allowed.
> 
> Expose memory to ZONE_NORMAL -> longterm pinning allowed.
>

Yeah I have this in my pile of notes somewhere and it just fell out of
my context window.

This is actually a nice example of how isolation is better dealt with at
the node level, while ZONE suddenly becomes just another attribute bit.

In my response to Alistair, I pointed out that zones almost become
meaningless on a private node (almost).

If you have a private node in ZONE_NORMAL, and your services are in full
control of how the allocations occur and what code touches them - you
can still (in theory) guarantee the unpluggability of that memory with
proper startup/teardown of the service.

So what's the use in ZONE_MOVABLE existing for a private node? :]

> > 
> > Yeah i'm trying to avoid it, and the answer may actually just exist in
> > the task-death and VMA cleanup path rather than the folio-free path.
> > 
> > From what i've seen of accelerator drivers that implement this, when you
> > inform the driver of a memory region with a task, the driver should have
> > a mechanism to take references on that VMA (or something like this) - so
> > that when the task dies the driver has a way to be notified of the VMA
> > being cleaned up.
> > 
> > This probably exists - I just haven't gotten there yet.
> 
> That sounds reasonable. Alternatively, maybe the buddy can just inform
> the driver about pages getting freed?
>
> Again, just a another random thought. But if these nodes are already
> special-private, then why not enlighten the buddy in some way.
> 
> That also aligns with my "buddy rejects to hand out free pages if the
> device says no" case.
> 
> Something to thinker about.
> 

The only thing i'll push back on here is this implies an ops callback
in the buddy (on free, at least - alloc could be a bitcheck on pgdat).

But yes, the current RFC has a free_folio() callback just like
zone_device.  The problem starts to become obvious when you let
other parts of mm/ touch those pages.

There are at least 3 or 4 different paths back into the buddy that
would need to be instrumented this way.

Some of them are called in NMI contexts.

The questions about "What is safe" start piling up very quick, and they
are hard to answer definitively.  I think we should at make strong attempt
to avoid such things entirely if possible.

~Gregory

^ permalink raw reply

* Re: [PATCH] ftrace: fix use-after-free of mod->name in function_stat_show()
From: Steven Rostedt @ 2026-04-17 14:18 UTC (permalink / raw)
  To: Xiang Gao
  Cc: mhiramat, mark.rutland, mathieu.desnoyers, linux-kernel,
	linux-trace-kernel, Xiang Gao
In-Reply-To: <20260416083335.920555-1-gxxa03070307@gmail.com>


The tracing subsystem expects subjects to start with a capital letter:

  ftrace: Fix use-after-free of mod-name in function_stat_show()


On Thu, 16 Apr 2026 16:33:35 +0800
Xiang Gao <gxxa03070307@gmail.com> wrote:

> From: Xiang Gao <gaoxiang17@xiaomi.com>
> 
> function_stat_show() uses guard(rcu)() inside the else block to hold
> the RCU read lock while calling __module_text_address() and accessing
> mod->name. However, guard(rcu)() ties the RCU read lock lifetime to
> the scope of the else block. The original code stores mod->name into
> refsymbol and uses it in snprintf() after the else block exits,
> at which point the RCU read lock has already been released. If the
> module is concurrently unloaded, mod->name is freed, causing a
> use-after-free.
> 
> Fix by moving the snprintf() call into each branch of the if/else,
> so that mod->name is only accessed while the RCU read lock is held.
> refsymbol now points to the local str buffer (which already contains
> the formatted string) rather than to mod->name, and is only used
> afterwards as a non-NULL indicator to skip the kallsyms_lookup()
> fallback.

Was AI used for any part of this patch? Including finding the bug? If
so, it must be disclosed.

> 
> Signed-off-by: Xiang Gao <gaoxiang17@xiaomi.com>
> ---
>  kernel/trace/ftrace.c | 10 ++++++----
>  1 file changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> index 413310912609..6217b363203c 100644
> --- a/kernel/trace/ftrace.c
> +++ b/kernel/trace/ftrace.c
> @@ -559,21 +559,23 @@ static int function_stat_show(struct seq_file *m, void *v)
>  		unsigned long offset;
>  
>  		if (core_kernel_text(rec->ip)) {
> -			refsymbol = "_text";
>  			offset = rec->ip - (unsigned long)_text;
> +			snprintf(str, sizeof(str), "  %s+%#lx",
> +				 "_text", offset);
> +			refsymbol = str;
>  		} else {
>  			struct module *mod;
>  
>  			guard(rcu)();

Just move guard(rcu) out of this if statement to include the below
reference. No need to make the code worse. This really looks like AI
slop :-(

-- Steve


>  			mod = __module_text_address(rec->ip);
>  			if (mod) {
> -				refsymbol = mod->name;
>  				/* Calculate offset from module's text entry address. */
>  				offset = rec->ip - (unsigned long)mod->mem[MOD_TEXT].base;
> +				snprintf(str, sizeof(str), "  %s+%#lx",
> +					 mod->name, offset);
> +				refsymbol = str;
>  			}
>  		}
> -		if (refsymbol)
> -			snprintf(str, sizeof(str), "  %s+%#lx", refsymbol, offset);
>  	}
>  	if (!refsymbol)
>  		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);


^ permalink raw reply

* Re: [PATCH v2 07/28] fsnotify: add FSNOTIFY_EVENT_RENAME data type
From: Jan Kara @ 2026-04-17 11:56 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein, Calum Mackay, linux-fsdevel,
	linux-kernel, linux-trace-kernel, linux-doc, linux-nfs
In-Reply-To: <20260416-dir-deleg-v2-7-851426a550f6@kernel.org>

On Thu 16-04-26 10:35:08, Jeff Layton wrote:
> Add a new fsnotify_rename_data struct and FSNOTIFY_EVENT_RENAME data
> type that carries both the moved dentry and the inode that was
> overwritten by the rename (if any).
> 
> Update fsnotify_data_inode(), fsnotify_data_dentry(), and
> fsnotify_data_sb() to handle the new type, and add a new
> fsnotify_data_rename_target() helper for extracting the overwritten
> target inode.
> 
> Update fsnotify_move() to use the new data type for FS_RENAME and
> FS_MOVED_TO events, passing the overwritten target inode through the
> event data. FS_MOVED_FROM is unchanged since the source directory
> doesn't need overwrite information.
> 
> This is done so that fsnotify consumers like nfsd can atomically
> observe the overwritten file when a rename replaces an existing entry,
> without needing a separate FS_DELETE event.
> 
> Assisted-by: Claude (Anthropic Claude Code)
> Signed-off-by: Jeff Layton <jlayton@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  include/linux/fsnotify.h         |  8 ++++++--
>  include/linux/fsnotify_backend.h | 20 ++++++++++++++++++++
>  2 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
> index 079c18bcdbde..bda798bc67bc 100644
> --- a/include/linux/fsnotify.h
> +++ b/include/linux/fsnotify.h
> @@ -257,6 +257,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
>  	__u32 new_dir_mask = FS_MOVED_TO;
>  	__u32 rename_mask = FS_RENAME;
>  	const struct qstr *new_name = &moved->d_name;
> +	struct fsnotify_rename_data rd = {
> +		.moved = moved,
> +		.target = target,
> +	};
>  
>  	if (isdir) {
>  		old_dir_mask |= FS_ISDIR;
> @@ -265,12 +269,12 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
>  	}
>  
>  	/* Event with information about both old and new parent+name */
> -	fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY,
> +	fsnotify_name(rename_mask, &rd, FSNOTIFY_EVENT_RENAME,
>  		      old_dir, old_name, 0);
>  
>  	fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE,
>  		      old_dir, old_name, fs_cookie);
> -	fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE,
> +	fsnotify_name(new_dir_mask, &rd, FSNOTIFY_EVENT_RENAME,
>  		      new_dir, new_name, fs_cookie);
>  
>  	if (target)
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 66e185bd1b1b..f8c8fb7f34ae 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -311,6 +311,7 @@ enum fsnotify_data_type {
>  	FSNOTIFY_EVENT_DENTRY,
>  	FSNOTIFY_EVENT_MNT,
>  	FSNOTIFY_EVENT_ERROR,
> +	FSNOTIFY_EVENT_RENAME,
>  };
>  
>  struct fs_error_report {
> @@ -335,6 +336,11 @@ struct fsnotify_mnt {
>  	u64 mnt_id;
>  };
>  
> +struct fsnotify_rename_data {
> +	struct dentry *moved;	/* the dentry that was renamed */
> +	struct inode *target;	/* inode overwritten by rename, or NULL */
> +};
> +
>  static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
>  {
>  	switch (data_type) {
> @@ -348,6 +354,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
>  		return d_inode(file_range_path(data)->dentry);
>  	case FSNOTIFY_EVENT_ERROR:
>  		return ((struct fs_error_report *)data)->inode;
> +	case FSNOTIFY_EVENT_RENAME:
> +		return d_inode(((const struct fsnotify_rename_data *)data)->moved);
>  	default:
>  		return NULL;
>  	}
> @@ -363,6 +371,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ
>  		return ((const struct path *)data)->dentry;
>  	case FSNOTIFY_EVENT_FILE_RANGE:
>  		return file_range_path(data)->dentry;
> +	case FSNOTIFY_EVENT_RENAME:
> +		return ((struct fsnotify_rename_data *)data)->moved;
>  	default:
>  		return NULL;
>  	}
> @@ -395,6 +405,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data,
>  		return file_range_path(data)->dentry->d_sb;
>  	case FSNOTIFY_EVENT_ERROR:
>  		return ((struct fs_error_report *) data)->sb;
> +	case FSNOTIFY_EVENT_RENAME:
> +		return ((const struct fsnotify_rename_data *)data)->moved->d_sb;
>  	default:
>  		return NULL;
>  	}
> @@ -430,6 +442,14 @@ static inline struct fs_error_report *fsnotify_data_error_report(
>  	}
>  }
>  
> +static inline struct inode *fsnotify_data_rename_target(const void *data,
> +							int data_type)
> +{
> +	if (data_type == FSNOTIFY_EVENT_RENAME)
> +		return ((const struct fsnotify_rename_data *)data)->target;
> +	return NULL;
> +}
> +
>  static inline const struct file_range *fsnotify_data_file_range(
>  							const void *data,
>  							int data_type)
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v2 05/28] fsnotify: new tracepoint in fsnotify()
From: Jan Kara @ 2026-04-17 11:49 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein, Calum Mackay, linux-fsdevel,
	linux-kernel, linux-trace-kernel, linux-doc, linux-nfs
In-Reply-To: <20260416-dir-deleg-v2-5-851426a550f6@kernel.org>

On Thu 16-04-26 10:35:06, Jeff Layton wrote:
> Add a tracepoint so we can see exactly how this is being called.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/notify/fsnotify.c            |  5 ++++
>  include/trace/events/fsnotify.h | 51 +++++++++++++++++++++++++++++++++++++++++
>  include/trace/misc/fsnotify.h   | 35 ++++++++++++++++++++++++++++
>  3 files changed, 91 insertions(+)
> 
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index 9995de1710e5..5448738635f6 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -14,6 +14,9 @@
>  #include <linux/fsnotify_backend.h>
>  #include "fsnotify.h"
>  
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/fsnotify.h>
> +
>  /*
>   * Clear all of the marks on an inode when it is being evicted from core
>   */
> @@ -504,6 +507,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
>  	int ret = 0;
>  	__u32 test_mask, marks_mask = 0;
>  
> +	trace_fsnotify(mask, data, data_type, dir, file_name, inode, cookie);
> +
>  	if (path)
>  		mnt = real_mount(path->mnt);
>  
> diff --git a/include/trace/events/fsnotify.h b/include/trace/events/fsnotify.h
> new file mode 100644
> index 000000000000..341bbd57a39b
> --- /dev/null
> +++ b/include/trace/events/fsnotify.h
> @@ -0,0 +1,51 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM fsnotify
> +
> +#if !defined(_TRACE_FSNOTIFY_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_FSNOTIFY_H
> +
> +#include <linux/tracepoint.h>
> +
> +#include <trace/misc/fsnotify.h>
> +
> +TRACE_EVENT(fsnotify,
> +	TP_PROTO(__u32 mask, const void *data, int data_type,
> +		 struct inode *dir, const struct qstr *file_name,
> +		 struct inode *inode, u32 cookie),
> +
> +	TP_ARGS(mask, data, data_type, dir, file_name, inode, cookie),
> +
> +	TP_STRUCT__entry(
> +		__field(__u32, mask)
> +		__field(unsigned long, dir_ino)
> +		__field(unsigned long, ino)
> +		__field(dev_t, s_dev)
> +		__field(int, data_type)
> +		__field(u32, cookie)
> +		__string(file_name, file_name ? (const char *)file_name->name : "")
> +	),
> +
> +	TP_fast_assign(
> +		__entry->mask = mask;
> +		__entry->dir_ino = dir ? dir->i_ino : 0;
> +		__entry->ino = inode ? inode->i_ino : 0;
> +		__entry->s_dev = dir ? dir->i_sb->s_dev :
> +				 inode ? inode->i_sb->s_dev : 0;
> +		__entry->data_type = data_type;
> +		__entry->cookie = cookie;
> +		__assign_str(file_name);
> +	),
> +
> +	TP_printk("dev=%d:%d dir=%lu ino=%lu data_type=%d cookie=0x%x mask=0x%x %s name=%s",
> +		  MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
> +		  __entry->dir_ino, __entry->ino,
> +		  __entry->data_type, __entry->cookie,
> +		  __entry->mask, show_fsnotify_mask(__entry->mask),
> +		  __get_str(file_name))
> +);
> +
> +#endif /* _TRACE_FSNOTIFY_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/include/trace/misc/fsnotify.h b/include/trace/misc/fsnotify.h
> new file mode 100644
> index 000000000000..a201e1bd6d8c
> --- /dev/null
> +++ b/include/trace/misc/fsnotify.h
> @@ -0,0 +1,35 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Display helpers for fsnotify events
> + */
> +
> +#include <linux/fsnotify_backend.h>
> +
> +#define show_fsnotify_mask(mask) \
> +	__print_flags(mask, "|", \
> +		{ FS_ACCESS,		"ACCESS" }, \
> +		{ FS_MODIFY,		"MODIFY" }, \
> +		{ FS_ATTRIB,		"ATTRIB" }, \
> +		{ FS_CLOSE_WRITE,	"CLOSE_WRITE" }, \
> +		{ FS_CLOSE_NOWRITE,	"CLOSE_NOWRITE" }, \
> +		{ FS_OPEN,		"OPEN" }, \
> +		{ FS_MOVED_FROM,	"MOVED_FROM" }, \
> +		{ FS_MOVED_TO,		"MOVED_TO" }, \
> +		{ FS_CREATE,		"CREATE" }, \
> +		{ FS_DELETE,		"DELETE" }, \
> +		{ FS_DELETE_SELF,	"DELETE_SELF" }, \
> +		{ FS_MOVE_SELF,		"MOVE_SELF" }, \
> +		{ FS_OPEN_EXEC,		"OPEN_EXEC" }, \
> +		{ FS_UNMOUNT,		"UNMOUNT" }, \
> +		{ FS_Q_OVERFLOW,	"Q_OVERFLOW" }, \
> +		{ FS_ERROR,		"ERROR" }, \
> +		{ FS_OPEN_PERM,		"OPEN_PERM" }, \
> +		{ FS_ACCESS_PERM,	"ACCESS_PERM" }, \
> +		{ FS_OPEN_EXEC_PERM,	"OPEN_EXEC_PERM" }, \
> +		{ FS_PRE_ACCESS,	"PRE_ACCESS" }, \
> +		{ FS_MNT_ATTACH,	"MNT_ATTACH" }, \
> +		{ FS_MNT_DETACH,	"MNT_DETACH" }, \
> +		{ FS_EVENT_ON_CHILD,	"EVENT_ON_CHILD" }, \
> +		{ FS_RENAME,		"RENAME" }, \
> +		{ FS_DN_MULTISHOT,	"DN_MULTISHOT" }, \
> +		{ FS_ISDIR,		"ISDIR" })
> 
> -- 
> 2.53.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: David Hildenbrand (Arm) @ 2026-04-17  9:50 UTC (permalink / raw)
  To: Gregory Price, Frank van der Linden
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <aeA6aNDpQ-U5UJCs@gourry-fedora-PF4VCD3F>

On 4/16/26 03:24, Gregory Price wrote:
> On Wed, Apr 15, 2026 at 12:47:50PM -0700, Frank van der Linden wrote:
>>
>> This has been a really great discussion. I just wanted to add a few
>> points that I think I have mentioned in other forums, but not here.
>>
>> In essence, this is a discussion about memory properties and the level
>> at which they should be dealt with. Right now there are basically 3
>> levels: pageblocks, zones and nodes. While these levels exist for good
>> reasons, they also sometimes lead to issues. There's duplication of
>> functionality. MIGRATE_CMA and ZONE_MOVABLE both implement the same
>> basic property, but at different levels (attempts have been made to
>> merge them, but it didn't work out).
> 
> I have made this observation as well.  ZONEs in particular are a bit
> odd because they're somehow simultaneously too broad and too narrow in
> terms of what they control and what they're used for.
> 
> 1GB ZONE_MOVABLE HugeTLBFS Pages is an example weird carve-out, because
> the memory is in ZONE_MOVABLE to help make 1GB allocations more
> reliable, but 1GB movable pages were removed from the kernel because
> they're not easily migrated (and therefore may block hot-unplug).
> 
> (Thankfully they're back now, so VMs can live on this memory :P)

Heh, but longterm-pinning would fail on them (making vfio with VMs
angry). Similar to CMA hugetlb.

In the latter case, we should have a way to identify "this allocation is
actually from the CMA owner, so longterm pinning is perfectly fine".
Checking the CMA alloc state would be one approach, but that's rather
nasty. I guess there would be ways to make that work.

I'd assume that people barely rely on 1GB ZONE_MOVABLE HugeTLBFS Pages
(iow, mixing kernel-cmdline ZONE_MOVABLE creation with kernel-cmdline
hugetlb reservation).

I'll note that there was long long ago a proposal of converting
ZONE_MOVABLE to "sticky-movable" page blocks. It wouldn't really solve
this problem, though, where the early boot code just does something
that's rather stupid.


-- 
Cheers,

David

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: David Hildenbrand (Arm) @ 2026-04-17  9:39 UTC (permalink / raw)
  To: Frank van der Linden, Gregory Price
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <CAPTztWajm_JLpp9BjRcX=h72r25ELrXeGkOXVachybBxLJGS=g@mail.gmail.com>

On 4/15/26 21:47, Frank van der Linden wrote:
> On Wed, Apr 15, 2026 at 8:18 AM Gregory Price <gourry@gourry.net> wrote:
>>
>> On Wed, Apr 15, 2026 at 11:49:59AM +0200, David Hildenbrand (Arm) wrote:
>>
>> As a preface - the current RFC was informed by ZONE_DEVICE patterns.
>>
>> I think that was useful as a way to find existing friction points - but
>> ultimately wrong for this new interface.
>>
>> I don't thinks an ops struct here is the right design, and I think there
>> are only a few patterns that actually make sense for device memory using
>> nodes this way.
>>
>> So there's going to be a *major* contraction in the complexity of this
>> patch series (hopefully I'll have something next week), and much of what
>> you point out below is already in-flight.
>>
>>>
>> ... snip ...
>>>
>>> A related series proposed some  MEM_READ/WRITE backend requests [1]
>>>
>>> [1] https://lists.nongnu.org/archive/html/qemu-devel/2024-09/msg02693.html
>>>
>>
>> Oh interesting, thank you for the reference here.
>>
>>>
>>> Something else people were discussing in the past was to physically
>>> limit the area where virtio queues could be placed.
>>>
>>
>> That is functionally what I did - the idea was pretty simple, just have
>> a separate memfd/node dedicated for the queues:
>>
>> guest_memory = memfd(MAP_PRIVATE)
>> net_memory = memfd(MAP_SHARED)
>>
>> And boom, you get what you want.
>>
>> So yeah "It works" - but there's likely other ways to do this too, and
>> as you note re: compatibility, i'm not sure virtio actually wants this,
>> but it's a nice proof-of-concept for a network device on the host that
>> carries its own memory.
>>
>> I'll try post my hack as an example with the next RFC version, as I
>> think it's informative.
>>
>>>
>>> But that's a different "fallback" problem, no?
>>>
>>> You want allocations that target the "special node" to fallback to
>>> *other* nodes, but not other allocations to fallback to *this special* node.
>>>
>> ... snip - slight reordering to put thoughts together ...
>>>
>>> Needs a second thought regarding fallback logic I raised above.
>>>
>>> What I think would have to be audited is the usage of __GFP_THISNODE by
>>> kernel allocations, where we would not actually want to allocate from
>>> this private node.
>>>
>>
>> This is fair, and I a re-visit is absolutely warranted.
>>
>> Re-examining the quick audit from my last response suggests - I should
>> never have seen leakage in those cases, but the fallbacks are needed.
>>
>> So yes, this all requires a second look (and a third, and a ninth).
>>
>> I'm not married to __GFP_PRIVATE, but it has been reliable for me.
>>
>>> Maybe we could just outright refuse *any* non-user (movable) allocations
>>> that target the node, even with __GFP_THISNODE.
>>>
>>> Because, why would we want kernel allocations to even end up on a
>>> private node that is supposed to only be consumed by user space? Or
>>> which use cases are there where we would want to place kernel
>>> allocations on there?
>>>
>>
>> As a start, maybe? But as a permanent invariant?  I would wonder whether
>> the decision here would lock us into a design.
>>
>> But then - this is all kernel internal, so i think it would be feasible
>> to change this out from under users without backward compatibility pain.
>>
>> So far I have done my best to avoid changing any userland interfaces in
>> a way that would fundamentally change the contracts.  If anything
>> private-node other than just the node's `has_memory_private` attribute
>> leaks into userland, someone messed up.
>>
>> So... I think that's reasonable.
>>
>>>
>>> I assume you will be as LSF/MM? Would be good to discuss some of that in
>>> person.
>>>
>>
>> Yes, looking forward to it :]
>>
>>
>>>
>>>
>>> Again, I am not sure about compaction and khugepaged. All we want to
>>> guarantee is that our memory does not leave the private node.
>>>
>>> That doesn't require any __GFP_PRIVATE magic, just en-lighting these
>>> subsystems that private nodes must use __GFP_THISNODE and must not leak
>>> to other nodes.
>>
>> This is where specific use-cases matter.
>>
>> In the compressed memory example - the device doesn't care about memory
>> leaving - but it cares about memory arriving and *and being modified*.
>> (more on this in your next question)
>>
>> So i'm not convinced *all possible devices* would always want to support
>> move_pages(), mbind(), and set_mempolicy().
>>
>> But, I do want to give this serious thought, and I agree the absolute
>> minimal patch set could just be the fallback control mechanism and
>> mm/ component filters/audit on __GFP_*.
>>
>>
>>>
>>> I'm missing why these are even opt-in. What's the problem with allowing
>>> mbind and mempolicy to use these nodes in some of your drivers?
>>>
>>
>> First:
>>
>> In my latest working branch these two flags have been folded into just
>> _OPS_MEMPOLICY and any other migration interaction is just handled by
>> filtering with the GFP flag.
>>
>>
>> on always allowing mbind and mempolicy vs opt-in
>> ---
>>
>> A proper compressed memory solution should not allow mbind/mempolicy.
>>
>> Compressed memory is different from normal memory - as the kernel can
>> percieves free memory (many unused struct page in the buddy) when the
>> device knows there's none left (the physical capacity is actually full).
>>
>> Any form of write to a compressed memory device is essentially a
>> dangerous condition (OOMs = poison, not oom_kill()).
>>
>> So you need two controls:  Allocation and (userland) Write protection
>> I implemented via:
>>     - Demotion-only (allocations only happen in reclaim path)
>>     - Write-protecting the entire node
>>
>> (I fully accept that a write-protection extension here might be a bridge
>>  to far, but please stick with me for the sake of exploration).
>>
>>
>> There's a serious argument to limit these devices to using an mbind
>> pattern, but I wanted to make a full-on attempt to integrate this device
>> into the demotion path as a transparent tier (kinda like zswap).
>>
>> I could not square write-protection with mempolicy, so i had to make
>> them both optional and mutually exclusive.
>>
>> If you limit the device to mbind interactions, you do limit what can
>> crash - but this forces userland software to be less portable by design:
>>
>>   - am i running on a system where this device is present?
>>   - is that device exposing its memory on a node?
>>   - which node?
>>   - what memory can i put on that node? (can you prevent a process from
>>     putting libc on that node?)
>>   - how much compression ratio is left on the device?
>>   - can i safety write to this virtual address?
>>   - should i write-protect compressed VMAs? Can i handle those faults?
>>   - many more
>>
>> That sounds a lot like re-implementing a bunch of mm/ in userland, and
>> that's exactly where we were at with DAX.  We know this pattern failed.
>>
>> I'm trying to very much avoid repeating these mistakes, and so I'm very
>> much trying to find a good path forward here that results in transparent
>> usage of this memory.
>>
>>
>>> I also have some questions about longterm pinnings, but that's better
>>> discussed in person :)
>>>
>>
>> The longterm pin extention came from auditing existing zone_device
>> filters.
>>
>> tl;dr: informative mechanism - but it probably should be dropped,
>> it makes no sense (it's device memory, pinnings mean nothing?).
>>
>>
>>>
>>> Right, that's rather invasive.
>>>
>>
>> Yeah i'm trying to avoid it, and the answer may actually just exist in
>> the task-death and VMA cleanup path rather than the folio-free path.
>>
>> From what i've seen of accelerator drivers that implement this, when you
>> inform the driver of a memory region with a task, the driver should have
>> a mechanism to take references on that VMA (or something like this) - so
>> that when the task dies the driver has a way to be notified of the VMA
>> being cleaned up.
>>
>> This probably exists - I just haven't gotten there yet.
>>
>> ~Gregory
> 
> This has been a really great discussion. I just wanted to add a few
> points that I think I have mentioned in other forums, but not here.
> 
> In essence, this is a discussion about memory properties and the level
> at which they should be dealt with. Right now there are basically 3
> levels: pageblocks, zones and nodes. While these levels exist for good
> reasons, they also sometimes lead to issues. There's duplication of
> functionality. MIGRATE_CMA and ZONE_MOVABLE both implement the same
> basic property, but at different levels (attempts have been made to
> merge them, but it didn't work out). There's also memory with clashing
> properties inhabiting the same data structure: LRUs. Having strictly
> movable memory on the same LRU as unmovable memory is a mismatch. It
> leads to the well known problem of reclaim done in the name of an
> unmovable allocation attempt can be entirely pointless in the face of
> large amounts of ZONE_MOVABLE or MIGRATE_CMA memory: the anon LRU will
> be chock full of movable-only pages. Reclaiming them is useless for
> your allocation, and skipping them leads to locking up the system
> because you're holding on to the LRU lock a long time.
> 
> So, looking at having some properties set at the node level makes
> sense to me even in the non-device case. But perhaps that is out of
> scope for the initial discussion.
> 
> One use case that seems like a good match for private nodes is guest
> memory. Guest memory is special enough to want to allocate / maintain
> it separately, which is acknowledged by the introduction of
> guest_memfd.

Yes. There is now an interface to configure mbind() for guest_memfd. So
with that and some tweaks, maybe that ... would just work, if we get the
mbind() interaction right?

-- 
Cheers,

David

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: David Hildenbrand (Arm) @ 2026-04-17  9:37 UTC (permalink / raw)
  To: Gregory Price
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <ad-r7hwIdnvKsrh9@gourry-fedora-PF4VCD3F>

On 4/15/26 17:17, Gregory Price wrote:
> On Wed, Apr 15, 2026 at 11:49:59AM +0200, David Hildenbrand (Arm) wrote:
>> On 4/13/26 19:05, Gregory Price wrote:
> 
> As a preface - the current RFC was informed by ZONE_DEVICE patterns.

:)

> 
> I think that was useful as a way to find existing friction points - but
> ultimately wrong for this new interface.
> 
> I don't thinks an ops struct here is the right design, and I think there
> are only a few patterns that actually make sense for device memory using
> nodes this way.
> 
> So there's going to be a *major* contraction in the complexity of this
> patch series (hopefully I'll have something next week), and much of what
> you point out below is already in-flight.

Sounds like this discussion was valuable. Sorry for not being that
responsive  ... repeatedly :)


[...]

>>
>> Something else people were discussing in the past was to physically
>> limit the area where virtio queues could be placed.
>>
> 
> That is functionally what I did - the idea was pretty simple, just have
> a separate memfd/node dedicated for the queues:
> 
> guest_memory = memfd(MAP_PRIVATE)
> net_memory = memfd(MAP_SHARED)
> 
> And boom, you get what you want.
> 
> So yeah "It works" - but there's likely other ways to do this too, and
> as you note re: compatibility, i'm not sure virtio actually wants this,
> but it's a nice proof-of-concept for a network device on the host that
> carries its own memory.
> 

Jup.


[...]

>> Needs a second thought regarding fallback logic I raised above.
>>
>> What I think would have to be audited is the usage of __GFP_THISNODE by
>> kernel allocations, where we would not actually want to allocate from
>> this private node.
>>
> 
> This is fair, and I a re-visit is absolutely warranted.
> 
> Re-examining the quick audit from my last response suggests - I should
> never have seen leakage in those cases, but the fallbacks are needed.
> 
> So yes, this all requires a second look (and a third, and a ninth).
> 
> I'm not married to __GFP_PRIVATE, but it has been reliable for me.

Yes, we should carefully describe which semantics we want to achieve, to
then figure out how we could achieve them.

> 
>> Maybe we could just outright refuse *any* non-user (movable) allocations
>> that target the node, even with __GFP_THISNODE.
>>
>> Because, why would we want kernel allocations to even end up on a
>> private node that is supposed to only be consumed by user space? Or
>> which use cases are there where we would want to place kernel
>> allocations on there?
>>
> 
> As a start, maybe? But as a permanent invariant?  I would wonder whether
> the decision here would lock us into a design.
> 
> But then - this is all kernel internal, so i think it would be feasible
> to change this out from under users without backward compatibility pain.

Right. Was just an idea, whether it would currently even make sense to
allow any kernel allocations on there.

The handful of kernel allocations that would be allowed to end up on
there would likely be extremely special.

[...]

>> Again, I am not sure about compaction and khugepaged. All we want to
>> guarantee is that our memory does not leave the private node.
>>
>> That doesn't require any __GFP_PRIVATE magic, just en-lighting these
>> subsystems that private nodes must use __GFP_THISNODE and must not leak
>> to other nodes.
> 
> This is where specific use-cases matter.
> 
> In the compressed memory example - the device doesn't care about memory
> leaving - but it cares about memory arriving and *and being modified*.
> (more on this in your next question)

Right, but naive me would say that that's a memory allocation problem,
right?

khugepaged() wants to allocate a 2M page to collapse. Goes to the buddy
to allocate it.

Buddy has to say no if the device cannot support it.

So there are free pages but we just don't want to hand them out.

I am being very naive here about the compressed memory scenario, because
it's one of these extremely weird corner cases ;)

[...]

>>> If you want the mbind contract to stay intact:
>>>
>>>    NP_OPS_MIGRATION (mbind can generate migrations)
>>>    NP_OPS_MEMPOLICY (this just tells mempolicy.c to allow the node)
>>
>> I'm missing why these are even opt-in. What's the problem with allowing
>> mbind and mempolicy to use these nodes in some of your drivers?
>>
> 
> First:
> 
> In my latest working branch these two flags have been folded into just
> _OPS_MEMPOLICY and any other migration interaction is just handled by
> filtering with the GFP flag. 
> 
> 
> on always allowing mbind and mempolicy vs opt-in
> ---
> 
> A proper compressed memory solution should not allow mbind/mempolicy.
> 
> Compressed memory is different from normal memory - as the kernel can
> percieves free memory (many unused struct page in the buddy) when the
> device knows there's none left (the physical capacity is actually full).
> 
> Any form of write to a compressed memory device is essentially a
> dangerous condition (OOMs = poison, not oom_kill()).
> 
> So you need two controls:  Allocation and (userland) Write protection
> I implemented via:
>     - Demotion-only (allocations only happen in reclaim path)
>     - Write-protecting the entire node
> 
> (I fully accept that a write-protection extension here might be a bridge
>  to far, but please stick with me for the sake of exploration).
> 
> 
> There's a serious argument to limit these devices to using an mbind
> pattern, but I wanted to make a full-on attempt to integrate this device
> into the demotion path as a transparent tier (kinda like zswap).
> 
> I could not square write-protection with mempolicy, so i had to make
> them both optional and mutually exclusive.
> 
> If you limit the device to mbind interactions, you do limit what can
> crash - but this forces userland software to be less portable by design:
> 
>   - am i running on a system where this device is present?
>   - is that device exposing its memory on a node?
>   - which node?
>   - what memory can i put on that node? (can you prevent a process from
>     putting libc on that node?)
>   - how much compression ratio is left on the device?
>   - can i safety write to this virtual address?
>   - should i write-protect compressed VMAs? Can i handle those faults?
>   - many more
> 
> That sounds a lot like re-implementing a bunch of mm/ in userland, and
> that's exactly where we were at with DAX.  We know this pattern failed.
> 
> I'm trying to very much avoid repeating these mistakes, and so I'm very
> much trying to find a good path forward here that results in transparent
> usage of this memory.
> 

As stated above, maybe that's really just a memory allocation problem
for mbind/khugepaged etc, and the memory allocator would need hooks to
say "well, I do have that free memory. but sorry bro, you really cannot
have it right now because it's actually not really free right now, -ENOMEM".

Devil is in the detail, I suppose.

(again, I consider such devices an extreme corner cases; if it makes the
overall design waaaaayy to complicated, we might just want to say "we
cannot reasonably support this without shittifying MM". But maybe there
are ways to handle this in a better way, as of above)

> 
>> I also have some questions about longterm pinnings, but that's better
>> discussed in person :)
>>
> 
> The longterm pin extention came from auditing existing zone_device
> filters.  
> 
> tl;dr: informative mechanism - but it probably should be dropped,
> it makes no sense (it's device memory, pinnings mean nothing?).

What I was thinking: We still have different zone options for this memory.

Expose memory to ZONE_MOVABLE -> no longterm pinning allowed.

Expose memory to ZONE_NORMAL -> longterm pinning allowed.

And if we don't even allow arbitrary kernel allocations to end up
ZONE_NORMAL of these special nodes, we can just start using ZONE_NORMAL
and let user space (using vfio/iouring fixed buffers etc) consume this
private memory with longterm pinning.

Just a random thought.

> 
> 
>>>
>>> The task dies and frees the pages back to the buddy - the question is
>>> whether the 4-5 free_folio paths (put_folio, put_unref_folios, etc) can
>>> all eat an ops.free_folio() callback to inform the driver the memory has
>>> been freed.
>>
>> Right, that's rather invasive.
>>
> 
> Yeah i'm trying to avoid it, and the answer may actually just exist in
> the task-death and VMA cleanup path rather than the folio-free path.
> 
> From what i've seen of accelerator drivers that implement this, when you
> inform the driver of a memory region with a task, the driver should have
> a mechanism to take references on that VMA (or something like this) - so
> that when the task dies the driver has a way to be notified of the VMA
> being cleaned up.
> 
> This probably exists - I just haven't gotten there yet.

That sounds reasonable. Alternatively, maybe the buddy can just inform
the driver about pages getting freed?

Again, just a another random thought. But if these nodes are already
special-private, then why not enlighten the buddy in some way.

That also aligns with my "buddy rejects to hand out free pages if the
device says no" case.

Something to thinker about.

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH 0/3] mm: split the file's i_mmap tree for NUMA
From: Huang Shijie @ 2026-04-17  6:59 UTC (permalink / raw)
  To: Mateusz Guzik
  Cc: akpm, viro, brauner, linux-mm, linux-kernel, linux-arm-kernel,
	linux-fsdevel, muchun.song, osalvador, linux-trace-kernel,
	linux-perf-users, linux-parisc, nvdimm, zhongyuan, fangbaoshun,
	yingzhiwei
In-Reply-To: <76pfiwabdgsej6q2yxfh3efuqvsyg7mt7rvl5itzzjyhdrto5r@53viaxsackzv>

On Mon, Apr 13, 2026 at 05:33:21PM +0200, Mateusz Guzik wrote:
> On Mon, Apr 13, 2026 at 02:20:39PM +0800, Huang Shijie wrote:
> >   In NUMA, there are maybe many NUMA nodes and many CPUs.
> > For example, a Hygon's server has 12 NUMA nodes, and 384 CPUs.
> > In the UnixBench tests, there is a test "execl" which tests
> > the execve system call.
> > 
> >   When we test our server with "./Run -c 384 execl",
> > the test result is not good enough. The i_mmap locks contended heavily on
> > "libc.so" and "ld.so". For example, the i_mmap tree for "libc.so" can have 
> > over 6000 VMAs, all the VMAs can be in different NUMA mode.
> > The insert/remove operations do not run quickly enough.
> > 
> > patch 1 & patch 2 are try to hide the direct access of i_mmap.
> > patch 3 splits the i_mmap into sibling trees, and we can get better 
> > performance with this patch set:
> >     we can get 77% performance improvement(10 times average)
> > 
> 
> To my reading you kept the lock as-is and only distributed the protected
> state.
> 
> While I don't doubt the improvement, I'm confident should you take a
> look at the profile you are going to find this still does not scale with
> rwsem being one of the problems (there are other global locks, some of
> which have experimental patches for).
> 
> Apart from that this does nothing to help high core systems which are
> all one node, which imo puts another question mark on this specific
> proposal.
> 
> Of course one may question whether a RB tree is the right choice here,
> it may be the lock-protected cost can go way down with merely a better
> data structure.
> 
> Regardless of that, for actual scalability, there will be no way around
> decentralazing locking around this and partitioning per some core count
> (not just by numa awareness).
> 
> Decentralizing locking is definitely possible, but I have not looked
> into specifics of how problematic it is. Best case scenario it will
> merely with separate locks. Worst case scenario something needs a fully
> stabilized state for traversal, in that case another rw lock can be
> slapped around this, creating locking order read lock -> per-subset
> write lock -- this will suffer scalability due to the read locking, but
> it will still scale drastically better as apart from that there will be
> no serialization. In this setting the problematic consumer will write
> lock the new thing to stabilize the state.
For your proposal in no-numa, I hope you can create a patch set for it.
I can test it in our machine.

Thanks
Huang Shijie


^ permalink raw reply

* [PATCH] trace: propagate registration failure from tracing_start_*_record()
From: Yash Suthar @ 2026-04-17  6:38 UTC (permalink / raw)
  To: rostedt, mhiramat
  Cc: mathieu.desnoyers, linux-kernel, linux-trace-kernel, skhan, me,
	Yash Suthar, syzbot+a1d25e53cd4a10f7f2d3

syzbot reported a WARN in tracepoint_probe_unregister():

tracing_start_sched_switch() increments sched_cmdline_ref /
sched_tgid_ref before calling tracing_sched_register(), and its
return value is discarded because the API is void. When the first
register_trace_sched_*() fails (e.g. kmalloc under memory pressure
or failslab), the function's fail_deprobe* labels roll back any
partial probe registration, but the caller's refcount has already
been bumped. The state is now desynced: refs > 0 but no probes in
tp->funcs.

Later, when the caller pairs the start with a stop, the refcount
walks back to 0 and tracing_sched_unregister() calls
unregister_trace_sched_*() against an empty tp->funcs.
func_remove() returns -ENOENT and the
WARN_ON_ONCE(IS_ERR(old)) in tracepoint_remove_func() fires.

Fix: make tracing_start_sched_switch() and the two exported
wrappers, tracing_start_cmdline_record() and
tracing_start_tgid_record(), return int; register the probes
before bumping the refcount; and propagate the error to callers
so refs are only held on behalf of a caller whose registration
actually succeeded.

Fixes: d914ba37d714 ("tracing: Add support for recording tgid of tasks")
Reported-by: syzbot+a1d25e53cd4a10f7f2d3@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?id=f93e97cd824071a2577a40cde9ecd957f59f87eb

Signed-off-by: Yash Suthar <yashsuthar983@gmail.com>
---
 kernel/trace/trace.c                 |  6 +++---
 kernel/trace/trace.h                 |  4 ++--
 kernel/trace/trace_events.c          | 28 +++++++++++++++++++--------
 kernel/trace/trace_functions.c       |  8 +++++++-
 kernel/trace/trace_functions_graph.c |  6 +++++-
 kernel/trace/trace_sched_switch.c    | 29 ++++++++++++++++++----------
 kernel/trace/trace_selftest.c        |  7 ++++++-
 7 files changed, 62 insertions(+), 26 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8bd4ec08fb36..e936eed99b27 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3320,7 +3320,7 @@ void trace_printk_init_buffers(void)
 	 * allocated here, then this was called by module code.
 	 */
 	if (global_trace.array_buffer.buffer)
-		tracing_start_cmdline_record();
+		(void)tracing_start_cmdline_record();
 }
 EXPORT_SYMBOL_GPL(trace_printk_init_buffers);
 
@@ -3329,7 +3329,7 @@ void trace_printk_start_comm(void)
 	/* Start tracing comms if trace printk is set */
 	if (!buffers_allocated)
 		return;
-	tracing_start_cmdline_record();
+	(void)tracing_start_cmdline_record();
 }
 
 static void trace_printk_start_stop_comm(int enabled)
@@ -3338,7 +3338,7 @@ static void trace_printk_start_stop_comm(int enabled)
 		return;
 
 	if (enabled)
-		tracing_start_cmdline_record();
+		(void)tracing_start_cmdline_record();
 	else
 		tracing_stop_cmdline_record();
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b6d42fe06115..6fe2c8429560 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -751,9 +751,9 @@ void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
 int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
 		      struct ftrace_regs *fregs);
 
-void tracing_start_cmdline_record(void);
+int tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
-void tracing_start_tgid_record(void);
+int tracing_start_tgid_record(void);
 void tracing_stop_tgid_record(void);
 
 int register_tracer(struct tracer *type);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 137b4d9bb116..e6713aa80a03 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -734,9 +734,9 @@ void trace_event_enable_cmd_record(bool enable)
 			continue;
 
 		if (enable) {
-			tracing_start_cmdline_record();
-			set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
-		} else {
+			if (!tracing_start_cmdline_record())
+				set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
+		} else if (file->flags & EVENT_FILE_FL_RECORDED_CMD) {
 			tracing_stop_cmdline_record();
 			clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 		}
@@ -755,9 +755,9 @@ void trace_event_enable_tgid_record(bool enable)
 			continue;
 
 		if (enable) {
-			tracing_start_tgid_record();
-			set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
-		} else {
+			if (!tracing_start_tgid_record())
+				set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+		} else if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
 			tracing_stop_tgid_record();
 			clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
 				  &file->flags);
@@ -847,14 +847,26 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 				set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
 
 			if (tr->trace_flags & TRACE_ITER(RECORD_CMD)) {
+				ret = tracing_start_cmdline_record();
+				if (ret) {
+					pr_info("event trace: Could not enable event %s\n",
+						trace_event_name(call));
+					break;
+				}
 				cmd = true;
-				tracing_start_cmdline_record();
 				set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
 			}
 
 			if (tr->trace_flags & TRACE_ITER(RECORD_TGID)) {
+				ret = tracing_start_tgid_record();
+				if (ret) {
+					if (cmd)
+						tracing_stop_cmdline_record();
+					pr_info("event trace: Could not enable event %s\n",
+						trace_event_name(call));
+					break;
+				}
 				tgid = true;
-				tracing_start_tgid_record();
 				set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
 			}
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c12795c2fb39..14d099734345 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -146,6 +146,8 @@ static bool handle_func_repeats(struct trace_array *tr, u32 flags_val)
 static int function_trace_init(struct trace_array *tr)
 {
 	ftrace_func_t func;
+	int ret;
+
 	/*
 	 * Instance trace_arrays get their ops allocated
 	 * at instance creation. Unless it failed
@@ -165,7 +167,11 @@ static int function_trace_init(struct trace_array *tr)
 
 	tr->array_buffer.cpu = raw_smp_processor_id();
 
-	tracing_start_cmdline_record();
+	ret = tracing_start_cmdline_record();
+	if (ret) {
+		ftrace_reset_array_ops(tr);
+		return ret;
+	}
 	tracing_start_function_trace(tr);
 	return 0;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 1de6f1573621..6b27ed62fee8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -487,7 +487,11 @@ static int graph_trace_init(struct trace_array *tr)
 	ret = register_ftrace_graph(tr->gops);
 	if (ret)
 		return ret;
-	tracing_start_cmdline_record();
+	ret = tracing_start_cmdline_record();
+	if (ret) {
+		unregister_ftrace_graph(tr->gops);
+		return ret;
+	}
 
 	return 0;
 }
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index c46d584ded3b..683ea4ca1498 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -89,12 +89,22 @@ static void tracing_sched_unregister(void)
 	unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
 }
 
-static void tracing_start_sched_switch(int ops)
+static int tracing_start_sched_switch(int ops)
 {
-	bool sched_register;
+	int ret = 0;
 
 	mutex_lock(&sched_register_mutex);
-	sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
+
+	/*
+	 * If the registration fails, do not bump the reference count : the
+	 * caller must observe the failure so it can avoid a later matching
+	 * stop that would otherwise unregister probes that were never added.
+	 */
+	if (!sched_cmdline_ref && !sched_tgid_ref) {
+		ret = tracing_sched_register();
+		if (ret)
+			goto out;
+	}
 
 	switch (ops) {
 	case RECORD_CMDLINE:
@@ -105,10 +115,9 @@ static void tracing_start_sched_switch(int ops)
 		sched_tgid_ref++;
 		break;
 	}
-
-	if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
-		tracing_sched_register();
+out:
 	mutex_unlock(&sched_register_mutex);
+	return ret;
 }
 
 static void tracing_stop_sched_switch(int ops)
@@ -130,9 +139,9 @@ static void tracing_stop_sched_switch(int ops)
 	mutex_unlock(&sched_register_mutex);
 }
 
-void tracing_start_cmdline_record(void)
+int tracing_start_cmdline_record(void)
 {
-	tracing_start_sched_switch(RECORD_CMDLINE);
+	return tracing_start_sched_switch(RECORD_CMDLINE);
 }
 
 void tracing_stop_cmdline_record(void)
@@ -140,9 +149,9 @@ void tracing_stop_cmdline_record(void)
 	tracing_stop_sched_switch(RECORD_CMDLINE);
 }
 
-void tracing_start_tgid_record(void)
+int tracing_start_tgid_record(void)
 {
-	tracing_start_sched_switch(RECORD_TGID);
+	return tracing_start_sched_switch(RECORD_TGID);
 }
 
 void tracing_stop_tgid_record(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index d88c44f1dfa5..238e7451f8e4 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1084,7 +1084,12 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 		warn_failed_init_tracer(trace, ret);
 		goto out;
 	}
-	tracing_start_cmdline_record();
+	ret = tracing_start_cmdline_record();
+	if (ret) {
+		unregister_ftrace_graph(&fgraph_ops);
+		warn_failed_init_tracer(trace, ret);
+		goto out;
+	}
 
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
-- 
2.43.0


^ permalink raw reply related

* [PATCH v4] tracing: Bound synthetic-field strings with seq_buf
From: Pengpeng Hou @ 2026-04-17 12:20 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Tom Zanussi, Mathieu Desnoyers, linux-trace-kernel, linux-kernel,
	pengpeng
In-Reply-To: <20260409103001.1-tracing-hist-synth-v3-pengpeng@iscas.ac.cn>

The synthetic field helpers build a prefixed synthetic variable name and
a generated hist command in fixed MAX_FILTER_STR_VAL buffers. The
current code appends those strings with raw strcat(), so long key lists,
field names, or saved filters can run past the end of the staging
buffers.

Build both strings with seq_buf and propagate -E2BIG if either the
synthetic variable name or the generated command exceeds
MAX_FILTER_STR_VAL. This keeps the existing tracing-side limit while
using the helper intended for bounded command construction.

Fixes: 02205a6752f2 ("tracing: Add support for 'field variables'")
Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
---
Changes since v3:
- add the requested comment before seq_buf_str()
- keep the saved_filter lookup next to its use
- drop the unrelated event_var simplification from the previous respin

 kernel/trace/trace_events_hist.c | 44 ++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 73ea180cad55..99da91461abc 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/security.h>
+#include <linux/seq_buf.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/stacktrace.h>
@@ -2962,14 +2963,22 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
 			 char *system, char *event_name, char *field_name)
 {
 	struct hist_field *event_var;
+	struct seq_buf s;
 	char *synthetic_name;
 
 	synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
 	if (!synthetic_name)
 		return ERR_PTR(-ENOMEM);
 
-	strcpy(synthetic_name, "synthetic_");
-	strcat(synthetic_name, field_name);
+	seq_buf_init(&s, synthetic_name, MAX_FILTER_STR_VAL);
+	seq_buf_puts(&s, "synthetic_");
+	seq_buf_puts(&s, field_name);
+	/* Terminate synthetic_name with a NUL. */
+	seq_buf_str(&s);
+	if (seq_buf_has_overflowed(&s)) {
+		kfree(synthetic_name);
+		return ERR_PTR(-E2BIG);
+	}
 
 	event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
 
@@ -3014,7 +3023,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
 	struct trace_event_file *file;
 	struct hist_field *key_field;
 	struct hist_field *event_var;
-	char *saved_filter;
+	struct seq_buf s;
 	char *cmd;
 	int ret;
 
@@ -3059,28 +3068,35 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	seq_buf_init(&s, cmd, MAX_FILTER_STR_VAL);
+
 	/* Use the same keys as the compatible histogram */
-	strcat(cmd, "keys=");
+	seq_buf_puts(&s, "keys=");
 
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
 		if (!first)
-			strcat(cmd, ",");
-		strcat(cmd, key_field->field->name);
+			seq_buf_putc(&s, ',');
+		seq_buf_puts(&s, key_field->field->name);
 		first = false;
 	}
 
 	/* Create the synthetic field variable specification */
-	strcat(cmd, ":synthetic_");
-	strcat(cmd, field_name);
-	strcat(cmd, "=");
-	strcat(cmd, field_name);
+	seq_buf_printf(&s, ":synthetic_%s=%s", field_name, field_name);
 
 	/* Use the same filter as the compatible histogram */
-	saved_filter = find_trigger_filter(hist_data, file);
-	if (saved_filter) {
-		strcat(cmd, " if ");
-		strcat(cmd, saved_filter);
+	{
+		char *saved_filter = find_trigger_filter(hist_data, file);
+
+		if (saved_filter)
+			seq_buf_printf(&s, " if %s", saved_filter);
+	}
+
+	seq_buf_str(&s);
+	if (seq_buf_has_overflowed(&s)) {
+		kfree(cmd);
+		kfree(var_hist);
+		return ERR_PTR(-E2BIG);
 	}
 
 	var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH v3 2/2] tracing: Bound histogram expression strings with seq_buf
From: Pengpeng Hou @ 2026-04-17 12:28 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, linux-trace-kernel, linux-kernel, pengpeng
In-Reply-To: <20260417223002.1-tracing-expr-v3-pengpeng@iscas.ac.cn>

expr_str() allocates a fixed MAX_FILTER_STR_VAL buffer and then builds
expression names with a series of raw strcat() appends. Nested operands,
constants and field flags can push the rendered string past that fixed
limit before the name is attached to the hist field.

Build the expression strings with seq_buf and return -E2BIG when the
rendered name would exceed MAX_FILTER_STR_VAL.

Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
---
Changes since v2:
- split the ERR_PTR() conversion into patch 1/2 as requested by Steven
  Rostedt
- keep this patch focused on the seq_buf conversion and overflow
  detection

 kernel/trace/trace_events_hist.c | 67 +++++++++++++++-------
 1 file changed, 45 insertions(+), 22 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 954e0beb7f0a..3a74880ac4d1 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/security.h>
+#include <linux/seq_buf.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/stacktrace.h>
@@ -1738,32 +1739,35 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 	return flags_str;
 }
 
-static void expr_field_str(struct hist_field *field, char *expr)
+static bool expr_field_str(struct hist_field *field, struct seq_buf *s)
 {
+	const char *field_name;
+
 	if (field->flags & HIST_FIELD_FL_VAR_REF)
-		strcat(expr, "$");
-	else if (field->flags & HIST_FIELD_FL_CONST) {
-		char str[HIST_CONST_DIGITS_MAX];
+		seq_buf_putc(s, '$');
+	else if (field->flags & HIST_FIELD_FL_CONST)
+		seq_buf_printf(s, "%llu", field->constant);
 
-		snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant);
-		strcat(expr, str);
-	}
+	field_name = hist_field_name(field, 0);
+	if (!field_name)
+		return false;
 
-	strcat(expr, hist_field_name(field, 0));
+	seq_buf_puts(s, field_name);
 
 	if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
 		const char *flags_str = get_hist_field_flags(field);
 
-		if (flags_str) {
-			strcat(expr, ".");
-			strcat(expr, flags_str);
-		}
+		if (flags_str)
+			seq_buf_printf(s, ".%s", flags_str);
 	}
+
+	return !seq_buf_has_overflowed(s);
 }
 
 static char *expr_str(struct hist_field *field, unsigned int level)
 {
 	char *expr;
+	struct seq_buf s;
 	int ret = -EINVAL;
 
 	if (level > 1)
@@ -1773,49 +1777,68 @@ static char *expr_str(struct hist_field *field, unsigned int level)
 	if (!expr)
 		return ERR_PTR(-ENOMEM);
 
+	seq_buf_init(&s, expr, MAX_FILTER_STR_VAL);
+
 	if (!field->operands[0]) {
-		expr_field_str(field, expr);
+		if (!expr_field_str(field, &s)) {
+			ret = -E2BIG;
+			goto free;
+		}
+		seq_buf_str(&s);
 		return expr;
 	}
 
 	if (field->operator == FIELD_OP_UNARY_MINUS) {
 		char *subexpr;
 
-		strcat(expr, "-(");
+		seq_buf_puts(&s, "-(");
 		subexpr = expr_str(field->operands[0], ++level);
 		if (IS_ERR(subexpr)) {
 			ret = PTR_ERR(subexpr);
 			goto free;
 		}
-		strcat(expr, subexpr);
-		strcat(expr, ")");
+		seq_buf_puts(&s, subexpr);
+		seq_buf_putc(&s, ')');
 
 		kfree(subexpr);
+		if (seq_buf_has_overflowed(&s)) {
+			ret = -E2BIG;
+			goto free;
+		}
 
+		seq_buf_str(&s);
 		return expr;
 	}
 
-	expr_field_str(field->operands[0], expr);
+	if (!expr_field_str(field->operands[0], &s)) {
+		ret = -E2BIG;
+		goto free;
+	}
 
 	switch (field->operator) {
 	case FIELD_OP_MINUS:
-		strcat(expr, "-");
+		seq_buf_putc(&s, '-');
 		break;
 	case FIELD_OP_PLUS:
-		strcat(expr, "+");
+		seq_buf_putc(&s, '+');
 		break;
 	case FIELD_OP_DIV:
-		strcat(expr, "/");
+		seq_buf_putc(&s, '/');
 		break;
 	case FIELD_OP_MULT:
-		strcat(expr, "*");
+		seq_buf_putc(&s, '*');
 		break;
 	default:
 		goto free;
 	}
 
-	expr_field_str(field->operands[1], expr);
+	if (seq_buf_has_overflowed(&s) ||
+	    !expr_field_str(field->operands[1], &s)) {
+		ret = -E2BIG;
+		goto free;
+	}
 
+	seq_buf_str(&s);
 	return expr;
 
 free:
	kfree(expr);
	return ERR_PTR(ret);
}
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* [PATCH v3 1/2] tracing: Return ERR_PTR() from expr_str()
From: Pengpeng Hou @ 2026-04-17 12:24 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, linux-trace-kernel, linux-kernel, pengpeng
In-Reply-To: <20260409123001.1-tracing-hist-expr-v2-pengpeng@iscas.ac.cn>

expr_str() already has failure cases for invalid recursion depth and
allocation failure, but it currently reports them as a bare NULL. Teach
it to return ERR_PTR()-encoded errors and update parse_unary() and
parse_expr() to propagate those errors.

This keeps the error conversion separate from the string-building change
so the follow-up seq_buf patch can stay focused on the overflow fix
itself.

Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
---
Changes since v2:
- split the ERR_PTR() conversion out as its own patch as requested by
  Steven Rostedt

 kernel/trace/trace_events_hist.c | 33 +++++++++++++++++-----
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 73ea180cad55..954e0beb7f0a 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1764,13 +1764,14 @@ static void expr_field_str(struct hist_field *field, char *expr)
 static char *expr_str(struct hist_field *field, unsigned int level)
 {
 	char *expr;
+	int ret = -EINVAL;
 
 	if (level > 1)
-		return NULL;
+		return ERR_PTR(-EINVAL);
 
 	expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
 	if (!expr)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	if (!field->operands[0]) {
 		expr_field_str(field, expr);
@@ -1782,9 +1783,9 @@ static char *expr_str(struct hist_field *field, unsigned int level)
 
 		strcat(expr, "-(");
 		subexpr = expr_str(field->operands[0], ++level);
-		if (!subexpr) {
-			kfree(expr);
-			return NULL;
+		if (IS_ERR(subexpr)) {
+			ret = PTR_ERR(subexpr);
+			goto free;
 		}
 		strcat(expr, subexpr);
 		strcat(expr, ")");
@@ -1810,13 +1811,16 @@ static char *expr_str(struct hist_field *field, unsigned int level)
 		strcat(expr, "*");
 		break;
 	default:
-		kfree(expr);
-		return NULL;
+		goto free;
 	}
 
 	expr_field_str(field->operands[1], expr);
 
 	return expr;
+
+free:
+	kfree(expr);
+	return ERR_PTR(ret);
 }
 
 /*
@@ -2630,6 +2634,11 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
 	expr->is_signed = operand1->is_signed;
 	expr->operator = FIELD_OP_UNARY_MINUS;
 	expr->name = expr_str(expr, 0);
+	if (IS_ERR(expr->name)) {
+		ret = PTR_ERR(expr->name);
+		expr->name = NULL;
+		goto free;
+	}
 	expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
 	if (!expr->type) {
 		ret = -ENOMEM;
@@ -2842,6 +2851,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 		destroy_hist_field(operand1, 0);
 
 		expr->name = expr_str(expr, 0);
+		if (IS_ERR(expr->name)) {
+			ret = PTR_ERR(expr->name);
+			expr->name = NULL;
+			goto free_expr;
+		}
 	} else {
 		/* The operand sizes should be the same, so just pick one */
 		expr->size = operand1->size;
@@ -2855,6 +2869,11 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 		}
 
 		expr->name = expr_str(expr, 0);
+		if (IS_ERR(expr->name)) {
+			ret = PTR_ERR(expr->name);
+			expr->name = NULL;
+			goto free_expr;
+		}
 	}
 
 	return expr;
-- 
2.50.1 (Apple Git-155)


^ permalink raw reply related

* Re: [PATCH v3] tracing/hist: bound synthetic-field strings with seq_buf
From: Pengpeng Hou @ 2026-04-17  3:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Tom Zanussi, Mathieu Desnoyers,
	linux-trace-kernel, linux-kernel, pengpeng
In-Reply-To: <20260409103001.1-tracing-hist-synth-v3-pengpeng@iscas.ac.cn>

Hi Steve,

Thanks, I'll respin this.

I'll resend this with the tracing-style subject, add the clarifying
comment for `seq_buf_str()`, keep the `event_var` simplification out of
this patch, and move `saved_filter` back next to its point of use in
`create_field_var_hist()`.

Thanks,
Pengpeng

^ permalink raw reply

* Re: [PATCH v2] tracing/hist: bound expression strings with seq_buf
From: Pengpeng Hou @ 2026-04-17  3:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-trace-kernel,
	linux-kernel, pengpeng
In-Reply-To: <20260409123001.1-tracing-hist-expr-v2-pengpeng@iscas.ac.cn>

Hi Steve,

Thanks, I'll respin this.

I'll split it into two patches, one for the `NULL` to `ERR_PTR()`
conversion and one for the `seq_buf` conversion, and I'll use the
tracing-style subjects on the resend.

Thanks,
Pengpeng

^ permalink raw reply

* Re: [PATCH v2] bootconfig: Apply early options from embedded config
From: Masami Hiramatsu @ 2026-04-17  1:46 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Jonathan Corbet, Shuah Khan, linux-kernel, linux-trace-kernel,
	linux-doc, oss, paulmck, rostedt, kernel-team, Kiryl Shutsemau
In-Reply-To: <ad9wwUJ3lh_536Xy@gmail.com>

On Wed, 15 Apr 2026 04:15:57 -0700
Breno Leitao <leitao@debian.org> wrote:

> On Tue, Apr 07, 2026 at 03:19:09AM -0700, Breno Leitao wrote:
> > On Fri, Apr 03, 2026 at 11:45:19AM +0900, Masami Hiramatsu wrote:
> > > > I'm still uncertain about this approach. The goal is to identify and
> > > > categorize the early parameters that are parsed prior to bootconfig
> > > > initialization.
> > >
> > > Yes, if we support early parameters in bootconfig, we need to clarify
> > > which parameters are inherently unsupportable, and document it.
> > > Currently it is easy to say that it does not support the parameter
> > > defined with "early_param()". Similary, maybe we should introduce
> > > "arch_param()" or something like it (or support all of them).
> > >
> > > >
> > > > Moreover, this work could become obsolete if bootconfig's initialization
> > > > point shifts earlier or later in the boot sequence, necessitating
> > > > another comprehensive analysis.
> > >
> > > If we can init it before calling setup_arch(), yes, we don't need to
> > > check it. So that is another option. Do you think it is feasible to
> > > support all of them? (Of course, theologically we can do, but the
> > > question is the use case and requirements.)
> >
> > I don't believe all early parameters can be supported by bootconfig.
> > Some are inherently incompatible as far as I understand, while others
> > depend on bootconfig's initialization point in the boot sequence.
> 
> I've developed a patch series that relocates bootconfig initialization
> to occur before setup_arch().
> 
> Adopting this approach would streamline the categorization considerably,
> as only a small subset of kernel parameters are parsed before
> setup_arch() is called.
> 
> This enables a clearer distinction: parameters processed *before*
> setup_arch() versus those handled afterward, rather than classifying
> based on what occurs before bootconfig initialization.
> 
> Just to close the look and link both discussion together, the proposed
> patch series is available at:
> 
> https://lore.kernel.org/all/20260415-bootconfig_earlier-v1-0-cf160175de5e@debian.org/


Thanks for working on this series!! Let me review the series.

BTW, I found that the current __setup(), early_param(), module_param()
are a bit complicated, for example, __setup() and early_param() are
stored in the different array of module_param(), and those can use
the same parameter (e.g. console).

And as you found some of early_param() options are only applied via
command line. Maybe we can introduce another special macro which is
only for command line.

Thanks,

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH 2/3] init: use static buffers for bootconfig extra command line
From: Masami Hiramatsu @ 2026-04-17  1:44 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Andrew Morton, oss, paulmck, linux-trace-kernel, linux-kernel,
	kernel-team
In-Reply-To: <20260415-bootconfig_earlier-v1-2-cf160175de5e@debian.org>

On Wed, 15 Apr 2026 03:51:11 -0700
Breno Leitao <leitao@debian.org> wrote:

> Replace memblock_alloc/memblock_free in xbc_make_cmdline() with
> static __initdata buffers. This removes the last memblock dependency
> from the bootconfig loading path, completing the prerequisite for
> running bootconfig parsing before memblock is available.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  init/main.c | 24 ++++++++++++------------
>  1 file changed, 12 insertions(+), 12 deletions(-)
> 
> diff --git a/init/main.c b/init/main.c
> index 96f93bb06c490..b9feca55e01f9 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -369,11 +369,18 @@ static int __init xbc_snprint_cmdline(char *buf, size_t size,
>  }
>  #undef rest
>  
> +/*
> + * Static buffers for bootconfig-generated command line parameters.
> + * Two separate buffers are needed because both "kernel" and "init"
> + * parameters are stored simultaneously.
> + */
> +static char extra_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
> +static char extra_initargs_buf[COMMAND_LINE_SIZE] __initdata;

This is not good to me, since bootconfig supports bigger config file
than COMMAND_LINE_SIZE. Even if we limits the size for embedded
bootconfig file, it should depends on CONFIG_BOOT_CONFIG_EMBED.

Please use XBC_DATA_MAX instead of COMMAND_LINE_SIZE, or calculate
expected data length when compiling kernel.

But if we can do it, should we continue using bootconfig? I mean
it is easy to make a tool (or add a feature in tools/bootconfig)
which converts bootconfig file to command line string and embeds
it in the kernel. Hmm.

Thanks,

> +
>  /* Make an extra command line under given key word */
> -static char * __init xbc_make_cmdline(const char *key)
> +static char * __init xbc_make_cmdline(const char *key, char *new_cmdline)
>  {
>  	struct xbc_node *root;
> -	char *new_cmdline;
>  	int ret, len = 0;
>  
>  	root = xbc_find_node(key);
> @@ -382,19 +389,12 @@ static char * __init xbc_make_cmdline(const char *key)
>  
>  	/* Count required buffer size */
>  	len = xbc_snprint_cmdline(NULL, 0, root);
> -	if (len <= 0)
> +	if (len <= 0 || len >= COMMAND_LINE_SIZE)
>  		return NULL;
>  
> -	new_cmdline = memblock_alloc(len + 1, SMP_CACHE_BYTES);
> -	if (!new_cmdline) {
> -		pr_err("Failed to allocate memory for extra kernel cmdline.\n");
> -		return NULL;
> -	}
> -
>  	ret = xbc_snprint_cmdline(new_cmdline, len + 1, root);
>  	if (ret < 0 || ret > len) {
>  		pr_err("Failed to print extra kernel cmdline.\n");
> -		memblock_free(new_cmdline, len + 1);
>  		return NULL;
>  	}
>  
> @@ -467,9 +467,9 @@ static void __init setup_boot_config(void)
>  		xbc_get_info(&ret, NULL);
>  		pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret);
>  		/* keys starting with "kernel." are passed via cmdline */
> -		extra_command_line = xbc_make_cmdline("kernel");
> +		extra_command_line = xbc_make_cmdline("kernel", extra_cmdline_buf);
>  		/* Also, "init." keys are init arguments */
> -		extra_init_args = xbc_make_cmdline("init");
> +		extra_init_args = xbc_make_cmdline("init", extra_initargs_buf);
>  	}
>  	return;
>  }
> 
> -- 
> 2.52.0
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox