Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [RFC PATCH 2/2] ftrace: disable optimistic spinning for ftrace_lock
From: Yafang Shao @ 2026-03-04  7:46 UTC (permalink / raw)
  To: peterz, mingo, will, boqun, longman, rostedt, mhiramat,
	mark.rutland, mathieu.desnoyers
  Cc: linux-kernel, linux-trace-kernel, bpf, Yafang Shao
In-Reply-To: <20260304074650.58165-1-laoar.shao@gmail.com>

mutex_lock_nospin() is used for ftrace_lock to selectively disable
optimistic spinning.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 kernel/trace/ftrace.c | 52 +++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 827fb9a0bf0d..b8cca4f76118 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1284,7 +1284,7 @@ static void clear_ftrace_mod_list(struct list_head *head)
 	if (!head)
 		return;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	list_for_each_entry_safe(p, n, head, list)
 		free_ftrace_mod(p);
 	mutex_unlock(&ftrace_lock);
@@ -4254,7 +4254,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	void *p = NULL;
 	loff_t l;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	if (unlikely(ftrace_disabled))
 		return NULL;
@@ -4362,7 +4362,7 @@ static __init void ftrace_check_work_func(struct work_struct *work)
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		test_for_valid_rec(rec);
 	} while_for_each_ftrace_rec();
@@ -5123,7 +5123,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
 	if (!new_hash)
 		goto out; /* warn? */
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	list_for_each_entry_safe(ftrace_mod, n, head, list) {
 
@@ -5159,7 +5159,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
 	if (enable && list_empty(head))
 		new_hash->flags &= ~FTRACE_HASH_FL_MOD;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	ftrace_hash_move_and_update_ops(ops, orig_hash,
 					      new_hash, enable);
@@ -5465,7 +5465,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
 		return -EINVAL;
 
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	/* Check if the probe_ops is already registered */
 	list_for_each_entry(iter, &tr->func_probes, list) {
 		if (iter->probe_ops == probe_ops) {
@@ -5540,7 +5540,7 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
 		}
 	}
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	if (!count) {
 		/* Nothing was added? */
@@ -5619,7 +5619,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
 			return -EINVAL;
 	}
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	/* Check if the probe_ops is already registered */
 	list_for_each_entry(iter, &tr->func_probes, list) {
 		if (iter->probe_ops == probe_ops) {
@@ -5679,7 +5679,7 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr,
 		goto out_unlock;
 	}
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	WARN_ON(probe->ref < count);
 
@@ -5943,7 +5943,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
 			goto out_regex_unlock;
 	}
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
 	mutex_unlock(&ftrace_lock);
 
@@ -6205,7 +6205,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
 	 * Now the ftrace_ops_list_func() is called to do the direct callers.
 	 * We can safely change the direct functions attached to each entry.
 	 */
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	size = 1 << hash->size_bits;
 	for (i = 0; i < size; i++) {
@@ -6625,7 +6625,7 @@ int update_ftrace_direct_mod(struct ftrace_ops *ops, struct ftrace_hash *hash, b
 	 * Now the ftrace_ops_list_func() is called to do the direct callers.
 	 * We can safely change the direct functions attached to each entry.
 	 */
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	size = 1 << hash->size_bits;
 	for (i = 0; i < size; i++) {
@@ -6980,7 +6980,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 		} else
 			orig_hash = &iter->ops->func_hash->notrace_hash;
 
-		mutex_lock(&ftrace_lock);
+		mutex_lock_nospin(&ftrace_lock);
 		ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
 						      iter->hash, filter_hash);
 		mutex_unlock(&ftrace_lock);
@@ -7464,7 +7464,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops,
  */
 void ftrace_destroy_filter_files(struct ftrace_ops *ops)
 {
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	if (ops->flags & FTRACE_OPS_FL_ENABLED)
 		ftrace_shutdown(ops, 0);
 	ops->flags |= FTRACE_OPS_FL_DELETED;
@@ -7571,7 +7571,7 @@ static int ftrace_process_locs(struct module *mod,
 	if (!start_pg)
 		return -ENOMEM;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	/*
 	 * Core and each module needs their own pages, as
@@ -7868,7 +7868,7 @@ void ftrace_release_mod(struct module *mod)
 	struct ftrace_page *tmp_page = NULL;
 	struct ftrace_page *pg;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	/*
 	 * To avoid the UAF problem after the module is unloaded, the
@@ -7938,7 +7938,7 @@ void ftrace_module_enable(struct module *mod)
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	if (ftrace_disabled)
 		goto out_unlock;
@@ -8267,7 +8267,7 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 	key.ip = start;
 	key.flags = end;	/* overload flags, as it is unsigned long */
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	/*
 	 * If we are freeing module init memory, then check if
@@ -8686,7 +8686,7 @@ static void clear_ftrace_pids(struct trace_array *tr, int type)
 
 void ftrace_clear_pids(struct trace_array *tr)
 {
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	clear_ftrace_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
 
@@ -8695,7 +8695,7 @@ void ftrace_clear_pids(struct trace_array *tr)
 
 static void ftrace_pid_reset(struct trace_array *tr, int type)
 {
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	clear_ftrace_pids(tr, type);
 
 	ftrace_update_pid_func();
@@ -8713,7 +8713,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos)
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = m->private;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	rcu_read_lock_sched();
 
 	pid_list = rcu_dereference_sched(tr->function_pids);
@@ -8766,7 +8766,7 @@ static void *fnpid_start(struct seq_file *m, loff_t *pos)
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = m->private;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	rcu_read_lock_sched();
 
 	pid_list = rcu_dereference_sched(tr->function_no_pids);
@@ -9057,7 +9057,7 @@ static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
 			unsigned long ip = entry->ip;
 			bool found_op = false;
 
-			mutex_lock(&ftrace_lock);
+			mutex_lock_nospin(&ftrace_lock);
 			do_for_each_ftrace_op(op, ftrace_ops_list) {
 				if (!(op->flags & FTRACE_OPS_FL_DIRECT))
 					continue;
@@ -9106,7 +9106,7 @@ static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
 			unsigned long ip = entry->ip;
 			bool found_op = false;
 
-			mutex_lock(&ftrace_lock);
+			mutex_lock_nospin(&ftrace_lock);
 			do_for_each_ftrace_op(op, ftrace_ops_list) {
 				if (!(op->flags & FTRACE_OPS_FL_DIRECT))
 					continue;
@@ -9153,7 +9153,7 @@ static int register_ftrace_function_nolock(struct ftrace_ops *ops)
 
 	ftrace_ops_init(ops);
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 
 	ret = ftrace_startup(ops, 0);
 
@@ -9200,7 +9200,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
-	mutex_lock(&ftrace_lock);
+	mutex_lock_nospin(&ftrace_lock);
 	ret = ftrace_shutdown(ops, 0);
 	mutex_unlock(&ftrace_lock);
 
-- 
2.47.3


^ permalink raw reply related

* [RFC PATCH 1/2] locking: add mutex_lock_nospin()
From: Yafang Shao @ 2026-03-04  7:46 UTC (permalink / raw)
  To: peterz, mingo, will, boqun, longman, rostedt, mhiramat,
	mark.rutland, mathieu.desnoyers
  Cc: linux-kernel, linux-trace-kernel, bpf, Yafang Shao
In-Reply-To: <20260304074650.58165-1-laoar.shao@gmail.com>

Introduce mutex_lock_nospin(), a helper that disables optimistic spinning
on the owner for specific heavy locks. This prevents long spinning times
that can lead to latency spikes for other tasks on the same runqueue.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/linux/mutex.h  |  3 +++
 kernel/locking/mutex.c | 39 ++++++++++++++++++++++++++++++++-------
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index ecaa0440f6ec..1e488bb24b57 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -189,11 +189,13 @@ extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
 extern int __must_check _mutex_lock_killable(struct mutex *lock,
 		unsigned int subclass, struct lockdep_map *nest_lock) __cond_acquires(0, lock);
 extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) __acquires(lock);
+extern void mutex_lock_nospin_nested(struct mutex *lock, unsigned int subclass);
 
 #define mutex_lock(lock) mutex_lock_nested(lock, 0)
 #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0)
 #define mutex_lock_killable(lock) _mutex_lock_killable(lock, 0, NULL)
 #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0)
+#define mutex_lock_nospin(lock) mutex_lock_nospin_nested(lock, 0)
 
 #define mutex_lock_nest_lock(lock, nest_lock)				\
 do {									\
@@ -215,6 +217,7 @@ extern void mutex_lock(struct mutex *lock) __acquires(lock);
 extern int __must_check mutex_lock_interruptible(struct mutex *lock) __cond_acquires(0, lock);
 extern int __must_check mutex_lock_killable(struct mutex *lock) __cond_acquires(0, lock);
 extern void mutex_lock_io(struct mutex *lock) __acquires(lock);
+extern void mutex_lock_nospin(struct mutex *lock) __acquires(lock);
 
 # define mutex_lock_nested(lock, subclass) mutex_lock(lock)
 # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2a1d165b3167..03d3b0749882 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -290,6 +290,14 @@ void __sched mutex_lock(struct mutex *lock)
 		__mutex_lock_slowpath(lock);
 }
 EXPORT_SYMBOL(mutex_lock);
+
+void __sched mutex_lock_nospin(struct mutex *lock)
+{
+	might_sleep();
+
+	if (!__mutex_trylock_fast(lock))
+		__mutex_lock_nospin(lock);
+}
 #endif
 
 #include "ww_mutex.h"
@@ -443,8 +451,11 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
  */
 static __always_inline bool
 mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
-		      struct mutex_waiter *waiter)
+		      struct mutex_waiter *waiter, const bool nospin)
 {
+	if (nospin)
+		return false;
+
 	if (!waiter) {
 		/*
 		 * The purpose of the mutex_can_spin_on_owner() function is
@@ -577,7 +588,8 @@ EXPORT_SYMBOL(ww_mutex_unlock);
 static __always_inline int __sched
 __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass,
 		    struct lockdep_map *nest_lock, unsigned long ip,
-		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
+		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx,
+		    const bool nospin)
 {
 	DEFINE_WAKE_Q(wake_q);
 	struct mutex_waiter waiter;
@@ -615,7 +627,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 
 	trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
 	if (__mutex_trylock(lock) ||
-	    mutex_optimistic_spin(lock, ww_ctx, NULL)) {
+	    mutex_optimistic_spin(lock, ww_ctx, NULL, nospin)) {
 		/* got the lock, yay! */
 		lock_acquired(&lock->dep_map, ip);
 		if (ww_ctx)
@@ -716,7 +728,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 			 * to run.
 			 */
 			clear_task_blocked_on(current, lock);
-			if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+			if (mutex_optimistic_spin(lock, ww_ctx, &waiter, nospin))
 				break;
 			set_task_blocked_on(current, lock);
 			trace_contention_begin(lock, LCB_F_MUTEX);
@@ -773,14 +785,21 @@ static int __sched
 __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 	     struct lockdep_map *nest_lock, unsigned long ip)
 {
-	return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false);
+	return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false, false);
+}
+
+static int __sched
+__mutex_lock_nospin(struct mutex *lock, unsigned int state, unsigned int subclass,
+		    struct lockdep_map *nest_lock, unsigned long ip)
+{
+	return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false, true);
 }
 
 static int __sched
 __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
 		unsigned long ip, struct ww_acquire_ctx *ww_ctx)
 {
-	return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
+	return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true, false);
 }
 
 /**
@@ -861,11 +880,17 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass)
 
 	token = io_schedule_prepare();
 	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
-			    subclass, NULL, _RET_IP_, NULL, 0);
+			    subclass, NULL, _RET_IP_, NULL, 0, false);
 	io_schedule_finish(token);
 }
 EXPORT_SYMBOL_GPL(mutex_lock_io_nested);
 
+void __sched
+mutex_lock_nospin_nested(struct mutex *lock, unsigned int subclass)
+{
+	__mutex_lock_nospin(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+}
+
 static inline int
 ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
-- 
2.47.3


^ permalink raw reply related

* [RFC PATCH 0/2] disable optimistic spinning for ftrace_lock
From: Yafang Shao @ 2026-03-04  7:46 UTC (permalink / raw)
  To: peterz, mingo, will, boqun, longman, rostedt, mhiramat,
	mark.rutland, mathieu.desnoyers
  Cc: linux-kernel, linux-trace-kernel, bpf, Yafang Shao

Background
==========

One of our latency-sensitive services reported random CPU %sys spikes.
After a thorough investigation, we finally identified the root cause of the
CPU %sys spikes. The key kernel stacks are as follows:

- Task A

2026-02-14-16:53:40.938243: [CPU198] 2156302(bpftrace) cgrp:4019437 pod:4019253

        find_kallsyms_symbol+142
        module_address_lookup+104
        kallsyms_lookup_buildid+203
        kallsyms_lookup+20
        print_rec+64
        t_show+67
        seq_read_iter+709
        seq_read+165
        vfs_read+165
        ksys_read+103
        __x64_sys_read+25
        do_syscall_64+56
        entry_SYSCALL_64_after_hwframe+100

This task (2156302, bpftrace) is reading the
/sys/kernel/debug/tracing/available_filter_functions to check if a function
is traceable:

  https://github.com/bpftrace/bpftrace/blob/master/src/tracefs/tracefs.h#L21

Reading the available_filter_functions file is time-consuming, as it
contains tens of thousands of functions:

  $ cat /sys/kernel/debug/tracing/available_filter_functions | wc -l
  61081

  $ time cat /sys/kernel/debug/tracing/available_filter_functions > /dev/null

  real    0m0.452s
  user    0m0.000s
  sys     0m0.452s

Consequently, the ftrace_lock is held by this task for an extended period.

- Other Tasks

2026-02-14-16:53:41.437094: [CPU79] 2156308(bpftrace) cgrp:4019437 pod:4019253

        mutex_spin_on_owner+108
        __mutex_lock.constprop.0+1132
        __mutex_lock_slowpath+19
        mutex_lock+56
        t_start+51
        seq_read_iter+250
        seq_read+165
        vfs_read+165
        ksys_read+103
        __x64_sys_read+25
        do_syscall_64+56
        entry_SYSCALL_64_after_hwframe+100

Since ftrace_lock is held by Task-A and Task-A is actively running on a
CPU, all other tasks waiting for the same lock will spin on their
respective CPUs. This leads to increased CPU pressure.

Reproduction
============

This issue can be reproduced simply by running
`cat available_filter_functions`.

- Single process reading available_filter_functions:

  $ time cat /sys/kernel/tracing/available_filter_functions > /dev/null

  real    0m0.452s
  user    0m0.001s
  sys     0m0.451s

- Six processes reading available_filter_functions simultaneously:

  for i in `seq 0 5`; do
    time cat /sys/kernel/tracing/available_filter_functions > /dev/null &
  done

  The results are as follows:

  real    0m1.801s
  user    0m0.000s
  sys     0m1.779s

  real    0m1.804s
  user    0m0.001s
  sys     0m1.791s

  real    0m1.805s
  user    0m0.000s
  sys     0m1.792s

  real    0m1.806s
  user    0m0.001s
  sys     0m1.796s

  As more processes are added, the system time increases correspondingly.

Solution
========

One approach is to optimize the reading of available_filter_functions to
make it as fast as possible. However, that alone does not remove the risk,
which lies in the contention caused by optimistic spinning on the mutex.

Therefore, we need to consider an alternative solution that avoids
optimistic spinning for heavy mutexes that may be held for long durations.
Note that we do not want to disable CONFIG_MUTEX_SPIN_ON_OWNER entirely, as
that could lead to unexpected performance regressions.

In this patch, a new wrapper mutex_lock_nospin() is used for ftrace_lock to
selectively disable optimistic spinning.

Yafang Shao (2):
  locking: add mutex_lock_nospin()
  ftrace: disable optimistic spinning for ftrace_lock

 include/linux/mutex.h  |  3 +++
 kernel/locking/mutex.c | 39 +++++++++++++++++++++++++------
 kernel/trace/ftrace.c  | 52 +++++++++++++++++++++---------------------
 3 files changed, 61 insertions(+), 33 deletions(-)

-- 
2.47.3

^ permalink raw reply

* Re: [syzbot] [bpf?] [trace?] KASAN: slab-use-after-free Read in bpf_trace_run3 (2)
From: Qing Wang @ 2026-03-04  7:26 UTC (permalink / raw)
  To: syzbot+9ea7c90be2b24e189592
  Cc: andrii, ast, bpf, daniel, eddyz87, haoluo, john.fastabend, jolsa,
	kpsingh, linux-kernel, linux-trace-kernel, martin.lau,
	mathieu.desnoyers, mattbobrowski, mhiramat, rostedt, sdf, song,
	syzkaller-bugs, wangqing7171, yonghong.song
In-Reply-To: <69a3e5ad.050a0220.3a55be.0056.GAE@google.com>

#syz test

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0378e83b4099..b1e3e9f59930 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3783,6 +3783,8 @@ static void bpf_raw_tp_link_release(struct bpf_link *link)
 
 	bpf_probe_unregister(raw_tp->btp, raw_tp);
 	bpf_put_raw_tracepoint(raw_tp->btp);
+
+	tracepoint_synchronize_unregister();
 }
 
 static void bpf_raw_tp_link_dealloc(struct bpf_link *link)

^ permalink raw reply related

* Re: [PATCH v2 000/110] vfs: change inode->i_ino from unsigned long to u64
From: NeilBrown @ 2026-03-04  6:26 UTC (permalink / raw)
  To: Jeff Layton
  Cc: David Howells, Alexander Viro, Christian Brauner, Jan Kara,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Dan Williams,
	Matthew Wilcox, Eric Biggers, Theodore Y. Ts'o, Muchun Song,
	Oscar Salvador, David Hildenbrand, Paulo Alcantara,
	Andreas Dilger, Jan Kara, Jaegeuk Kim, Chao Yu, Trond Myklebust,
	Anna Schumaker, Chuck Lever, Olga Kornievskaia, Dai Ngo,
	Tom Talpey, Steve French, Ronnie Sahlberg, Shyam Prasad N,
	Bharath SM, Alexander Aring, Ryusuke Konishi, Viacheslav Dubeyko,
	Eric Van Hensbergen, Latchesar Ionkov, Dominique Martinet,
	Christian Schoenebeck, David Sterba, Marc Dionne, Ian Kent,
	Luis de Bethencourt, Salah Triki, Tigran A. Aivazian,
	Ilya Dryomov, Alex Markuze, Jan Harkes, coda, Nicolas Pitre,
	Tyler Hicks, Amir Goldstein, Christoph Hellwig,
	John Paul Adrian Glaubitz, Yangtao Li, Mikulas Patocka,
	David Woodhouse, Richard Weinberger, Dave Kleikamp,
	Konstantin Komarov, Mark Fasheh, Joel Becker, Joseph Qi,
	Mike Marshall, Martin Brandenburg, Miklos Szeredi, Anders Larsen,
	Zhihao Cheng, Damien Le Moal, Naohiro Aota, Johannes Thumshirn,
	John Johansen, Paul Moore, James Morris, Serge E. Hallyn,
	Mimi Zohar, Roberto Sassu, Dmitry Kasatkin, Eric Snowberg, Fan Wu,
	Stephen Smalley, Ondrej Mosnacek, Casey Schaufler, Alex Deucher,
	Christian König, David Airlie, Simona Vetter, Sumit Semwal,
	Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	David S. Miller, Jakub Kicinski, Simon Horman, Oleg Nesterov,
	Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, James Clark, Darrick J. Wong,
	Martin Schiller, Eric Paris, Joerg Reuter, Marcel Holtmann,
	Johan Hedberg, Luiz Augusto von Dentz, Oliver Hartkopp,
	Marc Kleine-Budde, David Ahern, Neal Cardwell, Steffen Klassert,
	Herbert Xu, Remi Denis-Courmont, Marcelo Ricardo Leitner,
	Xin Long, Magnus Karlsson, Maciej Fijalkowski, Stanislav Fomichev,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend, linux-fsdevel, linux-kernel, linux-trace-kernel,
	nvdimm, fsverity, linux-mm, netfs, linux-ext4, linux-f2fs-devel,
	linux-nfs, linux-cifs, samba-technical, linux-nilfs, v9fs,
	linux-afs, autofs, ceph-devel, codalist, ecryptfs, linux-mtd,
	jfs-discussion, ntfs3, ocfs2-devel, devel, linux-unionfs,
	apparmor, linux-security-module, linux-integrity, selinux,
	amd-gfx, dri-devel, linux-media, linaro-mm-sig, netdev,
	linux-perf-users, linux-fscrypt, linux-xfs, linux-hams, linux-x25,
	audit, linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <1c28e34c7167acf4e20c3e201476504135aa44e8.camel@kernel.org>

On Tue, 03 Mar 2026, Jeff Layton wrote:
> On Tue, 2026-03-03 at 10:55 +0000, David Howells wrote:
> > Jeff Layton <jlayton@kernel.org> wrote:
> > 
> > > This version splits the change up to be more bisectable. It first adds a
> > > new kino_t typedef and a new "PRIino" macro to hold the width specifier
> > > for format strings. The conversion is done, and then everything is
> > > changed to remove the new macro and typedef.
> > 
> > Why remove the typedef?  It might be better to keep it.
> > 
> 
> Why? After this change, internal kernel inodes will be u64's -- full
> stop. I don't see what the macro or typedef will buy us at that point.

Implicit documentation?
ktime_t is (now) always s64, but we still keep the typedef.

It would be cool if we could teach vsprintf to understand some new
specifier to mean "kinode_t" or "ktime_t" etc.  But that would trigger
gcc warnings.

NeilBrown

^ permalink raw reply

* Re: [syzbot] [bpf?] [trace?] KASAN: slab-use-after-free Read in bpf_trace_run9
From: syzbot @ 2026-03-04  4:06 UTC (permalink / raw)
  To: andrii, ast, bpf, daniel, eddyz87, haoluo, john.fastabend, jolsa,
	kpsingh, linux-kernel, linux-trace-kernel, martin.lau,
	mathieu.desnoyers, mattbobrowski, mhiramat, rostedt, sdf, song,
	syzkaller-bugs, wangqing7171, yonghong.song
In-Reply-To: <20260304033822.107697-1-wangqing7171@gmail.com>

Hello,

syzbot has tested the proposed patch and the reproducer did not trigger any issue:

Reported-by: syzbot+b4c5ad098c821bf8d8bc@syzkaller.appspotmail.com
Tested-by: syzbot+b4c5ad098c821bf8d8bc@syzkaller.appspotmail.com

Tested on:

commit:         0031c068 Merge tag 'cgroup-for-7.0-rc2-fixes' of git:/..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=1434f0ba580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=c5c49ee0942d1cdb
dashboard link: https://syzkaller.appspot.com/bug?extid=b4c5ad098c821bf8d8bc
compiler:       Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
patch:          https://syzkaller.appspot.com/x/patch.diff?x=17e5bb5a580000

Note: testing is done by a robot and is best-effort only.

^ permalink raw reply

* Re: [syzbot] [bpf?] [trace?] KASAN: slab-use-after-free Read in bpf_trace_run9
From: Qing Wang @ 2026-03-04  3:38 UTC (permalink / raw)
  To: syzbot+b4c5ad098c821bf8d8bc
  Cc: andrii, ast, bpf, daniel, eddyz87, haoluo, john.fastabend, jolsa,
	kpsingh, linux-kernel, linux-trace-kernel, martin.lau,
	mathieu.desnoyers, mattbobrowski, mhiramat, rostedt, sdf, song,
	syzkaller-bugs, yonghong.song
In-Reply-To: <69a7276a.a70a0220.b118c.0011.GAE@google.com>

#syz test

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0378e83b4099..b1e3e9f59930 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3783,6 +3783,8 @@ static void bpf_raw_tp_link_release(struct bpf_link *link)
 
 	bpf_probe_unregister(raw_tp->btp, raw_tp);
 	bpf_put_raw_tracepoint(raw_tp->btp);
+
+	tracepoint_synchronize_unregister();
 }
 
 static void bpf_raw_tp_link_dealloc(struct bpf_link *link)

^ permalink raw reply related

* Re: [syzbot] [bpf?] [trace?] KASAN: slab-use-after-free Read in bpf_trace_run9
From: Qing Wang @ 2026-03-04  3:33 UTC (permalink / raw)
  To: syzbot+b4c5ad098c821bf8d8bc
  Cc: andrii, ast, bpf, daniel, eddyz87, haoluo, john.fastabend, jolsa,
	kpsingh, linux-kernel, linux-trace-kernel, martin.lau,
	mathieu.desnoyers, mattbobrowski, mhiramat, rostedt, sdf, song,
	syzkaller-bugs, yonghong.song
In-Reply-To: <69a7276a.a70a0220.b118c.0011.GAE@google.com>

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0378e83b4099..b1e3e9f59930 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3783,6 +3783,8 @@ static void bpf_raw_tp_link_release(struct bpf_link *link)
 
 	bpf_probe_unregister(raw_tp->btp, raw_tp);
 	bpf_put_raw_tracepoint(raw_tp->btp);
+
+	tracepoint_synchronize_unregister();
 }
 
 static void bpf_raw_tp_link_dealloc(struct bpf_link *link)

^ permalink raw reply related

* [PATCH] tracing: Disable preemption in the tracepoint callbacks handling filtered pids
From: Steven Rostedt @ 2026-03-04  2:57 UTC (permalink / raw)
  To: LKML, Linux trace kernel; +Cc: Masami Hiramatsu, Mathieu Desnoyers

From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>

Filtering PIDs for events triggered the following during selftests:

[37] event tracing - restricts events based on pid notrace filtering
[  155.874095]
[  155.874869] =============================
[  155.876037] WARNING: suspicious RCU usage
[  155.877287] 7.0.0-rc1-00004-g8cd473a19bc7 #7 Not tainted
[  155.879263] -----------------------------
[  155.882839] kernel/trace/trace_events.c:1057 suspicious rcu_dereference_check() usage!
[  155.889281]
[  155.889281] other info that might help us debug this:
[  155.889281]
[  155.894519]
[  155.894519] rcu_scheduler_active = 2, debug_locks = 1
[  155.898068] no locks held by ftracetest/4364.
[  155.900524]
[  155.900524] stack backtrace:
[  155.902645] CPU: 1 UID: 0 PID: 4364 Comm: ftracetest Not tainted 7.0.0-rc1-00004-g8cd473a19bc7 #7 PREEMPT(lazy)
[  155.902648] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
[  155.902651] Call Trace:
[  155.902655]  <TASK>
[  155.902659]  dump_stack_lvl+0x67/0x90
[  155.902665]  lockdep_rcu_suspicious+0x154/0x1a0
[  155.902672]  event_filter_pid_sched_process_fork+0x9a/0xd0
[  155.902678]  kernel_clone+0x367/0x3a0
[  155.902689]  __x64_sys_clone+0x116/0x140
[  155.902696]  do_syscall_64+0x158/0x460
[  155.902700]  ? entry_SYSCALL_64_after_hwframe+0x77/0x7f
[  155.902702]  ? trace_irq_disable+0x1d/0xc0
[  155.902709]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[  155.902711] RIP: 0033:0x4697c3
[  155.902716] Code: 1f 84 00 00 00 00 00 64 48 8b 04 25 10 00 00 00 45 31 c0 31 d2 31 f6 bf 11 00 20 01 4c 8d 90 d0 02 00 00 b8 38 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 35 89 c2 85 c0 75 2c 64 48 8b 04 25 10 00 00
[  155.902718] RSP: 002b:00007ffc41150428 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
[  155.902721] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004697c3
[  155.902722] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
[  155.902724] RBP: 0000000000000000 R08: 0000000000000000 R09: 000000003fccf990
[  155.902725] R10: 000000003fccd690 R11: 0000000000000246 R12: 0000000000000001
[  155.902726] R13: 000000003fce8103 R14: 0000000000000001 R15: 0000000000000000
[  155.902733]  </TASK>
[  155.902747]

The tracepoint callbacks recently were changed to allow preemption. The
event PID filtering callbacks that were attached to the fork and exit
tracepoints expected preemption disabled in order to access the RCU
protected PID lists.

Add a guard(preempt)() to protect the references to the PID list.

Cc: stable@vger.kernel.org
Fixes: a46023d5616e ("tracing: Guard __DECLARE_TRACE() use of __DO_TRACE_CALL() with SRCU-fast")
Link: https://patch.msgid.link/20260303131706.96057f61a48a34c43ce1e396@kernel.org
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e989362c7709..a4c6469e15d0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1039,6 +1039,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = data;
 
+	guard(preempt)();
 	pid_list = rcu_dereference_raw(tr->filtered_pids);
 	trace_filter_add_remove_task(pid_list, NULL, task);
 
@@ -1054,6 +1055,7 @@ event_filter_pid_sched_process_fork(void *data,
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = data;
 
+	guard(preempt)();
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 	trace_filter_add_remove_task(pid_list, self, task);
 
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH] ftrace: Disable preemption in the tracepoint callbacks handling filtered pids
From: Steven Rostedt @ 2026-03-04  2:13 UTC (permalink / raw)
  To: Masami Hiramatsu (Google); +Cc: LKML, Linux Trace Kernel, Mathieu Desnoyers
In-Reply-To: <20260304104833.7b157f3cd2ff911cbcf77660@kernel.org>

On Wed, 4 Mar 2026 10:48:33 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> I think we can combine those since it fixes the same commit.
> (just for bisectability)

They can still be separate. One is for function tracing and the other
is for events. They both may have the same fixes tag, but it shouldn't
affect bisectability, as they require different things to be enabled
to trigger.

> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks,

-- Steve

^ permalink raw reply

* Re: [PATCH 1/2] selftests/tracing: Fix to make --logdir option work again
From: Masami Hiramatsu @ 2026-03-04  2:04 UTC (permalink / raw)
  To: Masami Hiramatsu (Google), Shuah Khan
  Cc: Steven Rostedt, Shuah Khan, Gabriele Monaco, Mathieu Desnoyers,
	linux-kernel, linux-trace-kernel, linux-kselftest
In-Reply-To: <177071725191.2369897.14781037901532893911.stgit@mhiramat.tok.corp.google.com>

Hi Shuah,

Could you pick up these 2 patches for the selftests tree, or should I pick them up?

Thanks,

On Tue, 10 Feb 2026 18:54:12 +0900
"Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:

> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> Since commit a0aa283c53a7 ("selftest/ftrace: Generalise ftracetest to
> use with RV") moved the default LOG_DIR setting after --logdir option
> parser, it overwrites the user given LOG_DIR.
> This fixes it to check the --logdir option parameter when setting new
> default LOG_DIR with a new TOP_DIR.
> 
> Fixes: a0aa283c53a7 ("selftest/ftrace: Generalise ftracetest to use with RV")
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>  tools/testing/selftests/ftrace/ftracetest |   18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
> index 3230bd54dba8..0a56bf209f6c 100755
> --- a/tools/testing/selftests/ftrace/ftracetest
> +++ b/tools/testing/selftests/ftrace/ftracetest
> @@ -130,8 +130,7 @@ parse_opts() { # opts
>        shift 1
>      ;;
>      --logdir|-l)
> -      LOG_DIR=$2
> -      LINK_PTR=
> +      USER_LOG_DIR=$2
>        shift 2
>      ;;
>      --rv)
> @@ -199,6 +198,7 @@ fi
>  TOP_DIR=`absdir $0`
>  TEST_DIR=$TOP_DIR/test.d
>  TEST_CASES=`find_testcases $TEST_DIR`
> +USER_LOG_DIR=
>  KEEP_LOG=0
>  KTAP=0
>  DEBUG=0
> @@ -210,12 +210,18 @@ RV_TEST=0
>  # Parse command-line options
>  parse_opts $*
>  
> +[ $DEBUG -ne 0 ] && set -x
> +
> +# TOP_DIR can be changed for rv. Setting log directory.
>  LOG_TOP_DIR=$TOP_DIR/logs
>  LOG_DATE=`date +%Y%m%d-%H%M%S`
> -LOG_DIR=$LOG_TOP_DIR/$LOG_DATE/
> -LINK_PTR=$LOG_TOP_DIR/latest
> -
> -[ $DEBUG -ne 0 ] && set -x
> +if [ -n "$USER_LOG_DIR" ]; then
> +  LOG_DIR=$USER_LOG_DIR
> +  LINK_PTR=
> +else
> +  LOG_DIR=$LOG_TOP_DIR/$LOG_DATE/
> +  LINK_PTR=$LOG_TOP_DIR/latest
> +fi
>  
>  if [ $RV_TEST -ne 0 ]; then
>  	TRACING_DIR=$TRACING_DIR/rv
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v4 4/5] mm: rename zone->lock to zone->_lock
From: SeongJae Park @ 2026-03-04  1:50 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: SeongJae Park, Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Axel Rasmussen, Yuanchu Xie,
	Wei Xu, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Rafael J. Wysocki, Pavel Machek, Len Brown, Brendan Jackman,
	Johannes Weiner, Zi Yan, Oscar Salvador, Qi Zheng, Shakeel Butt,
	linux-kernel, linux-mm, linux-trace-kernel, linux-pm
In-Reply-To: <aabvc4Xhc9qBfaG7@shell.ilvokhin.com>

On Tue, 3 Mar 2026 14:25:55 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:

> On Mon, Mar 02, 2026 at 02:37:43PM -0800, Andrew Morton wrote:
> > On Mon, 2 Mar 2026 15:10:03 +0100 "Vlastimil Babka (SUSE)" <vbabka@kernel.org> wrote:
> > 
> > > On 2/27/26 17:00, Dmitry Ilvokhin wrote:
> > > > This intentionally breaks direct users of zone->lock at compile time so
> > > > all call sites are converted to the zone lock wrappers. Without the
> > > > rename, present and future out-of-tree code could continue using
> > > > spin_lock(&zone->lock) and bypass the wrappers and tracing
> > > > infrastructure.
> > > > 
> > > > No functional change intended.
> > > > 
> > > > Suggested-by: Andrew Morton <akpm@linux-foundation.org>
> > > > Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
> > > > Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> > > > Acked-by: SeongJae Park <sj@kernel.org>
> > > 
> > > I see some more instances of 'zone->lock' in comments in
> > > include/linux/mmzone.h and under Documentation/ but otherwise LGTM.
> > > 
> > 
> > I fixed (most of) that in the previous version but my fix was lost.
> 
> Thanks for the fixups, Andrew.
> 
> I still see a few 'zone->lock' references in Documentation remain on
> mm-new. This patch cleans them up, as noted by Vlastimil.
> 
> I'm happy to adjust this patch if anything else needs attention.
> 
> From 9142d5a8b60038fa424a6033253960682e5a51f4 Mon Sep 17 00:00:00 2001
> From: Dmitry Ilvokhin <d@ilvokhin.com>
> Date: Tue, 3 Mar 2026 06:13:13 -0800
> Subject: [PATCH] mm: fix remaining zone->lock references
> 
> Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
> ---
>  Documentation/mm/physical_memory.rst | 4 ++--
>  Documentation/trace/events-kmem.rst  | 8 ++++----
>  2 files changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/mm/physical_memory.rst b/Documentation/mm/physical_memory.rst
> index b76183545e5b..e344f93515b6 100644
> --- a/Documentation/mm/physical_memory.rst
> +++ b/Documentation/mm/physical_memory.rst
> @@ -500,11 +500,11 @@ General
>  ``nr_isolate_pageblock``
>    Number of isolated pageblocks. It is used to solve incorrect freepage counting
>    problem due to racy retrieving migratetype of pageblock. Protected by
> -  ``zone->lock``. Defined only when ``CONFIG_MEMORY_ISOLATION`` is enabled.
> +  ``zone_lock``. Defined only when ``CONFIG_MEMORY_ISOLATION`` is enabled.

Dmitry's original patch [1] was doing 's/zone->lock/zone->_lock/', which aligns
with my expectation.  But this patch is doing 's/zone->lock/zone_lock/'.  Same
for the rest of this patch.

I was initially thinking this is just a mistake, but I also found Andrew is
doing the same change [2], so I'm a bit confused.  Is this an intentional change?

[1] https://lore.kernel.org/d61500c5784c64e971f4d328c57639303c475f81.1772206930.git.d@ilvokhin.com
[2] https://lore.kernel.org/20260302143743.220eed4feb36d7572fe726cc@linux-foundation.org


Thanks,
SJ

[...]

^ permalink raw reply

* Re: [PATCH] ftrace: Disable preemption in the tracepoint callbacks handling filtered pids
From: Masami Hiramatsu @ 2026-03-04  1:48 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: LKML, Linux Trace Kernel, Mathieu Desnoyers
In-Reply-To: <20260303093949.4210e854@gandalf.local.home>

On Tue, 3 Mar 2026 09:39:49 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 3 Mar 2026 13:17:06 +0900
> Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> 
> > diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> > index 9928da636c9d..3b3aaf4831e9 100644
> > --- a/kernel/trace/trace_events.c
> > +++ b/kernel/trace/trace_events.c
> > @@ -1039,6 +1039,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
> >  	struct trace_pid_list *pid_list;
> >  	struct trace_array *tr = data;
> >  
> > +	guard(preempt)();
> >  	pid_list = rcu_dereference_raw(tr->filtered_pids);
> >  	trace_filter_add_remove_task(pid_list, NULL, task);
> >  
> > @@ -1054,6 +1055,7 @@ event_filter_pid_sched_process_fork(void *data,
> >  	struct trace_pid_list *pid_list;
> >  	struct trace_array *tr = data;
> >  
> > +	guard(preempt)();
> >  	pid_list = rcu_dereference_sched(tr->filtered_pids);
> >  	trace_filter_add_remove_task(pid_list, self, task);
> >  
> 
> Yep. I can make them two different patches. Do you want me to take this as
> your patch? I'll need a SoB from you then.

I think we can combine those since it fixes the same commit.
(just for bisectability)

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thank you,

> 
> -- Steve


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [BUG] kprobes: WARNING in __arm_kprobe_ftrace when kprobe-ftrace arming fails with -ENOMEM under fault injection
From: Masami Hiramatsu @ 2026-03-04  1:44 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Zw Tang, Naveen N Rao, Steven Rostedt, linux-kernel,
	linux-trace-kernel, linux-perf-users, Arnaldo Carvalho de Melo,
	David S . Miller
In-Reply-To: <20260302173551.2555701-1-sashal@kernel.org>

On Mon,  2 Mar 2026 12:35:49 -0500
Sasha Levin <sashal@kernel.org> wrote:

> https://lore.kernel.org/all/CAPHJ_V+J6YDb_wX2nhXU6kh466Dt_nyDSas-1i_Y8s7tqY-Mzw@mail.gmail.com/
> 
> ## 1. Bug Summary
> 
> A WARNING is triggered in `__arm_kprobe_ftrace()` at `kernel/kprobes.c:1147` when fault injection causes `ftrace_set_filter_ip()` to return `-ENOMEM` during kprobe arming via `perf_event_open()`. This is a false-positive warning — the error path itself is correct and the error propagates cleanly to userspace, but the `WARN_ONCE()` macro fires a kernel warning splat that is inappropriate for a recoverable allocation failure. The affected subsystem is kprobes/ftrace. Severity: warning only (no crash, hang, or data corruption).

Thanks for the analysis, and nice catch, Zw Tang!
This looks like a good analysis to me.

> 
> ## 2. Stack Trace Analysis
> 
> ```
> WARNING: kernel/kprobes.c:1147 at arm_kprobe+0x563/0x620, CPU#0
> Call Trace:
>  <TASK>
>  enable_kprobe+0x1fc/0x2c0
>  enable_trace_kprobe+0x227/0x4b0
>  kprobe_register+0x84/0xc0
>  perf_trace_event_init+0x527/0xa20
>  perf_kprobe_init+0x156/0x200
>  perf_kprobe_event_init+0x101/0x1c0
>  perf_try_init_event+0x145/0xa10
>  perf_event_alloc+0x1f91/0x5390
>  __do_sys_perf_event_open+0x557/0x2d50
>  do_syscall_64+0x129/0x1160
>  entry_SYSCALL_64_after_hwframe+0x4b/0x53
>  </TASK>
> ```
> 
> The crash point is `__arm_kprobe_ftrace()` at `kernel/kprobes.c:1147`, inlined into `arm_kprobe()`. The calling chain is process context: `perf_event_open()` syscall -> `perf_kprobe_event_init()` -> `enable_trace_kprobe()` -> `enable_kprobe()` -> `arm_kprobe()` -> `__arm_kprobe_ftrace()`. R12 holds `0xfffffff4` which is `-12` (`-ENOMEM`), confirming the allocation failure injected by fault injection.
> 
> ## 3. Root Cause Analysis
> 
> The root cause is an overly aggressive `WARN_ONCE()` in `__arm_kprobe_ftrace()` at `kernel/kprobes.c:1147`:
> 
> ```c
> ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
> if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret))
>     return ret;
> ```
> 
> Prior to commit 9c89bb8e3272 ("kprobes: treewide: Cleanup the error messages for kprobes"), this was a simple `pr_debug()`. That commit promoted it to `WARN_ONCE()` as part of a treewide message cleanup, under the rationale that failures here indicate unexpected conditions. However, `ftrace_set_filter_ip()` calls into memory allocation paths (e.g., `ftrace_hash_move_and_update_ops()` -> `__ftrace_hash_update_ipmodify()` or `ftrace_hash` allocation), and those allocations can legitimately fail under memory pressure or fault injection.
> 
> The error handling is actually correct — the `-ENOMEM` propagates back through `arm_kprobe()` -> `enable_kprobe()` and ultimately causes the `perf_event_open()` syscall to return an error to userspace. The only problem is the spurious `WARN_ONCE()` which triggers a kernel warning splat and stack trace for what is a recoverable, non-buggy situation.
> 
> The same issue also applies to the `WARN()` on line 1152 for `register_ftrace_function()`, which can also fail with `-ENOMEM`.
> 
> ## 4. Affected Versions
> 
> This issue was introduced by commit 9c89bb8e3272 ("kprobes: treewide: Cleanup the error messages for kprobes"), which first appeared in v5.16-rc1. All kernel versions from v5.16 onward are affected, including the reporter's v7.0.0-rc1.
> 
> This is a regression from v5.15, where the same failure path used `pr_debug()` and did not emit any warning.
> 
> ## 5. Relevant Commits and Fixes
> 
> Introducing commit:
>   9c89bb8e3272 ("kprobes: treewide: Cleanup the error messages for kprobes")
>   Author: Masami Hiramatsu <mhiramat@kernel.org>
>   Merged in v5.16-rc1
> 
> This commit changed `pr_debug()` to `WARN_ONCE()` in `__arm_kprobe_ftrace()` for the `ftrace_set_filter_ip()` failure path, and changed a `pr_debug()` to `WARN()` for the `register_ftrace_function()` failure path.
> 
> No fix for this issue exists in mainline or stable as of the reporter's kernel version.
> 
> The suggested fix is to downgrade both `WARN_ONCE()` (line 1147) and `WARN()` (line 1152) in `__arm_kprobe_ftrace()` back to `pr_warn_once()` / `pr_warn()` respectively. This preserves the improved error messages from 9c89bb8e3272 while avoiding spurious warning splats on recoverable failures. The error code is already propagated correctly to the caller.
> 
> ## 6. Prior Discussions
> 
> No prior reports of this specific issue were found on lore.kernel.org. No related mailing list discussions or proposed patches addressing the WARN severity in `__arm_kprobe_ftrace()` were found.
> 
> ## 7. Suggested Actions
> 
> 1. The fix is to downgrade the WARN_ONCE/WARN in __arm_kprobe_ftrace()
>    (kernel/kprobes.c lines 1147 and 1152) to pr_warn_once/pr_warn.
>    Specifically:
> 
>    - Line 1147: Change WARN_ONCE(ret < 0, ...) to a simple
>      if (ret < 0) { pr_warn_once(...); return ret; }
> 
>    - Line 1152: Change WARN(ret < 0, ...) to a simple
>      if (ret < 0) { pr_warn(...); goto err_ftrace; }

Sounds reasonable. Let's fix it.

> 
> 2. This is a low-severity issue that only manifests with fault injection
>    enabled. No data corruption or crash occurs — the error is correctly
>    propagated to userspace. The warning is cosmetic but noisy and can
>    cause false-positive syzbot/syzkaller reports.
> 

Yeah, these warnings are just too aggressive. Let me remove them where the
error can be handled correctly.

Thank you,

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v2 053/110] uprobes: use PRIino format for i_ino
From: Masami Hiramatsu @ 2026-03-04  1:12 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Alexander Viro, Christian Brauner, Jan Kara, Steven Rostedt,
	Mathieu Desnoyers, Dan Williams, Matthew Wilcox, Eric Biggers,
	Theodore Y. Ts'o, Muchun Song, Oscar Salvador,
	David Hildenbrand, David Howells, Paulo Alcantara, Andreas Dilger,
	Jan Kara, Jaegeuk Kim, Chao Yu, Trond Myklebust, Anna Schumaker,
	Chuck Lever, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Steve French, Ronnie Sahlberg, Shyam Prasad N, Bharath SM,
	Alexander Aring, Ryusuke Konishi, Viacheslav Dubeyko,
	Eric Van Hensbergen, Latchesar Ionkov, Dominique Martinet,
	Christian Schoenebeck, David Sterba, Marc Dionne, Ian Kent,
	Luis de Bethencourt, Salah Triki, Tigran A. Aivazian,
	Ilya Dryomov, Alex Markuze, Jan Harkes, coda, Nicolas Pitre,
	Tyler Hicks, Amir Goldstein, Christoph Hellwig,
	John Paul Adrian Glaubitz, Yangtao Li, Mikulas Patocka,
	David Woodhouse, Richard Weinberger, Dave Kleikamp,
	Konstantin Komarov, Mark Fasheh, Joel Becker, Joseph Qi,
	Mike Marshall, Martin Brandenburg, Miklos Szeredi, Anders Larsen,
	Zhihao Cheng, Damien Le Moal, Naohiro Aota, Johannes Thumshirn,
	John Johansen, Paul Moore, James Morris, Serge E. Hallyn,
	Mimi Zohar, Roberto Sassu, Dmitry Kasatkin, Eric Snowberg, Fan Wu,
	Stephen Smalley, Ondrej Mosnacek, Casey Schaufler, Alex Deucher,
	Christian König, David Airlie, Simona Vetter, Sumit Semwal,
	Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	David S. Miller, Jakub Kicinski, Simon Horman, Oleg Nesterov,
	Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, James Clark, Darrick J. Wong,
	Martin Schiller, Eric Paris, Joerg Reuter, Marcel Holtmann,
	Johan Hedberg, Luiz Augusto von Dentz, Oliver Hartkopp,
	Marc Kleine-Budde, David Ahern, Neal Cardwell, Steffen Klassert,
	Herbert Xu, Remi Denis-Courmont, Marcelo Ricardo Leitner,
	Xin Long, Magnus Karlsson, Maciej Fijalkowski, Stanislav Fomichev,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend, linux-fsdevel, linux-kernel, linux-trace-kernel,
	nvdimm, fsverity, linux-mm, netfs, linux-ext4, linux-f2fs-devel,
	linux-nfs, linux-cifs, samba-technical, linux-nilfs, v9fs,
	linux-afs, autofs, ceph-devel, codalist, ecryptfs, linux-mtd,
	jfs-discussion, ntfs3, ocfs2-devel, devel, linux-unionfs,
	apparmor, linux-security-module, linux-integrity, selinux,
	amd-gfx, dri-devel, linux-media, linaro-mm-sig, netdev,
	linux-perf-users, linux-fscrypt, linux-xfs, linux-hams, linux-x25,
	audit, linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <20260302-iino-u64-v2-53-e5388800dae0@kernel.org>

On Mon, 02 Mar 2026 15:24:37 -0500
Jeff Layton <jlayton@kernel.org> wrote:

> Convert uprobes i_ino format strings to use the PRIino format
> macro in preparation for the widening of i_ino via kino_t.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>

Looks good to me.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Thanks,

> ---
>  kernel/events/uprobes.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
> index 923b24b321cc0fbdecaf016645cdac0457a74463..d5bf51565851223730c63b50436c493c0c05eafd 100644
> --- a/kernel/events/uprobes.c
> +++ b/kernel/events/uprobes.c
> @@ -344,7 +344,7 @@ __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
>  static void update_ref_ctr_warn(struct uprobe *uprobe,
>  				struct mm_struct *mm, short d)
>  {
> -	pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
> +	pr_warn("ref_ctr %s failed for inode: 0x%" PRIino "x offset: "
>  		"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
>  		d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
>  		(unsigned long long) uprobe->offset,
> @@ -982,7 +982,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
>  static void
>  ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
>  {
> -	pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
> +	pr_warn("ref_ctr_offset mismatch. inode: 0x%" PRIino "x offset: 0x%llx "
>  		"ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
>  		uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
>  		(unsigned long long) cur_uprobe->ref_ctr_offset,
> 
> -- 
> 2.53.0
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH 3/2] tracing: Show TID and flags for PI futex system call trace event
From: Steven Rostedt @ 2026-03-03 22:44 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Thomas Gleixner, Peter Zijlstra, Brian Geffon, John Stultz,
	Ian Rogers, Suleiman Souhlal
In-Reply-To: <20260303214735.002154462@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

For the futex system call trace event for FUTEX_LOCK_PI and
FUTEX_UNLOCK_PI commands, show the TID from the value (which is usually in
hex) as well as translate the flags (DIED and WAITERS).

 pi_mutex_hammer-1098    [000] .....   121.876928: sys_futex(uaddr: 0x560f40cc8180 (0x450) tid: 1104, FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7f2f9d4b1e50 (0.000100000))
 pi_mutex_hammer-1128    [000] .....   121.877120: sys_futex(uaddr: 0x560f40cc8180 (0x8000042a) tid: 1066 (WAITERS), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7f2f8e493e50 (0.000100000))
 pi_mutex_hammer-1106    [000] .....   121.877242: sys_futex(uaddr: 0x560f40cc8180 (0x80000452) tid: 1106 (WAITERS), FUTEX_UNLOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)

This makes it easier to see the hand off of a mutex and who the owner was.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d128dcb218a7..b3eddfee1a6b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -300,6 +300,9 @@ sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata
 
 	trace_seq_printf(s, "%s(", entry->name);
 
+	op = trace->args[1];
+	cmd = op & FUTEX_CMD_MASK;
+
 	for (i = 0; !done && i < entry->nb_args; i++) {
 
 		if (trace_seq_has_overflowed(s))
@@ -314,11 +317,29 @@ sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata
 					trace_seq_printf(s, " (%u)", val);
 				else
 					trace_seq_printf(s, " (0x%x)", val);
+
+				switch(cmd) {
+				case FUTEX_LOCK_PI:
+				case FUTEX_UNLOCK_PI:
+					trace_seq_printf(s, " tid: %d", val & FUTEX_TID_MASK);
+
+					if (!(val & (FUTEX_OWNER_DIED|FUTEX_WAITERS)))
+						break;
+
+					trace_seq_puts(s, " (");
+					if (val & FUTEX_WAITERS)
+						trace_seq_puts(s, "WAITERS");
+					if (val & FUTEX_OWNER_DIED) {
+						if (op & FUTEX_WAITERS)
+							trace_seq_putc(s, '|');
+						trace_seq_puts(s, "DIED");
+					}
+					trace_seq_putc(s, ')');
+					break;
+				}
 			}
 			continue;
 		case 1:
-			op = trace->args[i];
-			cmd = op & FUTEX_CMD_MASK;
 			if (cmd <= FUTEX_LOCK_PI2)
 				trace_seq_printf(s, ", %s", __futex_cmds[cmd]);
 			else
-- 
2.51.0


^ permalink raw reply related

* [PATCH 2/2] tracing: Update futex syscall trace event to show more commands
From: Steven Rostedt @ 2026-03-03 21:47 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Thomas Gleixner, Peter Zijlstra, Brian Geffon, John Stultz,
	Ian Rogers, Suleiman Souhlal
In-Reply-To: <20260303214735.002154462@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

Make the futex syscall trace event a little smarter. Have it read the
futex_op instruction to determine what else it can save and print. For the
appropriate options, it will read the utime (timespec) parameter and show
its output as well as the uaddr2.

 futex_requeue_p-1154    [004] .....   144.568339: sys_futex(uaddr: 0x5652b178d834 (0x482), FUTEX_UNLOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)
 futex_requeue_p-1162    [002] .....   144.568696: sys_futex(uaddr: 0x7f763b7fece0 (2), FUTEX_WAIT|FUTEX_PRIVATE_FLAG, val: 2)
 futex_requeue_p-1151    [000] .....   144.568700: sys_futex(uaddr: 0x7f763b7fece0 (0), FUTEX_WAKE|FUTEX_PRIVATE_FLAG, val: 1)
 futex_requeue_p-1162    [002] .....   144.568705: sys_futex(uaddr: 0x7f763b7fece0 (0), FUTEX_WAKE|FUTEX_PRIVATE_FLAG, val: 1)
 futex_requeue_p-1151    [000] .....   144.568715: sys_futex(uaddr: 0x7f764369e990 (0x483), FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, val: 1155)
 futex_requeue_p-1155    [005] .....   144.569420: sys_futex(uaddr: 0x5652b178d838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7ffdacfba500 (143.890024054), uaddr2: 0x5652b178d834 (0), val3: 0)
 futex_requeue_p-1155    [005] .....   144.569454: sys_futex(uaddr: 0x5652b178d834 (0), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 138 +++++++++++++++++++++++++++++-----
 1 file changed, 121 insertions(+), 17 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bc60a0497bcc..d128dcb218a7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -273,10 +273,18 @@ static __always_inline bool futex_cmd_has_addr2(u32 cmd)
 	return false;
 }
 
+struct futex_data {
+	u32		val1;
+	u32		val2;
+	unsigned long	ts1;
+	unsigned long	ts2;
+};
+
 static enum print_line_t
 sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
 		      struct trace_seq *s, struct trace_event *event, int ent_size)
 {
+	struct futex_data *data;
 	bool done = false;
 	unsigned int op, cmd;
 	void *end = (void *)trace + ent_size;
@@ -285,8 +293,10 @@ sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata
 
 	/* Set ptr to the user space copied area */
 	ptr = (void *)trace->args + sizeof(unsigned long) * entry->nb_args;
-	if (ptr + 4 > end)
-		ptr = NULL;
+	if (ptr + sizeof(*data) > end)
+		data = NULL;
+	else
+		data = ptr;
 
 	trace_seq_printf(s, "%s(", entry->name);
 
@@ -298,8 +308,8 @@ sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata
 		switch (i) {
 		case 0:
 			trace_seq_printf(s, "uaddr: 0x%lx", trace->args[i]);
-			if (ptr) {
-				u32 val = *(u32 *)ptr;
+			if (data) {
+				u32 val = data->val1;
 				if (val < 10)
 					trace_seq_printf(s, " (%u)", val);
 				else
@@ -338,6 +348,15 @@ sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata
 			trace_seq_printf(s, ", timespec: 0x%lx",
 					 trace->args[i]);
 
+			if (!data)
+				continue;
+
+			if (!data->ts1 && !data->ts2) {
+				trace_seq_puts(s, " (0)");
+				continue;
+			}
+			trace_seq_printf(s, " (%lu.%09lu)",
+					 data->ts1, data->ts2);
 			continue;
 
 		case 4:
@@ -346,6 +365,12 @@ sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata
 				continue;
 			}
 			trace_seq_printf(s, ", uaddr2: 0x%lx", trace->args[i]);
+
+			if (!data)
+				continue;
+
+			trace_seq_printf(s, " (%x)", data->val2);
+			continue;
 		}
 
 		trace_seq_printf(s, ", %s: %lu", entry->args[i],
@@ -570,9 +595,9 @@ sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
 			"\"uaddr: 0x%%lx (0x%%lx) cmd=%%s%%s%%s");
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
-			"  val: 0x%%x timeout/val2: 0x%%llx");
+			"  val: 0x%%x timeout/val2: 0x%%llx (%%lu.%%lu)");
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
-			" uaddr2: 0x%%lx val3: 0x%%x\", ");
+			" uaddr2: 0x%%lx (0x%%lx) val3: 0x%%x\", ");
 
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
 			" REC->uaddr,");
@@ -618,10 +643,12 @@ sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 			FUTEX_CLOCK_REALTIME);
 
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
-			" REC->val, REC->utime,");
+			" REC->val, REC->utime, REC->__ts1, REC->__ts2,");
 
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
-			" REC->uaddr, REC->val3");
+			" REC->uaddr,");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->__value2, REC->val3");
 	return pos;
 }
 
@@ -724,7 +751,39 @@ static int __init futex_fields(struct trace_event_call *call, int offset)
 	ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0,
 				 FILTER_OTHER);
 	if (ret)
-		kfree(arg);
+		goto free;
+	offset += sizeof(int);
+
+	arg = kstrdup("__value2", GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg))
+		return -ENOMEM;
+	ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0,
+				 FILTER_OTHER);
+	if (ret)
+		goto free;
+	offset += sizeof(int);
+
+	arg = kstrdup("__ts1", GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg))
+		return -ENOMEM;
+	ret = trace_define_field(call, "unsigned long", arg, offset,
+				 sizeof(unsigned long), 0, FILTER_OTHER);
+	if (ret)
+		goto free;
+	offset += sizeof(long);
+
+	arg = kstrdup("__ts2", GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg))
+		return -ENOMEM;
+	ret = trace_define_field(call, "unsigned long", arg, offset,
+				 sizeof(unsigned long), 0, FILTER_OTHER);
+	if (ret)
+		goto free;
+
+	return 0;
+
+free:
+	kfree(arg);
 	return ret;
 }
 
@@ -897,11 +956,51 @@ static int syscall_copy_user_array(char *buf, const char __user *ptr,
 	return 0;
 }
 
+struct tp_futex_data {
+	u32			cmd;
+	const u32		__user *val1;
+	const u32 		__user *val2;
+	void			__user *timeout;
+};
+
+static int syscall_copy_futex(char *buf, const char __user *ptr,
+			      size_t size, void *data)
+{
+	struct tp_futex_data *tp_data = data;
+	struct futex_data *fdata = (void *)buf;
+	int cmd = tp_data->cmd & FUTEX_CMD_MASK;
+	int ret;
+
+	memset(fdata, 0, sizeof(*fdata));
+
+	if (tp_data->val1) {
+		ret = __copy_from_user(&fdata->val1, tp_data->val1, 4);
+		if (ret)
+			return -1;
+	}
+
+	if (tp_data->val2 && futex_cmd_has_addr2(cmd)) {
+		ret = __copy_from_user(&fdata->val2, tp_data->val2, 4);
+		if (ret)
+			return -1;
+	}
+
+	if (tp_data->timeout && futex_cmd_has_timeout(cmd)) {
+		/* Copies both ts1 and ts2 */
+		ret = __copy_from_user(&fdata->ts1, tp_data->timeout,
+				       sizeof(long) * 2);
+		if (ret)
+			return -1;
+	}
+
+	return 0;
+}
+
 static int
 syscall_get_futex(unsigned long *args, char **buffer, int *size, int buf_size)
 {
 	struct syscall_user_buffer *sbuf;
-	const char __user *ptr;
+	struct tp_futex_data tp_data;
 	char *buf;
 
 	/* buf_size of zero means user doesn't want user space read */
@@ -913,14 +1012,18 @@ syscall_get_futex(unsigned long *args, char **buffer, int *size, int buf_size)
 	if (!sbuf)
 		return -1;
 
-	ptr = (char __user *)args[0];
+	tp_data.cmd = args[1];
+	tp_data.val1 = (u32 __user *)args[0];
+	tp_data.val2 = (u32 __user *)args[4];
+	tp_data.timeout = (u64 __user *)args[3];
 
-	*buffer = trace_user_fault_read(&sbuf->buf, ptr, 4, NULL, NULL);
+	*buffer = trace_user_fault_read(&sbuf->buf, NULL, 0,
+					syscall_copy_futex, &tp_data);
 	if (!*buffer)
 		return -1;
 
-	/* Add room for the value */
-	*size += 4;
+	/* Add room for values */
+	*size += sizeof(struct futex_data);
 
 	buf = *buffer;
 
@@ -931,12 +1034,13 @@ static void syscall_put_futex(struct syscall_metadata *sys_data,
 			      struct syscall_trace_enter *entry,
 			      char *buffer)
 {
-	u32 *ptr;
+	struct futex_data *fdata = (void *)buffer;
+	struct futex_data *data;
 
 	/* Place the futex key into the storage */
-	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+	data = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
 
-	*ptr = *(u32 *)buffer;
+	*data = *fdata;
 }
 
 static char *sys_fault_user(unsigned int buf_size,
-- 
2.51.0



^ permalink raw reply related

* [PATCH 0/2] tracing: Read user data from futex system call trace event
From: Steven Rostedt @ 2026-03-03 21:47 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Thomas Gleixner, Peter Zijlstra, Brian Geffon, John Stultz,
	Ian Rogers, Suleiman Souhlal


We are looking at the performance of futexes and require a bit more
information when tracing them.

The two patches here extend the system call reading of user space to
create specific handling of the futex system call. It now reads the
user space relevant data (the addr, utime and addr2), as well as
parses the flags. This adds a little smarts to the trace event as
it only shows the parameters that are relevant, as well as parses
utime as either a timespec or as val2 depending on the futex_op.

Here's an example of the new output:

   futex_requeue-1431    [004] .....   154.082233: sys_futex(uaddr: 0x7ffcadad2ff8 (0), FUTEX_WAIT, val: 0, timespec: 0x7f0785f83e90 (0.030000000))
   futex_requeue-1421    [002] .....   154.092286: sys_futex(uaddr: 0x7ffcadad2ff8 (0), FUTEX_CMP_REQUEUE, val: 3, val2: 0x7, uaddr2: 0x7ffcadad2ffc (0), val3: 0)
   futex_requeue-1421    [002] .....   154.092321: sys_futex(uaddr: 0x7ffcadad2ffc (0), FUTEX_WAKE, val: 2147483647)
     futex_waitv-1433    [006] .....   154.111394: sys_futex(uaddr: 0x55ed4a768554 (0), FUTEX_WAKE|FUTEX_PRIVATE_FLAG, val: 1)
     futex_waitv-1435    [002] .....   154.128186: sys_futex(uaddr: 0x7f035fa40000 (0), FUTEX_WAKE, val: 1)


Steven Rostedt (2):
      tracing: Have futex syscall trace event show specific user data
      tracing: Update futex syscall trace event to show more commands

----
 kernel/trace/trace_syscalls.c | 370 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 367 insertions(+), 3 deletions(-)

^ permalink raw reply

* [PATCH 1/2] tracing: Have futex syscall trace event show specific user data
From: Steven Rostedt @ 2026-03-03 21:47 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Thomas Gleixner, Peter Zijlstra, Brian Geffon, John Stultz,
	Ian Rogers, Suleiman Souhlal
In-Reply-To: <20260303214735.002154462@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

Add specific reporting of the futex system call. This makes debugging
the futex code a bit easier. Instead of just showing the values passed
into the futex system call, read the value of the user space memory
pointed to by the addr parameter.

Also make the op parameter more readable by parsing the values to show
what the command is:

 futex_requeue_p-3251    [002] .....  2101.068479: sys_futex(uaddr: 0x55e79a4da834 (0x80000cb1), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)
 futex_requeue_p-3248    [001] .....  2101.068970: sys_futex(uaddr: 0x7f859072f990 (0xcb2), FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, val: 3250)
 futex_requeue_p-3252    [005] .....  2101.069108: sys_futex(uaddr: 0x55e79a4da838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7ffe61076aa0, uaddr2: 0x55e79a4da834, uaddr2: 94453214586932, val3: 0)
 futex_requeue_p-3252    [005] .....  2101.069410: sys_futex(uaddr: 0x55e79a4da834 (0x80000cb1), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 266 +++++++++++++++++++++++++++++++++-
 1 file changed, 263 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e96d0063cbcf..bc60a0497bcc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -11,6 +11,8 @@
 #include <linux/xarray.h>
 #include <asm/syscall.h>
 
+#include <uapi/linux/futex.h>
+
 #include "trace_output.h"
 #include "trace.h"
 
@@ -237,6 +239,125 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat
 	return trace_handle_return(s);
 }
 
+static const char * __futex_cmds[] =
+{
+	"FUTEX_WAIT", "FUTEX_WAKE", "FUTEX_FD", "FUTEX_REQUEUE",
+	"FUTEX_CMP_REQUEUE", "FUTEX_WAKE_OP", "FUTEX_LOCK_PI",
+	"FUTEX_UNLOCK_PI", "FUTEX_TRYLOCK_PI", "FUTEX_WAIT_BITSET",
+	"FUTEX_WAKE_BITSET", "FUTEX_WAIT_REQUEUE_PI", "FUTEX_CMP_REQUEUE_PI",
+	"FUTEX_LOCK_PI2", };
+
+/* From futex/syscalls.c */
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+	switch (cmd) {
+	case FUTEX_WAIT:
+	case FUTEX_LOCK_PI:
+	case FUTEX_LOCK_PI2:
+	case FUTEX_WAIT_BITSET:
+	case FUTEX_WAIT_REQUEUE_PI:
+		return true;
+	}
+	return false;
+}
+
+static __always_inline bool futex_cmd_has_addr2(u32 cmd)
+{
+	switch (cmd) {
+	case FUTEX_REQUEUE:
+	case FUTEX_CMP_REQUEUE:
+	case FUTEX_WAKE_OP:
+	case FUTEX_WAIT_REQUEUE_PI:
+		return true;
+	}
+	return false;
+}
+
+static enum print_line_t
+sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
+		      struct trace_seq *s, struct trace_event *event, int ent_size)
+{
+	bool done = false;
+	unsigned int op, cmd;
+	void *end = (void *)trace + ent_size;
+	void *ptr;
+	int i;
+
+	/* Set ptr to the user space copied area */
+	ptr = (void *)trace->args + sizeof(unsigned long) * entry->nb_args;
+	if (ptr + 4 > end)
+		ptr = NULL;
+
+	trace_seq_printf(s, "%s(", entry->name);
+
+	for (i = 0; !done && i < entry->nb_args; i++) {
+
+		if (trace_seq_has_overflowed(s))
+			goto end;
+
+		switch (i) {
+		case 0:
+			trace_seq_printf(s, "uaddr: 0x%lx", trace->args[i]);
+			if (ptr) {
+				u32 val = *(u32 *)ptr;
+				if (val < 10)
+					trace_seq_printf(s, " (%u)", val);
+				else
+					trace_seq_printf(s, " (0x%x)", val);
+			}
+			continue;
+		case 1:
+			op = trace->args[i];
+			cmd = op & FUTEX_CMD_MASK;
+			if (cmd <= FUTEX_LOCK_PI2)
+				trace_seq_printf(s, ", %s", __futex_cmds[cmd]);
+			else
+				trace_seq_puts(s, ", UNKNOWN");
+
+			if (op & FUTEX_PRIVATE_FLAG)
+				trace_seq_puts(s, "|FUTEX_PRIVATE_FLAG");
+			if (op & FUTEX_CLOCK_REALTIME)
+				trace_seq_puts(s, "|FUTEX_CLOCK_REALTIME");
+			continue;
+		case 3:
+			if (!futex_cmd_has_timeout(cmd)) {
+
+				if (!futex_cmd_has_addr2(cmd)) {
+					done = true;
+					continue;
+				}
+
+				trace_seq_printf(s, ", val2: 0x%x",
+						 (u32)(long)trace->args[i]);
+				continue;
+			}
+
+			if (!trace->args[i])
+				continue;
+
+			trace_seq_printf(s, ", timespec: 0x%lx",
+					 trace->args[i]);
+
+			continue;
+
+		case 4:
+			if (!futex_cmd_has_addr2(cmd)) {
+				done = true;
+				continue;
+			}
+			trace_seq_printf(s, ", uaddr2: 0x%lx", trace->args[i]);
+		}
+
+		trace_seq_printf(s, ", %s: %lu", entry->args[i],
+				 trace->args[i]);
+	}
+	trace_seq_putc(s, ')');
+end:
+	trace_seq_putc(s, '\n');
+
+	return trace_handle_return(s);
+}
+
 static enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags,
 		    struct trace_event *event)
@@ -267,6 +388,10 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
 			return sys_enter_openat_print(trace, entry, s, event);
 		break;
+	case __NR_futex:
+		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
+			return sys_enter_futex_print(trace, entry, s, event, iter->ent_size);
+		break;
 	default:
 		break;
 	}
@@ -437,6 +562,69 @@ sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	return pos;
 }
 
+static int __init
+sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len)
+{
+	int pos = 0;
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"\"uaddr: 0x%%lx (0x%%lx) cmd=%%s%%s%%s");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"  val: 0x%%x timeout/val2: 0x%%llx");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" uaddr2: 0x%%lx val3: 0x%%x\", ");
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->uaddr,");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->__value,");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"  __print_symbolic(REC->op & 0x%x, ", FUTEX_CMD_MASK);
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_WAIT\"}, ", FUTEX_WAIT);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_WAKE\"}, ", FUTEX_WAKE);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_FD\"}, ", FUTEX_FD);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_REQUEUE\"}, ", FUTEX_REQUEUE);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_CMP_REQUEUE\"}, ", FUTEX_CMP_REQUEUE);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_WAKE_OP\"}, ", FUTEX_WAKE_OP);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_LOCK_PI\"}, ", FUTEX_LOCK_PI);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_UNLOCK_PI\"}, ", FUTEX_UNLOCK_PI);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_TRYLOCK_PI\"}, ", FUTEX_TRYLOCK_PI);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_WAIT_BITSET\"}, ", FUTEX_WAIT_BITSET);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_WAKE_BITSET\"}, ", FUTEX_WAKE_BITSET);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_WAIT_REQUEUE_PI\"}, ", FUTEX_WAIT_REQUEUE_PI);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_CMP_REQUEUE_PI\"}, ", FUTEX_CMP_REQUEUE_PI);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"{%d, \"FUTEX_LOCK_PI2\"}),", FUTEX_LOCK_PI2);
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" (REC->op & %d) ? \"|FUTEX_PRIVATE_FLAG\" : \"\",",
+			FUTEX_PRIVATE_FLAG);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" (REC->op & %d) ? \"|FUTEX_CLOCK_REALTIME\" : \"\",",
+			FUTEX_CLOCK_REALTIME);
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->val, REC->utime,");
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->uaddr, REC->val3");
+	return pos;
+}
+
 static int __init
 __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 {
@@ -447,6 +635,8 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	switch (entry->syscall_nr) {
 	case __NR_openat:
 		return sys_enter_openat_print_fmt(entry, buf, len);
+	case __NR_futex:
+		return sys_enter_futex_print_fmt(entry, buf, len);
 	default:
 		break;
 	}
@@ -523,6 +713,21 @@ static void __init free_syscall_print_fmt(struct trace_event_call *call)
 		kfree(call->print_fmt);
 }
 
+static int __init futex_fields(struct trace_event_call *call, int offset)
+{
+	char *arg;
+	int ret;
+
+	arg = kstrdup("__value", GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg))
+		return -ENOMEM;
+	ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0,
+				 FILTER_OTHER);
+	if (ret)
+		kfree(arg);
+	return ret;
+}
+
 static int __init syscall_enter_define_fields(struct trace_event_call *call)
 {
 	struct syscall_trace_enter trace;
@@ -544,6 +749,9 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
 		offset += sizeof(unsigned long);
 	}
 
+	if (!ret && meta->syscall_nr == __NR_futex)
+		return futex_fields(call, offset);
+
 	if (ret || !meta->user_mask)
 		return ret;
 
@@ -689,6 +897,48 @@ static int syscall_copy_user_array(char *buf, const char __user *ptr,
 	return 0;
 }
 
+static int
+syscall_get_futex(unsigned long *args, char **buffer, int *size, int buf_size)
+{
+	struct syscall_user_buffer *sbuf;
+	const char __user *ptr;
+	char *buf;
+
+	/* buf_size of zero means user doesn't want user space read */
+	if (!buf_size)
+		return -1;
+
+	/* If the syscall_buffer is NULL, tracing is being shutdown */
+	sbuf = READ_ONCE(syscall_buffer);
+	if (!sbuf)
+		return -1;
+
+	ptr = (char __user *)args[0];
+
+	*buffer = trace_user_fault_read(&sbuf->buf, ptr, 4, NULL, NULL);
+	if (!*buffer)
+		return -1;
+
+	/* Add room for the value */
+	*size += 4;
+
+	buf = *buffer;
+
+	return 0;
+}
+
+static void syscall_put_futex(struct syscall_metadata *sys_data,
+			      struct syscall_trace_enter *entry,
+			      char *buffer)
+{
+	u32 *ptr;
+
+	/* Place the futex key into the storage */
+	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+
+	*ptr = *(u32 *)buffer;
+}
+
 static char *sys_fault_user(unsigned int buf_size,
 			    struct syscall_metadata *sys_data,
 			    struct syscall_user_buffer *sbuf,
@@ -905,6 +1155,9 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 		if (syscall_get_data(sys_data, args, &user_ptr,
 				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
 			return;
+	} else if (syscall_nr == __NR_futex) {
+		if (syscall_get_futex(args, &user_ptr, &size, tr->syscall_buf_sz) < 0)
+			return;
 	}
 
 	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
@@ -921,6 +1174,9 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	if (mayfault)
 		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);
 
+	else if (syscall_nr == __NR_futex)
+		syscall_put_futex(sys_data, entry, user_ptr);
+
 	trace_event_buffer_commit(&fbuffer);
 }
 
@@ -971,14 +1227,18 @@ static int reg_event_syscall_enter(struct trace_event_file *file,
 {
 	struct syscall_metadata *sys_data = call->data;
 	struct trace_array *tr = file->tr;
+	bool enable_faults;
 	int ret = 0;
 	int num;
 
 	num = sys_data->syscall_nr;
 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
+
+	enable_faults = sys_data->user_mask || num == __NR_futex;
+
 	guard(mutex)(&syscall_trace_lock);
-	if (sys_data->user_mask) {
+	if (enable_faults) {
 		ret = syscall_fault_buffer_enable();
 		if (ret < 0)
 			return ret;
@@ -986,7 +1246,7 @@ static int reg_event_syscall_enter(struct trace_event_file *file,
 	if (!tr->sys_refcount_enter) {
 		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
 		if (ret < 0) {
-			if (sys_data->user_mask)
+			if (enable_faults)
 				syscall_fault_buffer_disable();
 			return ret;
 		}
@@ -1011,7 +1271,7 @@ static void unreg_event_syscall_enter(struct trace_event_file *file,
 	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
 	if (!tr->sys_refcount_enter)
 		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
-	if (sys_data->user_mask)
+	if (sys_data->user_mask || num == __NR_futex)
 		syscall_fault_buffer_disable();
 }
 
-- 
2.51.0



^ permalink raw reply related

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Gregory Price @ 2026-03-03 20:36 UTC (permalink / raw)
  To: Alistair Popple
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm, david,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman
In-Reply-To: <a6izpi2wlqro72erhbvxhlx2lwdnae7my3ghfs6t33ivtixo4h@bi2u4x6qv7ul>

On Thu, Feb 26, 2026 at 02:27:24PM +1100, Alistair Popple wrote:
> On 2026-02-25 at 02:17 +1100, Gregory Price <gourry@gourry.net> wrote...
> > 
> > If your service only allocates movable pages - your ZONE_NORMAL is
> > effectively ZONE_MOVABLE.  
> 
> This is interesting - it sounds like the conclusion of this is ZONE_* is just a
> bad abstraction and should be replaced with something else, maybe something like this?
> 
> And FWIW I'm not tied to the ZONE_DEVICE as being a good abstraction, it's just
> what we seem to have today for determining page types. It almost sounds like what
> we want is just a bunch of hooks that can be associated with a range of pages,
> and then you just get rid of ZONE_DEVICE and instead install hooks appropriate
> for each page a driver manages. I have to think more about that though, this
> is just what popped into my head when you start saying ZONE_MOVABLE could also
> disappear :-)
> 
... snip ...
> > 
> > You don't have to squint because it was deliberate :]
> 
> Nice.
> 

I've had some time to chew on this a bit more.

Adding a node-scope `struct dev_pagemap` produces some interesting
(arguably useful / valuable) effects.

The invariant would be clamping the entire node to ZONE_DEVICE
(more on this below).

So if we think about it this way - we could just view this whole thing
as another variant of ZONE_DEVICE - but without needing the memremap
infrastructure (you can use normal hotplug to achieve it).

0. pgdat->private becomes pgdat->dev_pagemap
   N_MEMORY_PRIVATE -> N_MEMORY_DEVICE ?

   As a start, do a direct conversion, and use the existing
   infrastructure.  Then expand hooks as needed (and as is reasonable).

   Some of the `struct dev_pagemap {}` fields become dead at the node
   scope, but this is a plumbing issue.

   There's already a similar split between the dev_pagemap and the ops
   structure, so it might map very cleanly.

1. "Clamping the entire node to ZONE_DEVICE"

   When we do this, the *actual* ZONE becomes completely irrelevant.
   The allocation path is entirely controlled, so you might actually end
   up freeing up the folio flags that track the zone:

   static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags)
   {
        ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT);
        return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK;
   }

   becomes:

   folio_is_zone_device(folio) {
       return node_is_device_node(folio_nid(folio)) || 
              memdesc_is_zone_device(folio->flags);
   }

   Kind of interesting.  You still need these flags for traditional
   ZONE_DEVICE, so you can't evict it completely, but you can start to
   see a path here.

2. One dev_pagemap per node or multiple w/ pagemap range searching

   Checking membership is always cheap: 

        node_is_device_node()

   Getting ops can be cheap if a 1:1 mapping exists:

       pgdat->device_ops->callback()

   Or may be expensive if range-based matching is required:

      node_device_op(folio, ...) {
         ops = node_ops_lookup(folio); /* pfn-range binary search */
	 ops->callback(folio, ...)
      }

      pgmap already has an embedded range:

      struct dev_pagemap {
        ...
        int nr_range;
        union {
            struct range range;
            DECLARE_FLEX_ARRAY(struct range, ranges);
        };
      };

   Example: Nouveau, registers hundreds of pgmap instances that it
            uses to recover driver contexts for that specific folio.

	    This would not scale well.

	    But most other drivers register between 1-8.  That might.

   That means this might actually be an effective way to evict pgmap
   from struct folio / struct page.  (Not making this a requirement or
   saying it's reasonable, just an interesting observation).

3. Some existing drivers with 1 pgmap per driver instance instantly get
   the folio->lru field back - even if they continue to use ZONE_DEVICE.

   At least 3 drivers use page->zone_device_data as a page freelist
   rather than actual per-page data.  Those drivers could just start
   using folio/page->lru instead.

   Some store actual per-page zone_device_data that would prevent this,
   but from poking around it seems like it might be feasible.

   Some use the pgmap as a container_of() argument to get driver
   context, may or may not be supportable out of the box, but it seemed
   like mild refactoring might get them back the use of folio->lru.

   None of this is required, the goal is explicitly not disrupting any
   current users of ZONE_DEVICE.

Just some additional food for thought.

As-designed now, this would only apply to NUMA systems, meaning you
can't fully evict pgmap from struct page/folio --- but you could
imagine a world where in non-numa mode we even register a separate
pglist_data specifically for device memory even w/o NUMA.

~Gregory 

^ permalink raw reply

* [syzbot] [bpf?] [trace?] KASAN: slab-use-after-free Read in bpf_trace_run9
From: syzbot @ 2026-03-03 18:24 UTC (permalink / raw)
  To: andrii, ast, bpf, daniel, eddyz87, haoluo, john.fastabend, jolsa,
	kpsingh, linux-kernel, linux-trace-kernel, martin.lau,
	mathieu.desnoyers, mattbobrowski, mhiramat, rostedt, sdf, song,
	syzkaller-bugs, yonghong.song

Hello,

syzbot found the following issue on:

HEAD commit:    a75cb869a8cc Merge tag 'v7.0-rc1-ksmbd-server-fixes' of gi..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=17cb50ba580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=56150637ffd942dd
dashboard link: https://syzkaller.appspot.com/bug?extid=b4c5ad098c821bf8d8bc
compiler:       Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=122c6006580000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1429cd5a580000

Downloadable assets:
disk image (non-bootable): https://storage.googleapis.com/syzbot-assets/d900f083ada3/non_bootable_disk-a75cb869.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/dadc7741a2af/vmlinux-a75cb869.xz
kernel image: https://storage.googleapis.com/syzbot-assets/7d84dbd51e35/bzImage-a75cb869.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+b4c5ad098c821bf8d8bc@syzkaller.appspotmail.com

==================================================================
BUG: KASAN: slab-use-after-free in __bpf_trace_run kernel/trace/bpf_trace.c:2075 [inline]
BUG: KASAN: slab-use-after-free in bpf_trace_run9+0x13b/0x8c0 kernel/trace/bpf_trace.c:2136
Read of size 8 at addr ffff888039269618 by task syz.5.56/5665

CPU: 0 UID: 0 PID: 5665 Comm: syz.5.56 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
 print_address_description mm/kasan/report.c:378 [inline]
 print_report+0xba/0x230 mm/kasan/report.c:482
 kasan_report+0x117/0x150 mm/kasan/report.c:595
 __bpf_trace_run kernel/trace/bpf_trace.c:2075 [inline]
 bpf_trace_run9+0x13b/0x8c0 kernel/trace/bpf_trace.c:2136
 __bpf_trace_virtio_transport_alloc_pkt+0x3a5/0x410 include/trace/events/vsock_virtio_transport_common.h:39
 __traceiter_virtio_transport_alloc_pkt+0xc1/0x120 include/trace/events/vsock_virtio_transport_common.h:39
 __do_trace_virtio_transport_alloc_pkt include/trace/events/vsock_virtio_transport_common.h:39 [inline]
 trace_virtio_transport_alloc_pkt include/trace/events/vsock_virtio_transport_common.h:39 [inline]
 virtio_transport_alloc_skb+0x1108/0x1180 net/vmw_vsock/virtio_transport_common.c:312
 virtio_transport_send_pkt_info+0x570/0xff0 net/vmw_vsock/virtio_transport_common.c:391
 virtio_transport_connect+0xf5/0x150 net/vmw_vsock/virtio_transport_common.c:1080
 vsock_connect+0xaf5/0xd60 net/vmw_vsock/af_vsock.c:1716
 __sys_connect_file net/socket.c:2089 [inline]
 __sys_connect+0x312/0x450 net/socket.c:2108
 __do_sys_connect net/socket.c:2114 [inline]
 __se_sys_connect net/socket.c:2111 [inline]
 __x64_sys_connect+0x7a/0x90 net/socket.c:2111
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fd65079c799
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffd52be1438 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
RAX: ffffffffffffffda RBX: 00007fd650a15fa0 RCX: 00007fd65079c799
RDX: 0000000000000010 RSI: 0000200000000080 RDI: 0000000000000003
RBP: 00007fd650832bd9 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fd650a15fac R14: 00007fd650a15fa0 R15: 00007fd650a15fa0
 </TASK>

Allocated by task 5664:
 kasan_save_stack mm/kasan/common.c:57 [inline]
 kasan_save_track+0x3e/0x80 mm/kasan/common.c:78
 poison_kmalloc_redzone mm/kasan/common.c:398 [inline]
 __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:415
 kasan_kmalloc include/linux/kasan.h:263 [inline]
 __kmalloc_cache_noprof+0x31c/0x660 mm/slub.c:5339
 kmalloc_noprof include/linux/slab.h:962 [inline]
 kzalloc_noprof include/linux/slab.h:1200 [inline]
 bpf_raw_tp_link_attach+0x278/0x700 kernel/bpf/syscall.c:4264
 bpf_raw_tracepoint_open+0x1b2/0x220 kernel/bpf/syscall.c:4312
 __sys_bpf+0x846/0x950 kernel/bpf/syscall.c:6270
 __do_sys_bpf kernel/bpf/syscall.c:6341 [inline]
 __se_sys_bpf kernel/bpf/syscall.c:6339 [inline]
 __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:6339
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Freed by task 5576:
 kasan_save_stack mm/kasan/common.c:57 [inline]
 kasan_save_track+0x3e/0x80 mm/kasan/common.c:78
 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584
 poison_slab_object mm/kasan/common.c:253 [inline]
 __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:285
 kasan_slab_free include/linux/kasan.h:235 [inline]
 slab_free_hook mm/slub.c:2687 [inline]
 slab_free mm/slub.c:6124 [inline]
 kfree+0x1c1/0x630 mm/slub.c:6442
 rcu_do_batch kernel/rcu/tree.c:2617 [inline]
 rcu_core+0x7cd/0x1070 kernel/rcu/tree.c:2869
 handle_softirqs+0x22a/0x870 kernel/softirq.c:622
 do_softirq+0x76/0xd0 kernel/softirq.c:523
 __local_bh_enable_ip+0xf8/0x130 kernel/softirq.c:450
 local_bh_enable include/linux/bottom_half.h:33 [inline]
 __alloc_skb+0x1aa/0x7d0 net/core/skbuff.c:697
 alloc_skb include/linux/skbuff.h:1383 [inline]
 mld_newpack+0x14c/0xc90 net/ipv6/mcast.c:1775
 add_grhead+0x5a/0x2a0 net/ipv6/mcast.c:1886
 add_grec+0x1452/0x1740 net/ipv6/mcast.c:2025
 mld_send_initial_cr+0x288/0x550 net/ipv6/mcast.c:2268
 mld_dad_work+0x45/0x5b0 net/ipv6/mcast.c:2294
 process_one_work kernel/workqueue.c:3275 [inline]
 process_scheduled_works+0xb02/0x1830 kernel/workqueue.c:3358
 worker_thread+0xa50/0xfc0 kernel/workqueue.c:3439
 kthread+0x388/0x470 kernel/kthread.c:467
 ret_from_fork+0x51e/0xb90 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

Last potentially related work creation:
 kasan_save_stack+0x3e/0x60 mm/kasan/common.c:57
 kasan_record_aux_stack+0xbd/0xd0 mm/kasan/generic.c:556
 __call_rcu_common kernel/rcu/tree.c:3131 [inline]
 call_rcu+0xee/0x890 kernel/rcu/tree.c:3251
 bpf_link_put_direct kernel/bpf/syscall.c:3323 [inline]
 bpf_link_release+0x6b/0x80 kernel/bpf/syscall.c:3330
 __fput+0x44f/0xa70 fs/file_table.c:469
 task_work_run+0x1d9/0x270 kernel/task_work.c:233
 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
 __exit_to_user_mode_loop kernel/entry/common.c:67 [inline]
 exit_to_user_mode_loop+0xed/0x480 kernel/entry/common.c:98
 __exit_to_user_mode_prepare include/linux/irq-entry-common.h:226 [inline]
 syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:256 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:325 [inline]
 do_syscall_64+0x32d/0xf80 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

The buggy address belongs to the object at ffff888039269600
 which belongs to the cache kmalloc-192 of size 192
The buggy address is located 24 bytes inside of
 freed 192-byte region [ffff888039269600, ffff8880392696c0)

The buggy address belongs to the physical page:
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff888039269a00 pfn:0x39269
flags: 0x4fff00000000200(workingset|node=1|zone=1|lastcpupid=0x7ff)
page_type: f5(slab)
raw: 04fff00000000200 ffff88801ac413c0 ffffea0001646350 ffffea00015fe190
raw: ffff888039269a00 000000000010000b 00000000f5000000 0000000000000000
page dumped because: kasan: bad access detected
page_owner tracks the page as allocated
page last allocated via order 0, migratetype Unmovable, gfp_mask 0x1d2cc0(GFP_USER|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 5643, tgid 5643 (syz.5.37), ts 117716289519, free_ts 117714764185
 set_page_owner include/linux/page_owner.h:32 [inline]
 post_alloc_hook+0x231/0x280 mm/page_alloc.c:1889
 prep_new_page mm/page_alloc.c:1897 [inline]
 get_page_from_freelist+0x24dc/0x2580 mm/page_alloc.c:3962
 __alloc_frozen_pages_noprof+0x18d/0x380 mm/page_alloc.c:5250
 alloc_slab_page mm/slub.c:3255 [inline]
 allocate_slab+0x77/0x660 mm/slub.c:3444
 new_slab mm/slub.c:3502 [inline]
 refill_objects+0x331/0x3c0 mm/slub.c:7134
 refill_sheaf mm/slub.c:2804 [inline]
 __pcs_replace_empty_main+0x2b9/0x620 mm/slub.c:4578
 alloc_from_pcs mm/slub.c:4681 [inline]
 slab_alloc_node mm/slub.c:4815 [inline]
 __kmalloc_cache_noprof+0x392/0x660 mm/slub.c:5334
 kmalloc_noprof include/linux/slab.h:962 [inline]
 kzalloc_noprof include/linux/slab.h:1200 [inline]
 bpf_raw_tp_link_attach+0x278/0x700 kernel/bpf/syscall.c:4264
 bpf_raw_tracepoint_open+0x1b2/0x220 kernel/bpf/syscall.c:4312
 __sys_bpf+0x846/0x950 kernel/bpf/syscall.c:6270
 __do_sys_bpf kernel/bpf/syscall.c:6341 [inline]
 __se_sys_bpf kernel/bpf/syscall.c:6339 [inline]
 __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:6339
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
page last free pid 15 tgid 15 stack trace:
 reset_page_owner include/linux/page_owner.h:25 [inline]
 __free_pages_prepare mm/page_alloc.c:1433 [inline]
 __free_frozen_pages+0xc2b/0xdb0 mm/page_alloc.c:2978
 __tlb_remove_table_free mm/mmu_gather.c:228 [inline]
 tlb_remove_table_rcu+0x85/0x100 mm/mmu_gather.c:291
 rcu_do_batch kernel/rcu/tree.c:2617 [inline]
 rcu_core+0x7cd/0x1070 kernel/rcu/tree.c:2869
 handle_softirqs+0x22a/0x870 kernel/softirq.c:622
 run_ksoftirqd+0x36/0x60 kernel/softirq.c:1063
 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160
 kthread+0x388/0x470 kernel/kthread.c:467
 ret_from_fork+0x51e/0xb90 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245

Memory state around the buggy address:
 ffff888039269500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff888039269580: 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff888039269600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                            ^
 ffff888039269680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
 ffff888039269700: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want syzbot to run the reproducer, reply with:
#syz test: git://repo/address.git branch-or-commit-hash
If you attach or paste a git patch, syzbot will apply it before testing.

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* Re: [PATCH v2 105/110] security: replace PRIino with %llu/%llx format strings
From: Casey Schaufler @ 2026-03-03 18:06 UTC (permalink / raw)
  To: Jeff Layton, Alexander Viro, Christian Brauner, Jan Kara,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, Dan Williams,
	Matthew Wilcox, Eric Biggers, Theodore Y. Ts'o, Muchun Song,
	Oscar Salvador, David Hildenbrand, David Howells, Paulo Alcantara,
	Andreas Dilger, Jan Kara, Jaegeuk Kim, Chao Yu, Trond Myklebust,
	Anna Schumaker, Chuck Lever, NeilBrown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, Steve French, Ronnie Sahlberg,
	Shyam Prasad N, Bharath SM, Alexander Aring, Ryusuke Konishi,
	Viacheslav Dubeyko, Eric Van Hensbergen, Latchesar Ionkov,
	Dominique Martinet, Christian Schoenebeck, David Sterba,
	Marc Dionne, Ian Kent, Luis de Bethencourt, Salah Triki,
	Tigran A. Aivazian, Ilya Dryomov, Alex Markuze, Jan Harkes, coda,
	Nicolas Pitre, Tyler Hicks, Amir Goldstein, Christoph Hellwig,
	John Paul Adrian Glaubitz, Yangtao Li, Mikulas Patocka,
	David Woodhouse, Richard Weinberger, Dave Kleikamp,
	Konstantin Komarov, Mark Fasheh, Joel Becker, Joseph Qi,
	Mike Marshall, Martin Brandenburg, Miklos Szeredi, Anders Larsen,
	Zhihao Cheng, Damien Le Moal, Naohiro Aota, Johannes Thumshirn,
	John Johansen, Paul Moore, James Morris, Serge E. Hallyn,
	Mimi Zohar, Roberto Sassu, Dmitry Kasatkin, Eric Snowberg, Fan Wu,
	Stephen Smalley, Ondrej Mosnacek, Alex Deucher,
	Christian König, David Airlie, Simona Vetter, Sumit Semwal,
	Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	David S. Miller, Jakub Kicinski, Simon Horman, Oleg Nesterov,
	Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, James Clark, Darrick J. Wong,
	Martin Schiller, Eric Paris, Joerg Reuter, Marcel Holtmann,
	Johan Hedberg, Luiz Augusto von Dentz, Oliver Hartkopp,
	Marc Kleine-Budde, David Ahern, Neal Cardwell, Steffen Klassert,
	Herbert Xu, Remi Denis-Courmont, Marcelo Ricardo Leitner,
	Xin Long, Magnus Karlsson, Maciej Fijalkowski, Stanislav Fomichev,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend
  Cc: linux-fsdevel, linux-kernel, linux-trace-kernel, nvdimm, fsverity,
	linux-mm, netfs, linux-ext4, linux-f2fs-devel, linux-nfs,
	linux-cifs, samba-technical, linux-nilfs, v9fs, linux-afs, autofs,
	ceph-devel, codalist, ecryptfs, linux-mtd, jfs-discussion, ntfs3,
	ocfs2-devel, devel, linux-unionfs, apparmor,
	linux-security-module, linux-integrity, selinux, amd-gfx,
	dri-devel, linux-media, linaro-mm-sig, netdev, linux-perf-users,
	linux-fscrypt, linux-xfs, linux-hams, linux-x25, audit,
	linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <20260302-iino-u64-v2-105-e5388800dae0@kernel.org>

On 3/2/2026 12:25 PM, Jeff Layton wrote:
> Now that i_ino is u64 and the PRIino format macro has been removed,
> replace all uses in security with the concrete format strings.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>

For the security/smack changes:

Acked-by: Casey Schaufler <casey@schaufler-ca.com>

> ---
>  security/apparmor/apparmorfs.c       |  4 ++--
>  security/integrity/integrity_audit.c |  2 +-
>  security/ipe/audit.c                 |  2 +-
>  security/lsm_audit.c                 | 10 +++++-----
>  security/selinux/hooks.c             | 10 +++++-----
>  security/smack/smack_lsm.c           | 12 ++++++------
>  6 files changed, 20 insertions(+), 20 deletions(-)
>
> diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
> index be343479f80b71566be6fda90fc4e00912faad63..7b645f40e71c956f216fa6a7d69c3ecd4e2a5ff4 100644
> --- a/security/apparmor/apparmorfs.c
> +++ b/security/apparmor/apparmorfs.c
> @@ -149,7 +149,7 @@ static int aafs_count;
>  
>  static int aafs_show_path(struct seq_file *seq, struct dentry *dentry)
>  {
> -	seq_printf(seq, "%s:[%" PRIino "u]", AAFS_NAME, d_inode(dentry)->i_ino);
> +	seq_printf(seq, "%s:[%llu]", AAFS_NAME, d_inode(dentry)->i_ino);
>  	return 0;
>  }
>  
> @@ -2644,7 +2644,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer,
>  	char name[32];
>  	int res;
>  
> -	res = snprintf(name, sizeof(name), "%s:[%" PRIino "u]", AAFS_NAME,
> +	res = snprintf(name, sizeof(name), "%s:[%llu]", AAFS_NAME,
>  		       d_inode(dentry)->i_ino);
>  	if (res > 0 && res < sizeof(name))
>  		res = readlink_copy(buffer, buflen, name, strlen(name));
> diff --git a/security/integrity/integrity_audit.c b/security/integrity/integrity_audit.c
> index d28dac23a4e7cf651856b80ab7756d250187ccde..d8d9e5ff1cd22b091f462d1e83d28d2d6bd983e9 100644
> --- a/security/integrity/integrity_audit.c
> +++ b/security/integrity/integrity_audit.c
> @@ -62,7 +62,7 @@ void integrity_audit_message(int audit_msgno, struct inode *inode,
>  	if (inode) {
>  		audit_log_format(ab, " dev=");
>  		audit_log_untrustedstring(ab, inode->i_sb->s_id);
> -		audit_log_format(ab, " ino=%" PRIino "u", inode->i_ino);
> +		audit_log_format(ab, " ino=%llu", inode->i_ino);
>  	}
>  	audit_log_format(ab, " res=%d errno=%d", !result, errno);
>  	audit_log_end(ab);
> diff --git a/security/ipe/audit.c b/security/ipe/audit.c
> index 0de95dd4fbea15d4d913fc42e197c3120a9d24a0..93fb59fbddd60b56c0b22be2a38b809ef9e18b76 100644
> --- a/security/ipe/audit.c
> +++ b/security/ipe/audit.c
> @@ -153,7 +153,7 @@ void ipe_audit_match(const struct ipe_eval_ctx *const ctx,
>  		if (inode) {
>  			audit_log_format(ab, " dev=");
>  			audit_log_untrustedstring(ab, inode->i_sb->s_id);
> -			audit_log_format(ab, " ino=%" PRIino "u", inode->i_ino);
> +			audit_log_format(ab, " ino=%llu", inode->i_ino);
>  		} else {
>  			audit_log_format(ab, " dev=? ino=?");
>  		}
> diff --git a/security/lsm_audit.c b/security/lsm_audit.c
> index 523f2ee116f0f928003aec30a105d6d4ecb49b0b..737f5a263a8f79416133315edf363ece3d79c722 100644
> --- a/security/lsm_audit.c
> +++ b/security/lsm_audit.c
> @@ -202,7 +202,7 @@ void audit_log_lsm_data(struct audit_buffer *ab,
>  		if (inode) {
>  			audit_log_format(ab, " dev=");
>  			audit_log_untrustedstring(ab, inode->i_sb->s_id);
> -			audit_log_format(ab, " ino=%" PRIino "u", inode->i_ino);
> +			audit_log_format(ab, " ino=%llu", inode->i_ino);
>  		}
>  		break;
>  	}
> @@ -215,7 +215,7 @@ void audit_log_lsm_data(struct audit_buffer *ab,
>  		if (inode) {
>  			audit_log_format(ab, " dev=");
>  			audit_log_untrustedstring(ab, inode->i_sb->s_id);
> -			audit_log_format(ab, " ino=%" PRIino "u", inode->i_ino);
> +			audit_log_format(ab, " ino=%llu", inode->i_ino);
>  		}
>  		break;
>  	}
> @@ -228,7 +228,7 @@ void audit_log_lsm_data(struct audit_buffer *ab,
>  		if (inode) {
>  			audit_log_format(ab, " dev=");
>  			audit_log_untrustedstring(ab, inode->i_sb->s_id);
> -			audit_log_format(ab, " ino=%" PRIino "u", inode->i_ino);
> +			audit_log_format(ab, " ino=%llu", inode->i_ino);
>  		}
>  
>  		audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd);
> @@ -246,7 +246,7 @@ void audit_log_lsm_data(struct audit_buffer *ab,
>  		if (inode) {
>  			audit_log_format(ab, " dev=");
>  			audit_log_untrustedstring(ab, inode->i_sb->s_id);
> -			audit_log_format(ab, " ino=%" PRIino "u", inode->i_ino);
> +			audit_log_format(ab, " ino=%llu", inode->i_ino);
>  		}
>  		break;
>  	}
> @@ -265,7 +265,7 @@ void audit_log_lsm_data(struct audit_buffer *ab,
>  		}
>  		audit_log_format(ab, " dev=");
>  		audit_log_untrustedstring(ab, inode->i_sb->s_id);
> -		audit_log_format(ab, " ino=%" PRIino "u", inode->i_ino);
> +		audit_log_format(ab, " ino=%llu", inode->i_ino);
>  		rcu_read_unlock();
>  		break;
>  	}
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 9430f44c81447708c67ddc35c5b4254f16731b8f..8f38de4d223ea59cfea6bbe73747d7b228e0c33f 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -1400,7 +1400,7 @@ static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry,
>  	if (rc < 0) {
>  		kfree(context);
>  		if (rc != -ENODATA) {
> -			pr_warn("SELinux: %s:  getxattr returned %d for dev=%s ino=%" PRIino "u\n",
> +			pr_warn("SELinux: %s:  getxattr returned %d for dev=%s ino=%llu\n",
>  				__func__, -rc, inode->i_sb->s_id, inode->i_ino);
>  			return rc;
>  		}
> @@ -1412,13 +1412,13 @@ static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry,
>  					     def_sid, GFP_NOFS);
>  	if (rc) {
>  		char *dev = inode->i_sb->s_id;
> -		kino_t ino = inode->i_ino;
> +		u64 ino = inode->i_ino;
>  
>  		if (rc == -EINVAL) {
> -			pr_notice_ratelimited("SELinux: inode=%" PRIino "u on dev=%s was found to have an invalid context=%s.  This indicates you may need to relabel the inode or the filesystem in question.\n",
> +			pr_notice_ratelimited("SELinux: inode=%llu on dev=%s was found to have an invalid context=%s.  This indicates you may need to relabel the inode or the filesystem in question.\n",
>  					      ino, dev, context);
>  		} else {
> -			pr_warn("SELinux: %s:  context_to_sid(%s) returned %d for dev=%s ino=%" PRIino "u\n",
> +			pr_warn("SELinux: %s:  context_to_sid(%s) returned %d for dev=%s ino=%llu\n",
>  				__func__, context, -rc, dev, ino);
>  		}
>  	}
> @@ -3477,7 +3477,7 @@ static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name,
>  					   &newsid);
>  	if (rc) {
>  		pr_err("SELinux:  unable to map context to SID"
> -		       "for (%s, %" PRIino "u), rc=%d\n",
> +		       "for (%s, %llu), rc=%d\n",
>  		       inode->i_sb->s_id, inode->i_ino, -rc);
>  		return;
>  	}
> diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
> index 22b6bd322840c82697c38c07b19a4677e7da2598..2eb3368a3632b836df54ba8628c16f7215ddf3ea 100644
> --- a/security/smack/smack_lsm.c
> +++ b/security/smack/smack_lsm.c
> @@ -182,7 +182,7 @@ static int smk_bu_inode(struct inode *inode, int mode, int rc)
>  	char acc[SMK_NUM_ACCESS_TYPE + 1];
>  
>  	if (isp->smk_flags & SMK_INODE_IMPURE)
> -		pr_info("Smack Unconfined Corruption: inode=(%s %" PRIino "u) %s\n",
> +		pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n",
>  			inode->i_sb->s_id, inode->i_ino, current->comm);
>  
>  	if (rc <= 0)
> @@ -195,7 +195,7 @@ static int smk_bu_inode(struct inode *inode, int mode, int rc)
>  
>  	smk_bu_mode(mode, acc);
>  
> -	pr_info("Smack %s: (%s %s %s) inode=(%s %" PRIino "u) %s\n", smk_bu_mess[rc],
> +	pr_info("Smack %s: (%s %s %s) inode=(%s %llu) %s\n", smk_bu_mess[rc],
>  		tsp->smk_task->smk_known, isp->smk_inode->smk_known, acc,
>  		inode->i_sb->s_id, inode->i_ino, current->comm);
>  	return 0;
> @@ -214,7 +214,7 @@ static int smk_bu_file(struct file *file, int mode, int rc)
>  	char acc[SMK_NUM_ACCESS_TYPE + 1];
>  
>  	if (isp->smk_flags & SMK_INODE_IMPURE)
> -		pr_info("Smack Unconfined Corruption: inode=(%s %" PRIino "u) %s\n",
> +		pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n",
>  			inode->i_sb->s_id, inode->i_ino, current->comm);
>  
>  	if (rc <= 0)
> @@ -223,7 +223,7 @@ static int smk_bu_file(struct file *file, int mode, int rc)
>  		rc = 0;
>  
>  	smk_bu_mode(mode, acc);
> -	pr_info("Smack %s: (%s %s %s) file=(%s %" PRIino "u %pD) %s\n", smk_bu_mess[rc],
> +	pr_info("Smack %s: (%s %s %s) file=(%s %llu %pD) %s\n", smk_bu_mess[rc],
>  		sskp->smk_known, smk_of_inode(inode)->smk_known, acc,
>  		inode->i_sb->s_id, inode->i_ino, file,
>  		current->comm);
> @@ -244,7 +244,7 @@ static int smk_bu_credfile(const struct cred *cred, struct file *file,
>  	char acc[SMK_NUM_ACCESS_TYPE + 1];
>  
>  	if (isp->smk_flags & SMK_INODE_IMPURE)
> -		pr_info("Smack Unconfined Corruption: inode=(%s %" PRIino "u) %s\n",
> +		pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n",
>  			inode->i_sb->s_id, inode->i_ino, current->comm);
>  
>  	if (rc <= 0)
> @@ -253,7 +253,7 @@ static int smk_bu_credfile(const struct cred *cred, struct file *file,
>  		rc = 0;
>  
>  	smk_bu_mode(mode, acc);
> -	pr_info("Smack %s: (%s %s %s) file=(%s %" PRIino "u %pD) %s\n", smk_bu_mess[rc],
> +	pr_info("Smack %s: (%s %s %s) file=(%s %llu %pD) %s\n", smk_bu_mess[rc],
>  		sskp->smk_known, smk_of_inode(inode)->smk_known, acc,
>  		inode->i_sb->s_id, inode->i_ino, file,
>  		current->comm);
>

^ permalink raw reply

* Re: [PATCH v2 001/110] vfs: introduce kino_t typedef and PRIino format macro
From: Matthew Wilcox @ 2026-03-03 17:18 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Christoph Hellwig, Darrick J. Wong, Theodore Tso, Alexander Viro,
	Christian Brauner, Jan Kara, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Dan Williams, Eric Biggers, Muchun Song,
	Oscar Salvador, David Hildenbrand, David Howells, Paulo Alcantara,
	Andreas Dilger, Jan Kara, Jaegeuk Kim, Chao Yu, Trond Myklebust,
	Anna Schumaker, Chuck Lever, NeilBrown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, Steve French, Ronnie Sahlberg,
	Shyam Prasad N, Bharath SM, Alexander Aring, Ryusuke Konishi,
	Viacheslav Dubeyko, Eric Van Hensbergen, Latchesar Ionkov,
	Dominique Martinet, Christian Schoenebeck, David Sterba,
	Marc Dionne, Ian Kent, Luis de Bethencourt, Salah Triki,
	Tigran A. Aivazian, Ilya Dryomov, Alex Markuze, Jan Harkes, coda,
	Nicolas Pitre, Tyler Hicks, Amir Goldstein,
	John Paul Adrian Glaubitz, Yangtao Li, Mikulas Patocka,
	David Woodhouse, Richard Weinberger, Dave Kleikamp,
	Konstantin Komarov, Mark Fasheh, Joel Becker, Joseph Qi,
	Mike Marshall, Martin Brandenburg, Miklos Szeredi, Anders Larsen,
	Zhihao Cheng, Damien Le Moal, Naohiro Aota, Johannes Thumshirn,
	John Johansen, Paul Moore, James Morris, Serge E. Hallyn,
	Mimi Zohar, Roberto Sassu, Dmitry Kasatkin, Eric Snowberg, Fan Wu,
	Stephen Smalley, Ondrej Mosnacek, Casey Schaufler, Alex Deucher,
	Christian König, David Airlie, Simona Vetter, Sumit Semwal,
	Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	David S. Miller, Jakub Kicinski, Simon Horman, Oleg Nesterov,
	Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, James Clark, Martin Schiller,
	Eric Paris, Joerg Reuter, Marcel Holtmann, Johan Hedberg,
	Luiz Augusto von Dentz, Oliver Hartkopp, Marc Kleine-Budde,
	David Ahern, Neal Cardwell, Steffen Klassert, Herbert Xu,
	Remi Denis-Courmont, Marcelo Ricardo Leitner, Xin Long,
	Magnus Karlsson, Maciej Fijalkowski, Stanislav Fomichev,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend, linux-fsdevel, linux-kernel, linux-trace-kernel,
	nvdimm, fsverity, linux-mm, netfs, linux-ext4, linux-f2fs-devel,
	linux-nfs, linux-cifs, samba-technical, linux-nilfs, v9fs,
	linux-afs, autofs, ceph-devel, codalist, ecryptfs, linux-mtd,
	jfs-discussion, ntfs3, ocfs2-devel, devel, linux-unionfs,
	apparmor, linux-security-module, linux-integrity, selinux,
	amd-gfx, dri-devel, linux-media, linaro-mm-sig, netdev,
	linux-perf-users, linux-fscrypt, linux-xfs, linux-hams, linux-x25,
	audit, linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <1310fc5c09cce52ec00344b936275fe584c88dea.camel@kernel.org>

On Tue, Mar 03, 2026 at 09:19:42AM -0500, Jeff Layton wrote:
> There are really only three options here:
> 
> 1/ Do (almost) all of the changes in one giant patch
> 
> 2/ Accept that the build may break during the interim stages
> 
> 3/ This series: using a typedef and macro to work around the breakage
> until the type can be changed, at the expense of some extra churn in
> the codebase

4/ Don't do anything, drop the patch series (I'm not in favour of this,
but it is an option)

5/ Do the conversion(s) _once_ per filesystem.  Here's one way to do
it:

-	unsigned long           i_ino;
+	union {
+		u64 i_ino64;
+		struct {
+#if defined(CONFIG_64BIT)
+			unsigned long i_ino;
+#elif defined(CONFIG_CPU_BIG_ENDIAN)
+			unsigned long i_ino;
+			unsigned long i_ino_pad;
+#else
+			unsigned long i_ino_pad;
+			unsigned long i_ino;
+#endif
+		};
+	};

[...]
#define i_ino(inode)	(inode)->i_ino64

So that's patch one.  All plain references to i_ino access the lower
bits of i_ino64, so everything will continue to work as it does today.

Once you've got the VFS core in shape, you can convert filesystems one
at a time to use i_ino(inode).  Once you're done you can delete the
scaffolding from the core and go back to calling i_ino64 just i_ino.
You could delete the i_ino() uses from filesystems at that point, but
why bother?

I'm sure there are other ways to do it, this is just the one I came up
with.  But for the love of god stop spamming hundreds of people on the
cc of this patchset.  In fact, take me off for next time -- I get each
one of these fucking patches four times.

^ permalink raw reply

* [PATCH] tracing: Fix failure to read user space from system call trace events
From: Steven Rostedt @ 2026-03-03 17:04 UTC (permalink / raw)
  To: LKML, Linux Trace Kernel; +Cc: Masami Hiramatsu, Mathieu Desnoyers

From: Steven Rostedt <rostedt@goodmis.org>

The system call trace events call trace_user_fault_read() to read the user
space part of some system calls. This is done by grabbing a per-cpu
buffer, disabling migration, enabling preemption, calling
copy_from_user(), disabling preemption, enabling migration and checking if
the task was preempted while preemption was enabled. If it was, the buffer
is considered corrupted and it tries again.

There's a safety mechanism that will fail out of this loop if it fails 100
times (with a warning). That warning message was triggered in some
pi_futex stress tests. Enabling the sched_switch trace event and
traceoff_on_warning showed the problem:

 pi_mutex_hammer-1375    [006] d..21   138.981648: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981651: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981656: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981659: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981664: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981667: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981671: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981675: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981679: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981682: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981687: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981690: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981695: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981698: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981703: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981706: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981711: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981714: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981719: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981722: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981727: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981730: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95
 pi_mutex_hammer-1375    [006] d..21   138.981735: sched_switch: prev_comm=pi_mutex_hammer prev_pid=1375 prev_prio=95 prev_state=R+ ==> next_comm=migration/6 next_pid=47 next_prio=0
     migration/6-47      [006] d..2.   138.981738: sched_switch: prev_comm=migration/6 prev_pid=47 prev_prio=0 prev_state=S ==> next_comm=pi_mutex_hammer next_pid=1375 next_prio=95

What happened was the task 1375 was flagged to be migrated. When
preemption was enabled, the migration thread woke up to migrate that task,
but failed because migration for that task was disabled. This caused the
loop to fail to exit because the task scheduled out while trying to read
user space.

Every time the task enabled preemption the migration thread would schedule
in, try to migrate the task, fail and let the task continue. But because
the loop would only enable preemption with migration disabled, it would
always fail because each time it enabled preemption to read user space,
the migration thread would try to migrate it.

To solve this, when the loop fails to read user space without being
scheduled out, enable and disable preemption with migration enabled. This
will allow the migration task to successfully migrate the task and the
next loop should succeed in reading user space without being scheduled out.

Cc: stable@vger.kernel.org
Fixes: 64cf7d058a005 ("tracing: Have trace_marker use per-cpu data to read user space")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 359ba2e73f49..cd45a09e0190 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6131,6 +6131,23 @@ char *trace_user_fault_read(struct trace_user_buf_info *tinfo,
 	 */
 
 	do {
+		/*
+		 * It is possible that something is trying to migrate this
+		 * task. What happens then, is when preemption is enabled,
+		 * the migration thread will preempt this task, try to 
+		 * migrate it, fail, then let it run again. That will
+		 * cause this to loop again and never succeed.
+		 * On failures, enabled and disable preemption with
+		 * migration enabled, to allow the migration thread to
+		 * migrate this task.
+		 */
+		if (trys) {
+			preempt_enable_notrace();
+			preempt_disable_notrace();
+			cpu = smp_processor_id();
+			buffer = per_cpu_ptr(tinfo->tbuf, cpu)->buf;
+		}
+
 		/*
 		 * If for some reason, copy_from_user() always causes a context
 		 * switch, this would then cause an infinite loop.
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH v2 003/110] audit: widen ino fields to u64
From: Paul Moore @ 2026-03-03 16:17 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Alexander Viro, Christian Brauner, Jan Kara, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Dan Williams, Matthew Wilcox,
	Eric Biggers, Theodore Y. Ts'o, Muchun Song, Oscar Salvador,
	David Hildenbrand, David Howells, Paulo Alcantara, Andreas Dilger,
	Jan Kara, Jaegeuk Kim, Chao Yu, Trond Myklebust, Anna Schumaker,
	Chuck Lever, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Steve French, Ronnie Sahlberg, Shyam Prasad N, Bharath SM,
	Alexander Aring, Ryusuke Konishi, Viacheslav Dubeyko,
	Eric Van Hensbergen, Latchesar Ionkov, Dominique Martinet,
	Christian Schoenebeck, David Sterba, Marc Dionne, Ian Kent,
	Luis de Bethencourt, Salah Triki, Tigran A. Aivazian,
	Ilya Dryomov, Alex Markuze, Jan Harkes, coda, Nicolas Pitre,
	Tyler Hicks, Amir Goldstein, Christoph Hellwig,
	John Paul Adrian Glaubitz, Yangtao Li, Mikulas Patocka,
	David Woodhouse, Richard Weinberger, Dave Kleikamp,
	Konstantin Komarov, Mark Fasheh, Joel Becker, Joseph Qi,
	Mike Marshall, Martin Brandenburg, Miklos Szeredi, Anders Larsen,
	Zhihao Cheng, Damien Le Moal, Naohiro Aota, Johannes Thumshirn,
	John Johansen, James Morris, Serge E. Hallyn, Mimi Zohar,
	Roberto Sassu, Dmitry Kasatkin, Eric Snowberg, Fan Wu,
	Stephen Smalley, Ondrej Mosnacek, Casey Schaufler, Alex Deucher,
	Christian König, David Airlie, Simona Vetter, Sumit Semwal,
	Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	David S. Miller, Jakub Kicinski, Simon Horman, Oleg Nesterov,
	Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, James Clark, Darrick J. Wong,
	Martin Schiller, Eric Paris, Joerg Reuter, Marcel Holtmann,
	Johan Hedberg, Luiz Augusto von Dentz, Oliver Hartkopp,
	Marc Kleine-Budde, David Ahern, Neal Cardwell, Steffen Klassert,
	Herbert Xu, Remi Denis-Courmont, Marcelo Ricardo Leitner,
	Xin Long, Magnus Karlsson, Maciej Fijalkowski, Stanislav Fomichev,
	Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
	John Fastabend, linux-fsdevel, linux-kernel, linux-trace-kernel,
	nvdimm, fsverity, linux-mm, netfs, linux-ext4, linux-f2fs-devel,
	linux-nfs, linux-cifs, samba-technical, linux-nilfs, v9fs,
	linux-afs, autofs, ceph-devel, codalist, ecryptfs, linux-mtd,
	jfs-discussion, ntfs3, ocfs2-devel, devel, linux-unionfs,
	apparmor, linux-security-module, linux-integrity, selinux,
	amd-gfx, dri-devel, linux-media, linaro-mm-sig, netdev,
	linux-perf-users, linux-fscrypt, linux-xfs, linux-hams, linux-x25,
	audit, linux-bluetooth, linux-can, linux-sctp, bpf
In-Reply-To: <add39953250a4a1b2fe2b09deb3373c2a7482b88.camel@kernel.org>

On Tue, Mar 3, 2026 at 11:12 AM Jeff Layton <jlayton@kernel.org> wrote:
> On Tue, 2026-03-03 at 11:03 -0500, Paul Moore wrote:
> > On Tue, Mar 3, 2026 at 6:05 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > On Mon, 2026-03-02 at 18:44 -0500, Paul Moore wrote:
> > > > On Mon, Mar 2, 2026 at 3:25 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > >
> > > > > inode->i_ino is being widened from unsigned long to u64. The audit
> > > > > subsystem uses unsigned long ino in struct fields, function parameters,
> > > > > and local variables that store inode numbers from arbitrary filesystems.
> > > > > On 32-bit platforms this truncates inode numbers that exceed 32 bits,
> > > > > which will cause incorrect audit log entries and broken watch/mark
> > > > > comparisons.
> > > > >
> > > > > Widen all audit ino fields, parameters, and locals to u64, and update
> > > > > the inode format string from %lu to %llu to match.
> > > > >
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > ---
> > > > >  include/linux/audit.h   | 2 +-
> > > > >  kernel/audit.h          | 9 ++++-----
> > > > >  kernel/audit_fsnotify.c | 4 ++--
> > > > >  kernel/audit_watch.c    | 8 ++++----
> > > > >  kernel/auditsc.c        | 2 +-
> > > > >  5 files changed, 12 insertions(+), 13 deletions(-)
> > > >
> > > > We should also update audit_hash_ino() in kernel/audit.h.  It is a
> > > > *very* basic hash function, so I think leaving the function as-is and
> > > > just changing the inode parameter from u32 to u64 should be fine.
> >
> > ...
> >
> > > It doesn't look like changing the argument type will make any material
> > > difference. Given that it should still work without that change, can we
> > > leave this cleanup for you to do in a follow-on patchset?
> >
> > I would prefer if you made the change as part of the patch, mainly to
> > keep a patch record of this being related.
>
> Ok, I'll see about factoring that in.

Thanks.

> > Ideally I'd really like to see kino_t used in the audit code instead
> > of u64, but perhaps that is done in a later patch that I didn't see.
>
> I think I didn't make this clear enough in the cover letter, but kino_t
> is removed at the end of the series. It's just there to support the
> change during the interim.

Ah, gotcha, thanks for the education :)

> If HCH gets his way to do the changes as one big patch, it'll go away
> entirely.

-- 
paul-moore.com

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox