Linux Trace Kernel
 help / color / mirror / Atom feed
* Re: [PATCH] tracing: Move snapshot code out of trace.c and into trace_snapshot.c
From: Steven Rostedt @ 2026-03-31 12:34 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: kernel test robot, LKML, Linux trace kernel, llvm, oe-kbuild-all,
	Masami Hiramatsu, Mathieu Desnoyers
In-Reply-To: <6efebcb6-194e-4c21-8808-fdf09160eac0@app.fastmail.com>

On Tue, 31 Mar 2026 12:53:38 +0200
"Arnd Bergmann" <arnd@arndb.de> wrote:

> Right, I assumed it had to be something like that, just didn't immediately
> see what. I've sent a patch to just remove the duplicate inline
> function now.

Ah, I had already sent a patch to fix the duplicate because I saw the
kernel test robot's report.

  https://lore.kernel.org/all/20260330205859.24c0aae3@gandalf.local.home/

-- Steve

^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Gabriele Monaco @ 2026-03-31 12:50 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <20260331104918.2710853-1-namcao@linutronix.de>

On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
> Since commit 0c43094f8cc9 ("eventpoll: Replace rwlock with spinlock"),
> epoll_wait is real-time-safe syscall for sleeping.
> 
> Add epoll_wait to the list of rt-safe sleeping APIs.
> 
> Signed-off-by: Nam Cao <namcao@linutronix.de>

Thanks for the patch, looks reasonable.
I tried re-generating the header (sleep.h) with rvgen based on the new
specification and I'm getting a different order.

Is what you're committing the result of rvgen on your computer?
We probably still have some unpredictable results in rvgen's output if that's
the case (no big deal then, though it triggers me a bit).

I would still like to run some tests on this, how urgently would you like this
patch through? I was really about to send Steve a PR with the other changes so
this might need to wait for the next merge window.

Thanks,
Gabriele

> ---
>  kernel/trace/rv/monitors/sleep/sleep.c    |  8 ++
>  kernel/trace/rv/monitors/sleep/sleep.h    | 98 ++++++++++++-----------
>  tools/verification/models/rtapp/sleep.ltl |  1 +
>  3 files changed, 61 insertions(+), 46 deletions(-)
> 
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
> b/kernel/trace/rv/monitors/sleep/sleep.c
> index c1347da69e9d..59091863c17c 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.c
> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
> @@ -49,6 +49,7 @@ static void ltl_atoms_init(struct task_struct *task, struct
> ltl_monitor *mon, bo
>  		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
>  		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
>  		ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  		ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
>  		ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
>  	}
> @@ -63,6 +64,7 @@ static void ltl_atoms_init(struct task_struct *task, struct
> ltl_monitor *mon, bo
>  		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
>  		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
>  		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  
>  		if (strstarts(task->comm, "migration/"))
>  			ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
> @@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs
> *regs, long id)
>  			break;
>  		}
>  		break;
> +#ifdef __NR_epoll_wait
> +	case __NR_epoll_wait:
> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, true);
> +		break;
> +#endif
>  	}
>  }
>  
> @@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs
> *regs, long ret)
>  	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
> +	ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  	ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
>  }
>  
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.h
> b/kernel/trace/rv/monitors/sleep/sleep.h
> index 2ab46fd218d2..95dc2727c059 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.h
> +++ b/kernel/trace/rv/monitors/sleep/sleep.h
> @@ -15,6 +15,7 @@ enum ltl_atom {
>  	LTL_ABORT_SLEEP,
>  	LTL_BLOCK_ON_RT_MUTEX,
>  	LTL_CLOCK_NANOSLEEP,
> +	LTL_EPOLL_WAIT,
>  	LTL_FUTEX_LOCK_PI,
>  	LTL_FUTEX_WAIT,
>  	LTL_KERNEL_THREAD,
> @@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
>  		"ab_sl",
>  		"bl_on_rt_mu",
>  		"cl_na",
> +		"ep_wa",
>  		"fu_lo_pi",
>  		"fu_wa",
>  		"ker_th",
> @@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
>  
>  static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
>  {
> -	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> -	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> -	bool val40 = task_is_rcu || task_is_migration;
> -	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> -	bool val41 = futex_lock_pi || val40;
> -	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> -	bool val5 = block_on_rt_mutex || val41;
> -	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> -	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val32 = abort_sleep || kthread_should_stop;
>  	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
> -	bool val33 = woken_by_nmi || val32;
>  	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
> -	bool val34 = woken_by_hardirq || val33;
>  	bool woken_by_equal_or_higher_prio =
> test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
>  	     mon->atoms);
> -	bool val14 = woken_by_equal_or_higher_prio || val34;
>  	bool wake = test_bit(LTL_WAKE, mon->atoms);
> -	bool val13 = !wake;
> -	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
> +	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> +	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> +	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
> +	bool rt = test_bit(LTL_RT, mon->atoms);
> +	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
>  	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
>  	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> -	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool val25 = nanosleep_timer_abstime && val24;
> -	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> -	bool val18 = clock_nanosleep && val25;
> +	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> +	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> -	bool val9 = futex_wait || val18;
> +	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> +	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
> +	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> +	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> +	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> +	bool val42 = task_is_rcu || task_is_migration;
> +	bool val43 = futex_lock_pi || val42;
> +	bool val5 = block_on_rt_mutex || val43;
> +	bool val34 = abort_sleep || kthread_should_stop;
> +	bool val35 = woken_by_nmi || val34;
> +	bool val36 = woken_by_hardirq || val35;
> +	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val13 = !wake;
> +	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> +	bool val27 = nanosleep_timer_abstime && val26;
> +	bool val18 = clock_nanosleep && val27;
> +	bool val20 = val18 || epoll_wait;
> +	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> -	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool val2 = !sleep;
> -	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool val1 = !rt;
>  	bool val3 = val1 || val2;
>  
> @@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct
> ltl_monitor *mon)
>  static void
>  ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state,
> unsigned long *next)
>  {
> -	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> -	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> -	bool val40 = task_is_rcu || task_is_migration;
> -	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> -	bool val41 = futex_lock_pi || val40;
> -	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> -	bool val5 = block_on_rt_mutex || val41;
> -	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> -	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val32 = abort_sleep || kthread_should_stop;
>  	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
> -	bool val33 = woken_by_nmi || val32;
>  	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
> -	bool val34 = woken_by_hardirq || val33;
>  	bool woken_by_equal_or_higher_prio =
> test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
>  	     mon->atoms);
> -	bool val14 = woken_by_equal_or_higher_prio || val34;
>  	bool wake = test_bit(LTL_WAKE, mon->atoms);
> -	bool val13 = !wake;
> -	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
> +	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> +	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> +	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
> +	bool rt = test_bit(LTL_RT, mon->atoms);
> +	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
>  	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
>  	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> -	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool val25 = nanosleep_timer_abstime && val24;
> -	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> -	bool val18 = clock_nanosleep && val25;
> +	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> +	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> -	bool val9 = futex_wait || val18;
> +	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> +	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
> +	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> +	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> +	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> +	bool val42 = task_is_rcu || task_is_migration;
> +	bool val43 = futex_lock_pi || val42;
> +	bool val5 = block_on_rt_mutex || val43;
> +	bool val34 = abort_sleep || kthread_should_stop;
> +	bool val35 = woken_by_nmi || val34;
> +	bool val36 = woken_by_hardirq || val35;
> +	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val13 = !wake;
> +	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> +	bool val27 = nanosleep_timer_abstime && val26;
> +	bool val18 = clock_nanosleep && val27;
> +	bool val20 = val18 || epoll_wait;
> +	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> -	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool val2 = !sleep;
> -	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool val1 = !rt;
>  	bool val3 = val1 || val2;
>  
> diff --git a/tools/verification/models/rtapp/sleep.ltl
> b/tools/verification/models/rtapp/sleep.ltl
> index 6379bbeb6212..6f26c4810f78 100644
> --- a/tools/verification/models/rtapp/sleep.ltl
> +++ b/tools/verification/models/rtapp/sleep.ltl
> @@ -5,6 +5,7 @@ RT_FRIENDLY_SLEEP = (RT_VALID_SLEEP_REASON or KERNEL_THREAD)
>  
>  RT_VALID_SLEEP_REASON = FUTEX_WAIT
>                       or RT_FRIENDLY_NANOSLEEP
> +                     or EPOLL_WAIT
>  
>  RT_FRIENDLY_NANOSLEEP = CLOCK_NANOSLEEP
>                      and NANOSLEEP_TIMER_ABSTIME


^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Nam Cao @ 2026-03-31 13:41 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <4b47d5e7e9dde0c76beb1a9383a13553c2455d92.camel@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:

> On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
>> Since commit 0c43094f8cc9 ("eventpoll: Replace rwlock with spinlock"),
>> epoll_wait is real-time-safe syscall for sleeping.
>> 
>> Add epoll_wait to the list of rt-safe sleeping APIs.
>> 
>> Signed-off-by: Nam Cao <namcao@linutronix.de>
>
> Thanks for the patch, looks reasonable.
> I tried re-generating the header (sleep.h) with rvgen based on the new
> specification and I'm getting a different order.
>
> Is what you're committing the result of rvgen on your computer?
> We probably still have some unpredictable result in the rvgen's output if that's
> the case (no big deal then, though it triggers me a bit).

Right, fixing this is on my list. The script uses a set, and a set's order
is not deterministic. You get different (but equivalent) results every
time. I should start working on that.

> I would still like to run some tests on this, how urgently would you like this
> patch through? I was really about to send Steve a PR with the other changes so
> this might need to wait for the next merge window.

The earlier the better, but no one will die because it misses a merge
window.

Nam

^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Gabriele Monaco @ 2026-03-31 13:47 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <87y0j86rq8.fsf@yellow.woof>

On Tue, 2026-03-31 at 15:41 +0200, Nam Cao wrote:
> Gabriele Monaco <gmonaco@redhat.com> writes:
> 
> > On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
> > > Since commit 0c43094f8cc9 ("eventpoll: Replace rwlock with spinlock"),
> > > epoll_wait is real-time-safe syscall for sleeping.
> > > 
> > > Add epoll_wait to the list of rt-safe sleeping APIs.
> > > 
> > > Signed-off-by: Nam Cao <namcao@linutronix.de>
> > 
> > Thanks for the patch, looks reasonable.
> > I tried re-generating the header (sleep.h) with rvgen based on the new
> > specification and I'm getting a different order.
> > 
> > Is what you're committing the result of rvgen on your computer?
> > We probably still have some unpredictable result in the rvgen's output if
> > that's
> > the case (no big deal then, though it triggers me a bit).
> 
> Right, fixing this is in my list. The script uses set and set's order is
> not deterministic. You get different (but equivalent) results every
> time. I should start working on that..

Reasonable, no rush. I just noticed this behaviour when trying to write some
selftests for rvgen and didn't manage to make it deterministic with trivial
changes.

> 
> > I would still like to run some tests on this, how urgently would you like
> > this
> > patch through? I was really about to send Steve a PR with the other changes
> > so
> > this might need to wait for the next merge window.
> 
> The earlier the better, but no one will die because it misses a merge
> window.

Alright, I'll see if I can squeeze it in this week; if not, it'll have to
wait. For now:

Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>

Thanks,
Gabriele


^ permalink raw reply

* Re: [PATCH v4 3/5] locking: Add contended_release tracepoint to sleepable locks
From: Usama Arif @ 2026-03-31 14:11 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-kernel, linux-mips,
	virtualization, linux-arch, linux-mm, linux-trace-kernel,
	kernel-team
In-Reply-To: <acu7LdegiZ5_-dEW@shell.ilvokhin.com>



On 31/03/2026 15:16, Dmitry Ilvokhin wrote:
> On Tue, Mar 31, 2026 at 03:34:50AM -0700, Usama Arif wrote:
>> On Thu, 26 Mar 2026 15:10:02 +0000 Dmitry Ilvokhin <d@ilvokhin.com> wrote:
>>
>>> Add the contended_release trace event. This tracepoint fires on the
>>> holder side when a contended lock is released, complementing the
>>> existing contention_begin/contention_end tracepoints which fire on the
>>> waiter side.
>>>
>>> This enables correlating lock hold time under contention with waiter
>>> events by lock address.
>>>
>>> Add trace_contended_release() calls to the slowpath unlock paths of
>>> sleepable locks: mutex, rtmutex, semaphore, rwsem, percpu-rwsem, and
>>> RT-specific rwbase locks.
>>>
>>> Where possible, trace_contended_release() fires before the lock is
>>> released and before the waiter is woken. For some lock types, the
>>> tracepoint fires after the release but before the wake. Making the
>>> placement consistent across all lock types is not worth the added
>>> complexity.
>>>
>>> For reader/writer locks, the tracepoint fires for every reader releasing
>>> while a writer is waiting, not only for the last reader.
>>>
>>> Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
>>> ---
>>>  include/trace/events/lock.h   | 17 +++++++++++++++++
>>>  kernel/locking/mutex.c        |  4 ++++
>>>  kernel/locking/percpu-rwsem.c | 11 +++++++++++
>>>  kernel/locking/rtmutex.c      |  1 +
>>>  kernel/locking/rwbase_rt.c    |  6 ++++++
>>>  kernel/locking/rwsem.c        | 10 ++++++++--
>>>  kernel/locking/semaphore.c    |  4 ++++
>>>  7 files changed, 51 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
>>> index da978f2afb45..1ded869cd619 100644
>>> --- a/include/trace/events/lock.h
>>> +++ b/include/trace/events/lock.h
>>> @@ -137,6 +137,23 @@ TRACE_EVENT(contention_end,
>>>  	TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret)
>>>  );
>>>  
>>> +TRACE_EVENT(contended_release,
>>> +
>>> +	TP_PROTO(void *lock),
>>> +
>>> +	TP_ARGS(lock),
>>> +
>>> +	TP_STRUCT__entry(
>>> +		__field(void *, lock_addr)
>>> +	),
>>> +
>>> +	TP_fast_assign(
>>> +		__entry->lock_addr = lock;
>>> +	),
>>> +
>>> +	TP_printk("%p", __entry->lock_addr)
>>> +);
>>> +
>>>  #endif /* _TRACE_LOCK_H */
>>>  
>>>  /* This part must be outside protection */
>>> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
>>> index 427187ff02db..6c2c9312eb8f 100644
>>> --- a/kernel/locking/mutex.c
>>> +++ b/kernel/locking/mutex.c
>>> @@ -997,6 +997,9 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
>>>  		wake_q_add(&wake_q, next);
>>>  	}
>>>  
>>> +	if (trace_contended_release_enabled() && waiter)
>>> +		trace_contended_release(lock);
>>> +
>>
>> This won't compile? waiter is declared in the if block, so you are using
>> it outside scope here.
>>
> 
> Thanks for the feedback, Usama.
> 
> waiter is declared at function scope, right on top. It's also assigned
> before the if block, so it's still in scope at the tracepoint.

Ah ok, I was reviewing on top of the mm-new branch from today, where waiter
is declared in the if block. Probably something changed related to
locking/tracing and it's not in mm-new yet.

^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Gabriele Monaco @ 2026-03-31 15:15 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <20260331104918.2710853-1-namcao@linutronix.de>


On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
> b/kernel/trace/rv/monitors/sleep/sleep.c
> index c1347da69e9d..59091863c17c 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.c
> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
> @@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs
> *regs, long id)
>  			break;
>  		}
>  		break;
> +#ifdef __NR_epoll_wait
> +	case __NR_epoll_wait:
> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, true);
> +		break;
> +#endif

Sashiko (the AI bot) wonders why this isn't ltl_atom_update() like other things
around here. Is that intentional?

>  	}
>  }
>  
> @@ -174,6 +181,7 @@ static void handle_sys_exit(void *data, struct pt_regs
> *regs, long ret)
>  	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
> +	ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  	ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
>  }
>  
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.h
> b/kernel/trace/rv/monitors/sleep/sleep.h
> index 2ab46fd218d2..95dc2727c059 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.h
> +++ b/kernel/trace/rv/monitors/sleep/sleep.h
> @@ -15,6 +15,7 @@ enum ltl_atom {
>  	LTL_ABORT_SLEEP,
>  	LTL_BLOCK_ON_RT_MUTEX,
>  	LTL_CLOCK_NANOSLEEP,
> +	LTL_EPOLL_WAIT,
>  	LTL_FUTEX_LOCK_PI,
>  	LTL_FUTEX_WAIT,
>  	LTL_KERNEL_THREAD,
> @@ -40,6 +41,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
>  		"ab_sl",
>  		"bl_on_rt_mu",
>  		"cl_na",
> +		"ep_wa",
>  		"fu_lo_pi",
>  		"fu_wa",
>  		"ker_th",
> @@ -75,39 +77,41 @@ static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
>  
>  static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
>  {
> -	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> -	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> -	bool val40 = task_is_rcu || task_is_migration;
> -	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> -	bool val41 = futex_lock_pi || val40;
> -	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> -	bool val5 = block_on_rt_mutex || val41;
> -	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> -	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val32 = abort_sleep || kthread_should_stop;
>  	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
> -	bool val33 = woken_by_nmi || val32;
>  	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
> -	bool val34 = woken_by_hardirq || val33;
>  	bool woken_by_equal_or_higher_prio =
> test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
>  	     mon->atoms);
> -	bool val14 = woken_by_equal_or_higher_prio || val34;
>  	bool wake = test_bit(LTL_WAKE, mon->atoms);
> -	bool val13 = !wake;
> -	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
> +	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> +	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> +	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
> +	bool rt = test_bit(LTL_RT, mon->atoms);
> +	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
>  	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
>  	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> -	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool val25 = nanosleep_timer_abstime && val24;
> -	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> -	bool val18 = clock_nanosleep && val25;
> +	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> +	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> -	bool val9 = futex_wait || val18;
> +	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> +	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
> +	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> +	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> +	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> +	bool val42 = task_is_rcu || task_is_migration;
> +	bool val43 = futex_lock_pi || val42;
> +	bool val5 = block_on_rt_mutex || val43;
> +	bool val34 = abort_sleep || kthread_should_stop;
> +	bool val35 = woken_by_nmi || val34;
> +	bool val36 = woken_by_hardirq || val35;
> +	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val13 = !wake;
> +	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> +	bool val27 = nanosleep_timer_abstime && val26;
> +	bool val18 = clock_nanosleep && val27;
> +	bool val20 = val18 || epoll_wait;
> +	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> -	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool val2 = !sleep;
> -	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool val1 = !rt;
>  	bool val3 = val1 || val2;
>  
> @@ -124,39 +128,41 @@ static void ltl_start(struct task_struct *task, struct
> ltl_monitor *mon)
>  static void
>  ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state,
> unsigned long *next)
>  {
> -	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> -	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> -	bool val40 = task_is_rcu || task_is_migration;
> -	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> -	bool val41 = futex_lock_pi || val40;
> -	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> -	bool val5 = block_on_rt_mutex || val41;
> -	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> -	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val32 = abort_sleep || kthread_should_stop;
>  	bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
> -	bool val33 = woken_by_nmi || val32;
>  	bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
> -	bool val34 = woken_by_hardirq || val33;
>  	bool woken_by_equal_or_higher_prio =
> test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
>  	     mon->atoms);
> -	bool val14 = woken_by_equal_or_higher_prio || val34;
>  	bool wake = test_bit(LTL_WAKE, mon->atoms);
> -	bool val13 = !wake;
> -	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
> +	bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
> +	bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
> +	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
> +	bool rt = test_bit(LTL_RT, mon->atoms);
> +	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
>  	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
>  	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> -	bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool val25 = nanosleep_timer_abstime && val24;
> -	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> -	bool val18 = clock_nanosleep && val25;
> +	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
> +	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> -	bool val9 = futex_wait || val18;
> +	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
> +	bool epoll_wait = test_bit(LTL_EPOLL_WAIT, mon->atoms);
> +	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
> +	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
> +	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> +	bool val42 = task_is_rcu || task_is_migration;
> +	bool val43 = futex_lock_pi || val42;
> +	bool val5 = block_on_rt_mutex || val43;
> +	bool val34 = abort_sleep || kthread_should_stop;
> +	bool val35 = woken_by_nmi || val34;
> +	bool val36 = woken_by_hardirq || val35;
> +	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val13 = !wake;
> +	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> +	bool val27 = nanosleep_timer_abstime && val26;
> +	bool val18 = clock_nanosleep && val27;
> +	bool val20 = val18 || epoll_wait;
> +	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> -	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool val2 = !sleep;
> -	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool val1 = !rt;
>  	bool val3 = val1 || val2;
>  
> diff --git a/tools/verification/models/rtapp/sleep.ltl
> b/tools/verification/models/rtapp/sleep.ltl
> index 6379bbeb6212..6f26c4810f78 100644
> --- a/tools/verification/models/rtapp/sleep.ltl
> +++ b/tools/verification/models/rtapp/sleep.ltl
> @@ -5,6 +5,7 @@ RT_FRIENDLY_SLEEP = (RT_VALID_SLEEP_REASON or KERNEL_THREAD)
>  
>  RT_VALID_SLEEP_REASON = FUTEX_WAIT
>                       or RT_FRIENDLY_NANOSLEEP
> +                     or EPOLL_WAIT
>  
>  RT_FRIENDLY_NANOSLEEP = CLOCK_NANOSLEEP
>                      and NANOSLEEP_TIMER_ABSTIME


^ permalink raw reply

* Re: [PATCH] rv: Allow epoll in rtapp-sleep monitor
From: Nam Cao @ 2026-03-31 15:23 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: Steven Rostedt, linux-trace-kernel, linux-kernel
In-Reply-To: <58674d7f10c260369f5cb78599ba6ecb3804358f.camel@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> On Tue, 2026-03-31 at 12:49 +0200, Nam Cao wrote:
>> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
>> b/kernel/trace/rv/monitors/sleep/sleep.c
>> index c1347da69e9d..59091863c17c 100644
>> --- a/kernel/trace/rv/monitors/sleep/sleep.c
>> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
>> @@ -162,6 +164,11 @@ static void handle_sys_enter(void *data, struct pt_regs
>> *regs, long id)
>>  			break;
>>  		}
>>  		break;
>> +#ifdef __NR_epoll_wait
>> +	case __NR_epoll_wait:
>> +		ltl_atom_set(mon, LTL_EPOLL_WAIT, true);
>> +		break;
>> +#endif
>
> Sashiko (the AI bot) wonders why this isn't ltl_atom_update() like other things
> around here. Is that intentional?

No, that's not intentional. It does not affect the verification result, but
it should still be fixed. I will send v2.

Funnily a colleague just told me earlier today about how good AIs are at
reviewing..

Nam

^ permalink raw reply

* Re: [PATCH v2] bootconfig: Apply early options from embedded config
From: Breno Leitao @ 2026-03-31 15:27 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Jonathan Corbet, Shuah Khan, linux-kernel, linux-trace-kernel,
	linux-doc, oss, paulmck, rostedt, kernel-team
In-Reply-To: <20260331125827.157a833882830007ea9b0b31@kernel.org>

hello Masami,

On Tue, Mar 31, 2026 at 12:58:27PM +0900, Masami Hiramatsu wrote:

> > 3) Ensure that early bootconfig parameters don't overwrite the boot command
> >    line. For example, if the boot command line has foo=bar and bootconfig
> >    later has foo=baz, the command line value should take precedence.
> >    This prevents early boot code (in setup_arch()) from seeing a parameter
> >    value that will be changed later.
>
> OK, this also needs to be considered. Currently we just pass the bootconfig
> parameters right before bootloader given parameters as "extra_command_line"
> if "bootconfig" in cmdline or CONFIG_BOOT_CONFIG_FORCE=y.
>
> [boot_config(.kernel)]<command_line>[ -- [boot_config(.init)][init_command_line]]
>
> This is currently expected behavior. The bootconfig parameters are
> expected to be overridden by command_line or command_line are appended.

That's correct, and I have no intention of changing this behavior. Here's
the current approach:

1) Early parameters from the bootloader are parsed first in setup_arch()

2) Subsequently, bootconfig_apply_early_params() is invoked. Any early
   parameter that was already parsed from the bootloader (in setup_arch())
   will be skipped at this stage.

> If we change this for early params, we also should change the expected
> output of /proc/cmdline too. I think we have 2 options;
>
>  - As before, we expect the parameters provided by the boot configuration
>    to be processed first and then overridden later by the command line.
>
> Or,
>
>  - ignore all parameters which is given from the command line, this also
>    updates existing setup_boot_config() (means xbc_snprint_cmdline() ).
>
> Anyway, this behavior change will also be a bit critical... We have
> to announce it.

As mentioned above, I don't anticipate any changes to existing behavior.
Bootconfig parsing remains unchanged. The only modification is that
bootconfig_apply_early_params() will skip any early config parameter
that's already present in the bootloader command line.

> > +Note that embedded bootconfig is parsed after ``setup_arch()``, so
> > +early options that are consumed during architecture initialization
> > +(e.g., ``mem=``, ``memmap=``, ``earlycon``, ``noapic``, ``nolapic``,
> > +``acpi=``, ``numa=``, ``iommu=``) may not take effect from bootconfig.
> > +
>
> This is easy to explain, but it's quite troublesome for users to
> determine which parameters are unavailable.

Agreed. This turned out to be significantly more complex than I
initially anticipated.

I'm uncertain whether we can accomplish this without examining every
early_param() implementation in depth.

> Currently we can identify
> it by `git grep early_param -- arch/${ARCH}`. But it is setup in
> setup_arch() we need to track the source code. (Or ask AI :))

The challenge extends beyond that. There are numerous early_param()
definitions scattered throughout the kernel that may or may not be
utilized by setup_arch().

For example, consider `early_param("mitigations", ..)` in
./kernel/cpu.c. This modifies the cpu_mitigations global variable, which
is referenced in various locations across different architectures.

It's worth noting that we have over 300 early_param() instances in
the kernel.

Given this, analyzing all these early parameters and examining each one
individually represents a substantial amount of work.

Are there alternative approaches? At this point, I'm leaning toward
breaking bootconfig's dependency on memblock, allowing us to invoke it
before setup_arch(). Is this the only practical solution available?!

Thanks,
--breno

^ permalink raw reply

* Re: [PATCH] tracing: always provide a prototype for tracing_alloc_snapshot()
From: Steven Rostedt @ 2026-03-31 16:13 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Bartosz Golaszewski, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260331182036.e62143d5a9a59776d8cf7ae2@kernel.org>

On Tue, 31 Mar 2026 18:20:36 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Tue, 31 Mar 2026 10:20:01 +0200
> Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com> wrote:
> 
> > The tracing_alloc_snapshot() symbol is always exported even with
> > !CONFIG_TRACER_SNAPSHOT so the prototype too must be always visible or
> > we'll see the following warning:
> > 
> > kernel/trace/trace.c:820:5: warning: no previous prototype for ‘tracing_alloc_snapshot’ [-Wmissing-prototypes]
> >   820 | int tracing_alloc_snapshot(void)
> >       |     ^~~~~~~~~~~~~~~~~~~~~~
> > 
> > Fixes: bade44fe5462 ("tracing: Move snapshot code out of trace.c and into trace_snapshot.c")
> > Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>  
> 
> Good catch!
> 
> Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>

But it is the wrong fix. I already fixed it properly:

  https://patch.msgid.link/20260328101946.2c4ef4a5@robin

It's still in the testing phase but will be going to linux-next this week.

-- Steve

^ permalink raw reply

* [PATCH v9 0/3] tracing: Remove backup instance after read all
From: Masami Hiramatsu (Google) @ 2026-03-31 16:32 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel

Hi,

Here is the v9 of the series to improve backup instances of
the persistent ring buffer. The previous version is here:

https://lore.kernel.org/all/177071300558.2293046.12057922262682243630.stgit@mhiramat.tok.corp.google.com/

In this version, I removed bugfixes (those are actual fixes/minor
updates) and the force permission check in tracefs
because superuser can modify the permission by itself. Instead,
simply add read-only and FMODE_WRITE check in the related open()
file operations[1/3]. Also, I fixed 2 bugs in autoremove patch
to init dedicated workqueue correctly [2/3].

Series Description
------------------
Since backup instances are a kind of snapshot of the persistent
ring buffer, they should be readonly. And if an instance is readonly,
there is no reason to keep it after reading all data via trace_pipe
because the data has been consumed. But the user should be able to remove
the readonly instance by rmdir or truncating `trace` file.

Thus, [1/3] makes backup instances readonly (not able to write any
events, cleanup trace, change buffer size). Also, [2/3] removes the
backup instance after consuming all data via trace_pipe.
With these improvements, even if we make a backup instance (using
the same amount of memory as the persistent ring buffer), it will
be removed automatically after reading the data.

Thanks,

---

Masami Hiramatsu (Google) (3):
      tracing: Make the backup instance non-reusable
      tracing: Remove the backup instance automatically after read
      tracing/Documentation: Add a section about backup instance


 Documentation/trace/debugging.rst |   19 ++++
 kernel/trace/trace.c              |  166 +++++++++++++++++++++++++++++--------
 kernel/trace/trace.h              |   13 +++
 kernel/trace/trace_boot.c         |    5 +
 kernel/trace/trace_events.c       |   76 ++++++++++-------
 5 files changed, 208 insertions(+), 71 deletions(-)

--
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH v9 1/3] tracing: Make the backup instance non-reusable
From: Masami Hiramatsu (Google) @ 2026-03-31 16:32 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177497473558.569199.6527680985537865638.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since there is no reason to reuse the backup instance, make it
readonly (but erasable).
Note that only backup instances are readonly, because
other trace instances will be empty unless they are writable.
Only backup instances have entries copied from the original.

With this change, most of the trace control files are removed
from the backup instance, including eventfs enable/filter etc.

 # find /sys/kernel/tracing/instances/backup/events/ | wc -l
 4093
 # find /sys/kernel/tracing/instances/boot_map/events/ | wc -l
 9573

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v9:
   - Add forcibly readonly check in open() operations.
 Changes in v8:
   - Remove read-only checks in read() operation.
 Changes in v7:
   - Return -EACCES instead of -EPERM.
 Changes in v6:
   - Remove tracing_on file from readonly instances.
   - Remove unused writable_mode from tracing_init_tracefs_percpu().
   - Cleanup init_tracer_tracefs() and create_event_toplevel_files().
   - Remove TRACE_MODE_WRITE_MASK.
   - Add TRACE_ARRAY_FL_RDONLY.
 Changes in v5:
   - Rebased on the latest for-next (and hide show_event_filters/triggers
     if the instance is readonly).
 Changes in v4:
  - Make trace data erasable. (not reusable)
 Changes in v3:
  - Reuse the beginning part of event_entries for readonly files.
  - Remove readonly file_operations and checking readonly flag in
    each write operation.
 Changes in v2:
  - Use readonly file_operations to prohibit writing instead of
    checking flags in write() callbacks.
  - Remove writable files from eventfs.
---
 kernel/trace/trace.c        |   81 +++++++++++++++++++++++++++----------------
 kernel/trace/trace.h        |    7 ++++
 kernel/trace/trace_boot.c   |    5 ++-
 kernel/trace/trace_events.c |   76 +++++++++++++++++++++++-----------------
 4 files changed, 104 insertions(+), 65 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7b9dd6378849..8cec7bd70438 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3414,6 +3414,11 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp)
 	if (ret)
 		return ret;
 
+	if ((filp->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+		trace_array_put(tr);
+		return -EACCES;
+	}
+
 	filp->private_data = inode->i_private;
 
 	return 0;
@@ -6435,6 +6440,11 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
 	if (ret)
 		return ret;
 
+	if ((file->f_mode & FMODE_WRITE) && trace_array_is_readonly(tr)) {
+		trace_array_put(tr);
+		return -EACCES;
+	}
+
 	ret = single_open(file, tracing_clock_show, inode->i_private);
 	if (ret < 0)
 		trace_array_put(tr);
@@ -8771,17 +8781,22 @@ static __init void create_trace_instances(struct dentry *d_tracer)
 static void
 init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 {
+	umode_t writable_mode = TRACE_MODE_WRITE;
 	int cpu;
 
+	if (trace_array_is_readonly(tr))
+		writable_mode = TRACE_MODE_READ;
+
 	trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer,
-			tr, &show_traces_fops);
+			  tr, &show_traces_fops);
 
-	trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer,
-			tr, &set_tracer_fops);
+	trace_create_file("current_tracer", writable_mode, d_tracer,
+			  tr, &set_tracer_fops);
 
-	trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer,
+	trace_create_file("tracing_cpumask", writable_mode, d_tracer,
 			  tr, &tracing_cpumask_fops);
 
+	/* Options are used for changing print-format even for readonly instance. */
 	trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer,
 			  tr, &tracing_iter_fops);
 
@@ -8791,12 +8806,36 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer,
 			  tr, &tracing_pipe_fops);
 
-	trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer,
+	trace_create_file("buffer_size_kb", writable_mode, d_tracer,
 			  tr, &tracing_entries_fops);
 
 	trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer,
 			  tr, &tracing_total_entries_fops);
 
+	trace_create_file("trace_clock", writable_mode, d_tracer, tr,
+			  &trace_clock_fops);
+
+	trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
+			  &trace_time_stamp_mode_fops);
+
+	tr->buffer_percent = 50;
+
+	trace_create_file("buffer_subbuf_size_kb", writable_mode, d_tracer,
+			  tr, &buffer_subbuf_size_fops);
+
+	create_trace_options_dir(tr);
+
+	if (tr->range_addr_start)
+		trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
+				  tr, &last_boot_fops);
+
+	for_each_tracing_cpu(cpu)
+		tracing_init_tracefs_percpu(tr, cpu);
+
+	/* Read-only instance has above files only. */
+	if (trace_array_is_readonly(tr))
+		return;
+
 	trace_create_file("free_buffer", 0200, d_tracer,
 			  tr, &tracing_free_buffer_fops);
 
@@ -8808,49 +8847,29 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("trace_marker_raw", 0220, d_tracer,
 			  tr, &tracing_mark_raw_fops);
 
-	trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr,
-			  &trace_clock_fops);
-
-	trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
-			  tr, &rb_simple_fops);
-
-	trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr,
-			  &trace_time_stamp_mode_fops);
-
-	tr->buffer_percent = 50;
-
 	trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
-			tr, &buffer_percent_fops);
-
-	trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
-			  tr, &buffer_subbuf_size_fops);
+			  tr, &buffer_percent_fops);
 
 	trace_create_file("syscall_user_buf_size", TRACE_MODE_WRITE, d_tracer,
-			 tr, &tracing_syscall_buf_fops);
+			  tr, &tracing_syscall_buf_fops);
 
-	create_trace_options_dir(tr);
+	trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer,
+			  tr, &rb_simple_fops);
 
 	trace_create_maxlat_file(tr, d_tracer);
 
 	if (ftrace_create_function_files(tr, d_tracer))
 		MEM_FAIL(1, "Could not allocate function filter files");
 
-	if (tr->range_addr_start) {
-		trace_create_file("last_boot_info", TRACE_MODE_READ, d_tracer,
-				  tr, &last_boot_fops);
 #ifdef CONFIG_TRACER_SNAPSHOT
-	} else {
+	if (!tr->range_addr_start)
 		trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
 				  tr, &snapshot_fops);
 #endif
-	}
 
 	trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer,
 			  tr, &tracing_err_log_fops);
 
-	for_each_tracing_cpu(cpu)
-		tracing_init_tracefs_percpu(tr, cpu);
-
 	ftrace_init_tracefs(tr, d_tracer);
 }
 
@@ -9635,7 +9654,7 @@ __init static void enable_instances(void)
 		 * Backup buffers can be freed but need vfree().
 		 */
 		if (backup)
-			tr->flags |= TRACE_ARRAY_FL_VMALLOC;
+			tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY;
 
 		if (start || backup) {
 			tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a3ea735a9ef6..2d9d26d423f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -464,6 +464,7 @@ enum {
 	TRACE_ARRAY_FL_MOD_INIT		= BIT(3),
 	TRACE_ARRAY_FL_MEMMAP		= BIT(4),
 	TRACE_ARRAY_FL_VMALLOC		= BIT(5),
+	TRACE_ARRAY_FL_RDONLY		= BIT(6),
 };
 
 #ifdef CONFIG_MODULES
@@ -493,6 +494,12 @@ extern unsigned long trace_adjust_address(struct trace_array *tr, unsigned long
 
 extern struct trace_array *printk_trace;
 
+static inline bool trace_array_is_readonly(struct trace_array *tr)
+{
+	/* backup instance is read only. */
+	return tr->flags & TRACE_ARRAY_FL_RDONLY;
+}
+
 /*
  * The global tracer (top) should be the first trace array added,
  * but we check the flag anyway.
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index dbe29b4c6a7a..2ca2541c8a58 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -61,7 +61,8 @@ trace_boot_set_instance_options(struct trace_array *tr, struct xbc_node *node)
 		v = memparse(p, NULL);
 		if (v < PAGE_SIZE)
 			pr_err("Buffer size is too small: %s\n", p);
-		if (tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
+		if (trace_array_is_readonly(tr) ||
+		    tracing_resize_ring_buffer(tr, v, RING_BUFFER_ALL_CPUS) < 0)
 			pr_err("Failed to resize trace buffer to %s\n", p);
 	}
 
@@ -597,7 +598,7 @@ trace_boot_enable_tracer(struct trace_array *tr, struct xbc_node *node)
 
 	p = xbc_node_find_value(node, "tracer", NULL);
 	if (p && *p != '\0') {
-		if (tracing_set_tracer(tr, p) < 0)
+		if (trace_array_is_readonly(tr) || tracing_set_tracer(tr, p) < 0)
 			pr_err("Failed to set given tracer: %s\n", p);
 	}
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index de807a9e2371..7ddcee312471 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1401,6 +1401,9 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
 {
 	int ret;
 
+	if (trace_array_is_readonly(tr))
+		return -EACCES;
+
 	mutex_lock(&event_mutex);
 	ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
 	mutex_unlock(&event_mutex);
@@ -2969,8 +2972,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
 	} else
 		__get_system(system);
 
-	/* ftrace only has directories no files */
-	if (strcmp(name, "ftrace") == 0)
+	/* ftrace only has directories no files, readonly instance too. */
+	if (strcmp(name, "ftrace") == 0 || trace_array_is_readonly(tr))
 		nr_entries = 0;
 	else
 		nr_entries = ARRAY_SIZE(system_entries);
@@ -3135,28 +3138,30 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 	int ret;
 	static struct eventfs_entry event_entries[] = {
 		{
-			.name		= "enable",
+			.name		= "format",
 			.callback	= event_callback,
-			.release	= event_release,
 		},
+#ifdef CONFIG_PERF_EVENTS
 		{
-			.name		= "filter",
+			.name		= "id",
 			.callback	= event_callback,
 		},
+#endif
+#define NR_RO_EVENT_ENTRIES	(1 + IS_ENABLED(CONFIG_PERF_EVENTS))
+/* Readonly files must be above this line and counted by NR_RO_EVENT_ENTRIES. */
 		{
-			.name		= "trigger",
+			.name		= "enable",
 			.callback	= event_callback,
+			.release	= event_release,
 		},
 		{
-			.name		= "format",
+			.name		= "filter",
 			.callback	= event_callback,
 		},
-#ifdef CONFIG_PERF_EVENTS
 		{
-			.name		= "id",
+			.name		= "trigger",
 			.callback	= event_callback,
 		},
-#endif
 #ifdef CONFIG_HIST_TRIGGERS
 		{
 			.name		= "hist",
@@ -3189,7 +3194,10 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 	if (!e_events)
 		return -ENOMEM;
 
-	nr_entries = ARRAY_SIZE(event_entries);
+	if (trace_array_is_readonly(tr))
+		nr_entries = NR_RO_EVENT_ENTRIES;
+	else
+		nr_entries = ARRAY_SIZE(event_entries);
 
 	name = trace_event_name(call);
 	ei = eventfs_create_dir(name, e_events, event_entries, nr_entries, file);
@@ -4532,31 +4540,44 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 	int nr_entries;
 	static struct eventfs_entry events_entries[] = {
 		{
-			.name		= "enable",
+			.name		= "header_page",
 			.callback	= events_callback,
 		},
 		{
-			.name		= "header_page",
+			.name		= "header_event",
 			.callback	= events_callback,
 		},
+#define NR_RO_TOP_ENTRIES	2
+/* Readonly files must be above this line and counted by NR_RO_TOP_ENTRIES. */
 		{
-			.name		= "header_event",
+			.name		= "enable",
 			.callback	= events_callback,
 		},
 	};
 
-	entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
-				  tr, &ftrace_set_event_fops);
-	if (!entry)
-		return -ENOMEM;
+	if (!trace_array_is_readonly(tr)) {
+		entry = trace_create_file("set_event", TRACE_MODE_WRITE, parent,
+					tr, &ftrace_set_event_fops);
+		if (!entry)
+			return -ENOMEM;
 
-	trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
-			  &ftrace_show_event_filters_fops);
+		/* There are not as crucial, just warn if they are not created */
+		trace_create_file("show_event_filters", TRACE_MODE_READ, parent, tr,
+				&ftrace_show_event_filters_fops);
 
-	trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
-			  &ftrace_show_event_triggers_fops);
+		trace_create_file("show_event_triggers", TRACE_MODE_READ, parent, tr,
+				&ftrace_show_event_triggers_fops);
 
-	nr_entries = ARRAY_SIZE(events_entries);
+		trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
+				tr, &ftrace_set_event_pid_fops);
+
+		trace_create_file("set_event_notrace_pid",
+				TRACE_MODE_WRITE, parent, tr,
+				&ftrace_set_event_notrace_pid_fops);
+		nr_entries = ARRAY_SIZE(events_entries);
+	} else {
+		nr_entries = NR_RO_TOP_ENTRIES;
+	}
 
 	e_events = eventfs_create_events_dir("events", parent, events_entries,
 					     nr_entries, tr);
@@ -4565,15 +4586,6 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
 		return -ENOMEM;
 	}
 
-	/* There are not as crucial, just warn if they are not created */
-
-	trace_create_file("set_event_pid", TRACE_MODE_WRITE, parent,
-			  tr, &ftrace_set_event_pid_fops);
-
-	trace_create_file("set_event_notrace_pid",
-			  TRACE_MODE_WRITE, parent, tr,
-			  &ftrace_set_event_notrace_pid_fops);
-
 	tr->event_dir = e_events;
 
 	return 0;


^ permalink raw reply related

* [PATCH v9 2/3] tracing: Remove the backup instance automatically after read
From: Masami Hiramatsu (Google) @ 2026-03-31 16:32 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177497473558.569199.6527680985537865638.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since the backup instance is readonly, after reading all data
via pipe, no data is left on the instance. Thus it can be
removed safely after closing all files.
This also removes it if user resets the ring buffer manually
via 'trace' file.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v9:
   - Fix to initialize autoremove workqueue only for backup.
   - Fix to return -ENODEV if trace_array_get() refers freeing instance.
 Changes in v6:
   - Fix typo in comment.
   - Only when there is a readonly trace array, initialize autoremove_wq.
   - Fix to exit loop in trace_array_get() if tr is found in the list.
 Changes in v4:
   - Update description.
---
 kernel/trace/trace.c |   85 ++++++++++++++++++++++++++++++++++++++++++++++----
 kernel/trace/trace.h |    6 ++++
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8cec7bd70438..1d73400a01c7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -539,8 +539,65 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
 	tr->ring_buffer_expanded = true;
 }
 
+static int __remove_instance(struct trace_array *tr);
+
+static void trace_array_autoremove(struct work_struct *work)
+{
+	struct trace_array *tr = container_of(work, struct trace_array, autoremove_work);
+
+	guard(mutex)(&event_mutex);
+	guard(mutex)(&trace_types_lock);
+
+	/*
+	 * This can be fail if someone gets @tr before starting this
+	 * function, but in that case, this will be kicked again when
+	 * putting it. So we don't care about the result.
+	 */
+	__remove_instance(tr);
+}
+
+static struct workqueue_struct *autoremove_wq;
+
+static void trace_array_kick_autoremove(struct trace_array *tr)
+{
+	if (autoremove_wq && !work_pending(&tr->autoremove_work))
+		queue_work(autoremove_wq, &tr->autoremove_work);
+}
+
+static void trace_array_cancel_autoremove(struct trace_array *tr)
+{
+	if (autoremove_wq && work_pending(&tr->autoremove_work))
+		cancel_work(&tr->autoremove_work);
+}
+
+static void trace_array_init_autoremove(struct trace_array *tr)
+{
+	INIT_WORK(&tr->autoremove_work, trace_array_autoremove);
+}
+
+static void trace_array_start_autoremove(void)
+{
+	if (autoremove_wq)
+		return;
+
+	autoremove_wq = alloc_workqueue("tr_autoremove_wq",
+					WQ_UNBOUND | WQ_HIGHPRI, 0);
+	if (!autoremove_wq)
+		pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n");
+}
+
 LIST_HEAD(ftrace_trace_arrays);
 
+static int __trace_array_get(struct trace_array *this_tr)
+{
+	/* When free_on_close is set, this is not available anymore. */
+	if (autoremove_wq && this_tr->free_on_close)
+		return -ENODEV;
+
+	this_tr->ref++;
+	return 0;
+}
+
 int trace_array_get(struct trace_array *this_tr)
 {
 	struct trace_array *tr;
@@ -548,8 +605,7 @@ int trace_array_get(struct trace_array *this_tr)
 	guard(mutex)(&trace_types_lock);
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		if (tr == this_tr) {
-			tr->ref++;
-			return 0;
+			return __trace_array_get(tr);
 		}
 	}
 
@@ -560,6 +616,12 @@ static void __trace_array_put(struct trace_array *this_tr)
 {
 	WARN_ON(!this_tr->ref);
 	this_tr->ref--;
+	/*
+	 * When free_on_close is set, prepare removing the array
+	 * when the last reference is released.
+	 */
+	if (this_tr->ref == 1 && this_tr->free_on_close)
+		trace_array_kick_autoremove(this_tr);
 }
 
 /**
@@ -4829,6 +4891,10 @@ static void update_last_data(struct trace_array *tr)
 	/* Only if the buffer has previous boot data clear and update it. */
 	tr->flags &= ~TRACE_ARRAY_FL_LAST_BOOT;
 
+	/* If this is a backup instance, mark it for autoremove. */
+	if (tr->flags & TRACE_ARRAY_FL_VMALLOC)
+		tr->free_on_close = true;
+
 	/* Reset the module list and reload them */
 	if (tr->scratch) {
 		struct trace_scratch *tscratch = tr->scratch;
@@ -8442,8 +8508,8 @@ struct trace_array *trace_array_find_get(const char *instance)
 
 	guard(mutex)(&trace_types_lock);
 	tr = trace_array_find(instance);
-	if (tr)
-		tr->ref++;
+	if (tr && __trace_array_get(tr) < 0)
+		tr = NULL;
 
 	return tr;
 }
@@ -8540,6 +8606,8 @@ trace_array_create_systems(const char *name, const char *systems,
 	if (ftrace_allocate_ftrace_ops(tr) < 0)
 		goto out_free_tr;
 
+	trace_array_init_autoremove(tr);
+
 	ftrace_init_trace_array(tr);
 
 	init_trace_flags_index(tr);
@@ -8650,7 +8718,9 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
 
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		if (tr->name && strcmp(tr->name, name) == 0) {
-			tr->ref++;
+			/* if this fails, @tr is going to be removed. */
+			if (__trace_array_get(tr) < 0)
+				tr = NULL;
 			return tr;
 		}
 	}
@@ -8689,6 +8759,7 @@ static int __remove_instance(struct trace_array *tr)
 			set_tracer_flag(tr, 1ULL << i, 0);
 	}
 
+	trace_array_cancel_autoremove(tr);
 	tracing_set_nop(tr);
 	clear_ftrace_function_probes(tr);
 	event_trace_del_tracer(tr);
@@ -9653,8 +9724,10 @@ __init static void enable_instances(void)
 		/*
 		 * Backup buffers can be freed but need vfree().
 		 */
-		if (backup)
+		if (backup) {
 			tr->flags |= TRACE_ARRAY_FL_VMALLOC | TRACE_ARRAY_FL_RDONLY;
+			trace_array_start_autoremove();
+		}
 
 		if (start || backup) {
 			tr->flags |= TRACE_ARRAY_FL_BOOT | TRACE_ARRAY_FL_LAST_BOOT;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2d9d26d423f1..60e079177492 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -455,6 +455,12 @@ struct trace_array {
 	 * we do not waste memory on systems that are not using tracing.
 	 */
 	bool ring_buffer_expanded;
+	/*
+	 * If the ring buffer is a read only backup instance, it will be
+	 * removed after dumping all data via pipe, because no readable data.
+	 */
+	bool free_on_close;
+	struct work_struct	autoremove_work;
 };
 
 enum {


^ permalink raw reply related

* [PATCH v9 3/3] tracing/Documentation: Add a section about backup instance
From: Masami Hiramatsu (Google) @ 2026-03-31 16:32 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel
In-Reply-To: <177497473558.569199.6527680985537865638.stgit@mhiramat.tok.corp.google.com>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Add a section about backup instance to the debugging.rst.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
  Changes in v6:
   - Fix typos.
---
 Documentation/trace/debugging.rst |   19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/trace/debugging.rst b/Documentation/trace/debugging.rst
index 4d88c346fc38..15857951b506 100644
--- a/Documentation/trace/debugging.rst
+++ b/Documentation/trace/debugging.rst
@@ -159,3 +159,22 @@ If setting it from the kernel command line, it is recommended to also
 disable tracing with the "traceoff" flag, and enable tracing after boot up.
 Otherwise the trace from the most recent boot will be mixed with the trace
 from the previous boot, and may make it confusing to read.
+
+Using a backup instance for keeping previous boot data
+------------------------------------------------------
+
+It is also possible to record trace data at system boot time by specifying
+events with the persistent ring buffer, but in this case the data before the
+reboot will be lost before it can be read. This problem can be solved by a
+backup instance. From the kernel command line::
+
+  reserve_mem=12M:4096:trace trace_instance=boot_map@trace,sched,irq trace_instance=backup=boot_map
+
+On boot up, the previous data in the "boot_map" is copied to the "backup"
+instance, and the "sched:*" and "irq:*" events for the current boot are traced
+in the "boot_map". Thus the user can read the previous boot data from the "backup"
+instance without stopping the trace.
+
+Note that this "backup" instance is readonly, and will be removed automatically
+if you clear the trace data or read out all trace data from the "trace_pipe"
+or the "trace_pipe_raw" files.
\ No newline at end of file


^ permalink raw reply related

* Re: [PATCH v15 1/5] ring-buffer: Flush and stop persistent ring buffer on panic
From: Catalin Marinas @ 2026-03-31 17:57 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Steven Rostedt, Will Deacon, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, Ian Rogers, linux-arm-kernel, Robin Murphy
In-Reply-To: <177494616630.71933.2941681397188791689.stgit@mhiramat.tok.corp.google.com>

On Tue, Mar 31, 2026 at 05:36:06PM +0900, Masami Hiramatsu (Google) wrote:
> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> On real hardware, panic and machine reboot may not flush hardware cache
> to memory. This means the persistent ring buffer, which relies on a
> coherent state of memory, may not have its events written to the buffer
> and they may be lost. Moreover, there may be inconsistency with the
> counters which are used for validation of the integrity of the
> persistent ring buffer which may cause all data to be discarded.
> 
> To avoid this issue, stop recording of the ring buffer on panic and
> flush the cache of the ring buffer's memory.
> 
> Fixes: e645535a954a ("tracing: Add option to use memmapped memory for trace boot instance")
> Cc: stable@vger.kernel.org
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
[...]
> diff --git a/arch/arm64/include/asm/ring_buffer.h b/arch/arm64/include/asm/ring_buffer.h
> new file mode 100644
> index 000000000000..62316c406888
> --- /dev/null
> +++ b/arch/arm64/include/asm/ring_buffer.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef _ASM_ARM64_RING_BUFFER_H
> +#define _ASM_ARM64_RING_BUFFER_H
> +
> +#include <asm/cacheflush.h>
> +
> +/* Flush D-cache on persistent ring buffer */
> +#define arch_ring_buffer_flush_range(start, end)	dcache_clean_pop(start, end)
> +
> +#endif /* _ASM_ARM64_RING_BUFFER_H */

Adding Robin as he wrote the pmem support for arm64.

I assume the ring buffer here is cacheable memory, otherwise we'd also
need a dmb(osh) as in arch_wb_cache_pmem(). If that's correct:

Acked-by: Catalin Marinas <catalin.marinas@arm.com>

^ permalink raw reply

* Re: [PATCH v15 1/5] ring-buffer: Flush and stop persistent ring buffer on panic
From: Steven Rostedt @ 2026-03-31 18:03 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Masami Hiramatsu (Google), Will Deacon, Mathieu Desnoyers,
	linux-kernel, linux-trace-kernel, Ian Rogers, linux-arm-kernel,
	Robin Murphy
In-Reply-To: <acwK-Xu-Mon2_6bT@arm.com>

On Tue, 31 Mar 2026 18:57:13 +0100
Catalin Marinas <catalin.marinas@arm.com> wrote:

> I assume the ring buffer here is cacheable memory, otherwise we'd also
> need a dmb(osh) as in arch_wb_cache_pmem(). If that's correct:

Yes, it's cacheable (I couldn't imagine the performance overhead if it
wasn't! ;-)

> 
> Acked-by: Catalin Marinas <catalin.marinas@arm.com>

Thanks!

-- Steve

^ permalink raw reply

* [PATCH v3 1/3] tracing: Have futex syscall trace event show specific user data
From: Steven Rostedt @ 2026-03-31 18:13 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Thomas Gleixner, Peter Zijlstra, Brian Geffon, John Stultz,
	Ian Rogers, Suleiman Souhlal
In-Reply-To: <20260331181349.062575155@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

Add specific reporting of the futex system call. This makes debugging
the futex code a bit easier. Instead of just showing the values passed
into the futex system call, read the value of the user space memory
pointed to by the addr parameter.

Also make the op parameter more readable by parsing the values to show
what the command is:

 futex_requeue_p-3251    [002] .....  2101.068479: sys_futex(uaddr: 0x55e79a4da834 (0x80000cb1), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)
 futex_requeue_p-3248    [001] .....  2101.068970: sys_futex(uaddr: 0x7f859072f990 (0xcb2), FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, val: 3250)
 futex_requeue_p-3252    [005] .....  2101.069108: sys_futex(uaddr: 0x55e79a4da838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7ffe61076aa0, uaddr2: 0x55e79a4da834, uaddr2: 94453214586932, val3: 0)
 futex_requeue_p-3252    [005] .....  2101.069410: sys_futex(uaddr: 0x55e79a4da834 (0x80000cb1), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v2: https://patch.msgid.link/20260310201036.542627924@kernel.org

- Removed unused "buf" variable (kernel test robot)

- Iterate __futex_cmds[] to make the print statement.
  Note this required exposing __futex_cmds[] to trace_syscall.c

- Added back val statement (with the move to futex/syscall.c the
  third parameter was dropped).

 include/linux/futex.h         |   8 +-
 kernel/futex/syscalls.c       |  97 +++++++++++++++++++++++
 kernel/trace/trace_syscalls.c | 144 +++++++++++++++++++++++++++++++++-
 3 files changed, 245 insertions(+), 4 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 9e9750f04980..7eaf01ff68cf 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -82,6 +82,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
 int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4);
 
+#ifdef CONFIG_FTRACE_SYSCALLS
+void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u32 *ptr);
+extern const char * __futex_cmds[];
+#endif
+
 #ifdef CONFIG_FUTEX_PRIVATE_HASH
 int futex_hash_allocate_default(void);
 void futex_hash_free(struct mm_struct *mm);
@@ -114,7 +119,8 @@ static inline int futex_hash_allocate_default(void)
 }
 static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
 static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
-
+static inline void futex_print_syscall(struct seq_buf *s, int nr_args,
+				       unsigned long *args, u32 *ptr) { }
 #endif
 
 #endif
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 743c7a728237..3ba8ca017c9c 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -171,6 +171,18 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd)
 	return false;
 }
 
+static __always_inline bool futex_cmd_has_addr2(u32 cmd)
+{
+	switch (cmd) {
+	case FUTEX_REQUEUE:
+	case FUTEX_CMP_REQUEUE:
+	case FUTEX_WAKE_OP:
+	case FUTEX_WAIT_REQUEUE_PI:
+		return true;
+	}
+	return false;
+}
+
 static __always_inline int
 futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
 {
@@ -207,6 +219,91 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
 }
 
+#ifdef CONFIG_FTRACE_SYSCALLS
+
+/* End with NULL for iterators */
+const char * __futex_cmds[] =
+{
+	"FUTEX_WAIT", "FUTEX_WAKE", "FUTEX_FD", "FUTEX_REQUEUE",
+	"FUTEX_CMP_REQUEUE", "FUTEX_WAKE_OP", "FUTEX_LOCK_PI",
+	"FUTEX_UNLOCK_PI", "FUTEX_TRYLOCK_PI", "FUTEX_WAIT_BITSET",
+	"FUTEX_WAKE_BITSET", "FUTEX_WAIT_REQUEUE_PI", "FUTEX_CMP_REQUEUE_PI",
+	"FUTEX_LOCK_PI2", NULL
+};
+
+void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u32 *ptr)
+{
+	unsigned int op, cmd;
+	bool done = false;
+
+	for (int i = 0; !done && i < nr_args; i++) {
+
+		if (seq_buf_has_overflowed(s))
+			break;
+
+		switch (i) {
+		case 0:
+			seq_buf_printf(s, "uaddr: 0x%lx", args[i]);
+			if (ptr) {
+				u32 val = *ptr;
+				if (val < 10)
+					seq_buf_printf(s, " (%u)", val);
+				else
+					seq_buf_printf(s, " (0x%x)", val);
+			}
+			continue;
+		case 1:
+			op = args[i];
+			cmd = op & FUTEX_CMD_MASK;
+			if (cmd <= FUTEX_LOCK_PI2)
+				seq_buf_printf(s, ", %s", __futex_cmds[cmd]);
+			else
+				seq_buf_puts(s, ", UNKNOWN");
+
+			if (op & FUTEX_PRIVATE_FLAG)
+				seq_buf_puts(s, "|FUTEX_PRIVATE_FLAG");
+			if (op & FUTEX_CLOCK_REALTIME)
+				seq_buf_puts(s, "|FUTEX_CLOCK_REALTIME");
+			continue;
+		case 2:
+			if (args[i] < 10)
+				seq_buf_printf(s, ", val: %ld", args[i]);
+			else
+				seq_buf_printf(s, ", val: 0x%lx", args[i]);
+			continue;
+		case 3:
+			if (!futex_cmd_has_timeout(cmd)) {
+
+				if (!futex_cmd_has_addr2(cmd)) {
+					done = true;
+					continue;
+				}
+
+				seq_buf_printf(s, ", val2: 0x%x", (u32)(long)args[i]);
+				continue;
+			}
+
+			if (!args[i])
+				continue;
+
+			seq_buf_printf(s, ", timespec: 0x%lx", args[i]);
+			continue;
+		case 4:
+			if (!futex_cmd_has_addr2(cmd)) {
+				done = true;
+				continue;
+			}
+			seq_buf_printf(s, ", uaddr2: 0x%lx", args[i]);
+			continue;
+		case 5:
+			seq_buf_printf(s, ", val3: %lu", args[i]);
+			done = true;
+			continue;
+		}
+	}
+}
+#endif
+
 /**
  * futex_parse_waitv - Parse a waitv array from userspace
  * @futexv:	Kernel side list of waiters to be filled
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 37317b81fcda..f8213d772f89 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -6,11 +6,13 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
+#include <linux/futex.h>
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <linux/xarray.h>
 #include <asm/syscall.h>
 
+
 #include "trace_output.h"
 #include "trace.h"
 
@@ -237,6 +239,27 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat
 	return trace_handle_return(s);
 }
 
+static enum print_line_t
+sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
+		      struct trace_seq *s, struct trace_event *event, int ent_size)
+{
+	void *end = (void *)trace + ent_size;
+	void *ptr;
+
+	/* Set ptr to the user space copied area */
+	ptr = (void *)trace->args + sizeof(unsigned long) * entry->nb_args;
+	if (ptr + 4 > end)
+		ptr = NULL;
+
+	trace_seq_printf(s, "%s(", entry->name);
+
+	futex_print_syscall(&s->seq, entry->nb_args, trace->args, ptr);
+
+	trace_seq_puts(s, ")\n");
+
+	return trace_handle_return(s);
+}
+
 static enum print_line_t
 print_syscall_enter(struct trace_iterator *iter, int flags,
 		    struct trace_event *event)
@@ -267,6 +290,10 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
 		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
 			return sys_enter_openat_print(trace, entry, s, event);
 		break;
+	case __NR_futex:
+		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
+			return sys_enter_futex_print(trace, entry, s, event, iter->ent_size);
+		break;
 	default:
 		break;
 	}
@@ -437,6 +464,48 @@ sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	return pos;
 }
 
+static int __init
+sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len)
+{
+	int pos = 0;
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"\"uaddr: 0x%%lx (0x%%lx) cmd=%%s%%s%%s");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"  val: 0x%%x timeout/val2: 0x%%llx");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" uaddr2: 0x%%lx val3: 0x%%x\", ");
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->uaddr,");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->__value,");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			"  __print_symbolic(REC->op & 0x%x, ", FUTEX_CMD_MASK);
+
+	for (int i = 0; __futex_cmds[i]; i++) {
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				"%s{%d, \"%s\"} ",
+				i ? "," : "", i, __futex_cmds[i]);
+	}
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "), ");
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" (REC->op & %d) ? \"|FUTEX_PRIVATE_FLAG\" : \"\",",
+			FUTEX_PRIVATE_FLAG);
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" (REC->op & %d) ? \"|FUTEX_CLOCK_REALTIME\" : \"\",",
+			FUTEX_CLOCK_REALTIME);
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->val, REC->utime,");
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->uaddr, REC->val3");
+	return pos;
+}
+
 static int __init
 __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 {
@@ -447,6 +516,8 @@ __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	switch (entry->syscall_nr) {
 	case __NR_openat:
 		return sys_enter_openat_print_fmt(entry, buf, len);
+	case __NR_futex:
+		return sys_enter_futex_print_fmt(entry, buf, len);
 	default:
 		break;
 	}
@@ -523,6 +594,21 @@ static void __init free_syscall_print_fmt(struct trace_event_call *call)
 		kfree(call->print_fmt);
 }
 
+static int __init futex_fields(struct trace_event_call *call, int offset)
+{
+	char *arg;
+	int ret;
+
+	arg = kstrdup("__value", GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg))
+		return -ENOMEM;
+	ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0,
+				 FILTER_OTHER);
+	if (ret)
+		kfree(arg);
+	return ret;
+}
+
 static int __init syscall_enter_define_fields(struct trace_event_call *call)
 {
 	struct syscall_trace_enter trace;
@@ -544,6 +630,9 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
 		offset += sizeof(unsigned long);
 	}
 
+	if (!ret && meta->syscall_nr == __NR_futex)
+		return futex_fields(call, offset);
+
 	if (ret || !meta->user_mask)
 		return ret;
 
@@ -689,6 +778,45 @@ static int syscall_copy_user_array(char *buf, const char __user *ptr,
 	return 0;
 }
 
+static int
+syscall_get_futex(unsigned long *args, char **buffer, int *size, int buf_size)
+{
+	struct syscall_user_buffer *sbuf;
+	const char __user *ptr;
+
+	/* buf_size of zero means user doesn't want user space read */
+	if (!buf_size)
+		return -1;
+
+	/* If the syscall_buffer is NULL, tracing is being shutdown */
+	sbuf = READ_ONCE(syscall_buffer);
+	if (!sbuf)
+		return -1;
+
+	ptr = (char __user *)args[0];
+
+	*buffer = trace_user_fault_read(&sbuf->buf, ptr, 4, NULL, NULL);
+	if (!*buffer)
+		return -1;
+
+	/* Add room for the value */
+	*size += 4;
+
+	return 0;
+}
+
+static void syscall_put_futex(struct syscall_metadata *sys_data,
+			      struct syscall_trace_enter *entry,
+			      char *buffer)
+{
+	u32 *ptr;
+
+	/* Place the futex key into the storage */
+	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+
+	*ptr = *(u32 *)buffer;
+}
+
 static char *sys_fault_user(unsigned int buf_size,
 			    struct syscall_metadata *sys_data,
 			    struct syscall_user_buffer *sbuf,
@@ -905,6 +1033,9 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 		if (syscall_get_data(sys_data, args, &user_ptr,
 				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
 			return;
+	} else if (syscall_nr == __NR_futex) {
+		if (syscall_get_futex(args, &user_ptr, &size, tr->syscall_buf_sz) < 0)
+			return;
 	}
 
 	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
@@ -921,6 +1052,9 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	if (mayfault)
 		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);
 
+	else if (syscall_nr == __NR_futex)
+		syscall_put_futex(sys_data, entry, user_ptr);
+
 	trace_event_buffer_commit(&fbuffer);
 }
 
@@ -971,14 +1105,18 @@ static int reg_event_syscall_enter(struct trace_event_file *file,
 {
 	struct syscall_metadata *sys_data = call->data;
 	struct trace_array *tr = file->tr;
+	bool enable_faults;
 	int ret = 0;
 	int num;
 
 	num = sys_data->syscall_nr;
 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
 		return -ENOSYS;
+
+	enable_faults = sys_data->user_mask || num == __NR_futex;
+
 	guard(mutex)(&syscall_trace_lock);
-	if (sys_data->user_mask) {
+	if (enable_faults) {
 		ret = syscall_fault_buffer_enable();
 		if (ret < 0)
 			return ret;
@@ -986,7 +1124,7 @@ static int reg_event_syscall_enter(struct trace_event_file *file,
 	if (!tr->sys_refcount_enter) {
 		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
 		if (ret < 0) {
-			if (sys_data->user_mask)
+			if (enable_faults)
 				syscall_fault_buffer_disable();
 			return ret;
 		}
@@ -1011,7 +1149,7 @@ static void unreg_event_syscall_enter(struct trace_event_file *file,
 	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
 	if (!tr->sys_refcount_enter)
 		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
-	if (sys_data->user_mask)
+	if (sys_data->user_mask || num == __NR_futex)
 		syscall_fault_buffer_disable();
 }
 
-- 
2.51.0



^ permalink raw reply related

* [PATCH v3 0/3] tracing: Read user data from futex system call trace event
From: Steven Rostedt @ 2026-03-31 18:13 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Thomas Gleixner, Peter Zijlstra, Brian Geffon, John Stultz,
	Ian Rogers, Suleiman Souhlal

We are looking at the performance of futexes and require a bit more
information when tracing them.

The three patches here extend the system call reading of user space to
create specific handling of the futex system call. It now reads the
relevant user space data (the uaddr, utime and uaddr2), as well as
parses the flags. This adds a little smarts to the trace event as
it only shows the parameters that are relevant, as well as parses
utime as either a timespec or as val2 depending on the futex_op.

Here's an example of the new output:

 sys_futex(uaddr: 0x56196292e830 (0), FUTEX_WAKE|FUTEX_PRIVATE_FLAG)
 sys_futex(uaddr: 0x56196292e834 (0x4a7) tid: 1191, FUTEX_UNLOCK_PI|FUTEX_PRIVATE_FLAG)
 sys_futex(uaddr: 0x56196292e834 (0) tid: 0, FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG)
 sys_futex(uaddr: 0x56196292e830 (0), FUTEX_WAIT|FUTEX_PRIVATE_FLAG)
 sys_futex(uaddr: 0x56196292e838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, timespec: 0x7ffc1b91a9f0 (163.048528790), uaddr2: 0x56196292e834 (4aa), val3: 0)
 sys_futex(uaddr: 0x56196292e834 (0x4aa) tid: 1194, FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG)
 sys_futex(uaddr: 0x56196292e838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, timespec: 0x7ffc1b91a9f0 (163.048528790), uaddr2: 0x56196292e834 (800004aa), val3: 0)
 sys_futex(uaddr: 0x7f7ed6b29990 (0x4ab), FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME)
 sys_futex(uaddr: 0x56196292e834 (0x800004aa) tid: 1194 (WAITERS), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG)
 sys_futex(uaddr: 0x56196292e838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, timespec: 0x7ffc1b91a9f0 (163.048528790), uaddr2: 0x56196292e834 (800004aa), val3: 0)
 sys_futex(uaddr: 0x56196292e834 (0x800004aa) tid: 1194 (WAITERS), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG)

Changes since v2: https://lore.kernel.org/all/20260310200954.285663884@kernel.org/

- Removed unused "buf" variable (kernel test robot)

- Iterate __futex_cmds[] to make the print statement.
  Note this required exposing __futex_cmds[] to trace_syscalls.c
  (Masami Hiramatsu)

- Added back val statement (with the move to futex/syscalls.c the
  third parameter was dropped).

  git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
futex/core

Head SHA1: 79b0609ad15b24d0bbfb1790e17902a6c210ae69


Steven Rostedt (3):
      tracing: Have futex syscall trace event show specific user data
      tracing: Update futex syscall trace event to show more commands
      tracing: Show TID and flags for PI futex system call trace event

----
 include/linux/futex.h         |  39 ++++++-
 kernel/futex/syscalls.c       | 137 +++++++++++++++++++++---
 kernel/trace/trace_syscalls.c | 237 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 396 insertions(+), 17 deletions(-)

^ permalink raw reply

* [PATCH v3 3/3] tracing: Show TID and flags for PI futex system call trace event
From: Steven Rostedt @ 2026-03-31 18:13 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Thomas Gleixner, Peter Zijlstra, Brian Geffon, John Stultz,
	Ian Rogers, Suleiman Souhlal
In-Reply-To: <20260331181349.062575155@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

For the futex system call trace event for FUTEX_LOCK_PI and
FUTEX_UNLOCK_PI commands, show the TID from the value (which is usually in
hex) as well as translate the flags (DIED and WAITERS).

 pi_mutex_hammer-1098    [000] .....   121.876928: sys_futex(uaddr: 0x560f40cc8180 (0x450) tid: 1104, FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7f2f9d4b1e50 (0.000100000))
 pi_mutex_hammer-1128    [000] .....   121.877120: sys_futex(uaddr: 0x560f40cc8180 (0x8000042a) tid: 1066 (WAITERS), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7f2f8e493e50 (0.000100000))
 pi_mutex_hammer-1106    [000] .....   121.877242: sys_futex(uaddr: 0x560f40cc8180 (0x80000452) tid: 1106 (WAITERS), FUTEX_UNLOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)

This makes it easier to see the hand off of a mutex and who the owner was.

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/futex/syscalls.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 265ec0a236d0..8144781a9ff0 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -213,6 +213,9 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
 	unsigned int op, cmd;
 	bool done = false;
 
+	op = args[1];
+	cmd = op & FUTEX_CMD_MASK;
+
 	for (int i = 0; !done && i < nr_args; i++) {
 
 		if (seq_buf_has_overflowed(s))
@@ -227,11 +230,30 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
 					seq_buf_printf(s, " (%u)", val);
 				else
 					seq_buf_printf(s, " (0x%x)", val);
+
+				switch(cmd) {
+				case FUTEX_LOCK_PI:
+				case FUTEX_UNLOCK_PI:
+					seq_buf_printf(s, " tid: %d",
+						       val & FUTEX_TID_MASK);
+
+					if (!(val & (FUTEX_OWNER_DIED|FUTEX_WAITERS)))
+						break;
+
+					seq_buf_puts(s, " (");
+					if (val & FUTEX_WAITERS)
+						seq_buf_puts(s, "WAITERS");
+					if (val & FUTEX_OWNER_DIED) {
+						if (op & FUTEX_WAITERS)
+							seq_buf_putc(s, '|');
+						seq_buf_puts(s, "DIED");
+					}
+					seq_buf_putc(s, ')');
+					break;
+				}
 			}
 			continue;
 		case 1:
-			op = args[i];
-			cmd = op & FUTEX_CMD_MASK;
 			if (cmd <= FUTEX_LOCK_PI2)
 				seq_buf_printf(s, ", %s", __futex_cmds[cmd]);
 			else
-- 
2.51.0



^ permalink raw reply related

* [PATCH v3 2/3] tracing: Update futex syscall trace event to show more commands
From: Steven Rostedt @ 2026-03-31 18:13 UTC (permalink / raw)
  To: linux-kernel, linux-trace-kernel
  Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
	Thomas Gleixner, Peter Zijlstra, Brian Geffon, John Stultz,
	Ian Rogers, Suleiman Souhlal
In-Reply-To: <20260331181349.062575155@kernel.org>

From: Steven Rostedt <rostedt@goodmis.org>

Make the futex syscall trace event a little smarter. Have it read the
futex_op instruction to determine what else it can save and print. For the
appropriate options, it will read the utime (timespec) parameter and show
its output as well as the uaddr2.

 futex_requeue_p-1154    [004] .....   144.568339: sys_futex(uaddr: 0x5652b178d834 (0x482), FUTEX_UNLOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)
 futex_requeue_p-1162    [002] .....   144.568696: sys_futex(uaddr: 0x7f763b7fece0 (2), FUTEX_WAIT|FUTEX_PRIVATE_FLAG, val: 2)
 futex_requeue_p-1151    [000] .....   144.568700: sys_futex(uaddr: 0x7f763b7fece0 (0), FUTEX_WAKE|FUTEX_PRIVATE_FLAG, val: 1)
 futex_requeue_p-1162    [002] .....   144.568705: sys_futex(uaddr: 0x7f763b7fece0 (0), FUTEX_WAKE|FUTEX_PRIVATE_FLAG, val: 1)
 futex_requeue_p-1151    [000] .....   144.568715: sys_futex(uaddr: 0x7f764369e990 (0x483), FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, val: 1155)
 futex_requeue_p-1155    [005] .....   144.569420: sys_futex(uaddr: 0x5652b178d838 (0), FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG, val: 0, timespec: 0x7ffdacfba500 (143.890024054), uaddr2: 0x5652b178d834 (0), val3: 0)
 futex_requeue_p-1155    [005] .....   144.569454: sys_futex(uaddr: 0x5652b178d834 (0), FUTEX_LOCK_PI|FUTEX_PRIVATE_FLAG, val: 0)

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/futex.h         |  35 ++++++++-
 kernel/futex/syscalls.c       |  48 ++++++-------
 kernel/trace/trace_syscalls.c | 129 +++++++++++++++++++++++++++++-----
 3 files changed, 164 insertions(+), 48 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 7eaf01ff68cf..ec0d9cfa8a59 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -82,8 +82,35 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
 int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4);
 
+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
+{
+	switch (cmd) {
+	case FUTEX_WAIT:
+	case FUTEX_LOCK_PI:
+	case FUTEX_LOCK_PI2:
+	case FUTEX_WAIT_BITSET:
+	case FUTEX_WAIT_REQUEUE_PI:
+		return true;
+	}
+	return false;
+}
+
+static __always_inline bool futex_cmd_has_addr2(u32 cmd)
+{
+	switch (cmd) {
+	case FUTEX_REQUEUE:
+	case FUTEX_CMP_REQUEUE:
+	case FUTEX_WAKE_OP:
+	case FUTEX_WAIT_REQUEUE_PI:
+		return true;
+	}
+	return false;
+}
+
 #ifdef CONFIG_FTRACE_SYSCALLS
-void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u32 *ptr);
+void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
+			 u32 *ptr1, u32 *ptr2, unsigned long *ts1,
+			 unsigned long *ts2);
 extern const char * __futex_cmds[];
 #endif
 
@@ -120,7 +147,11 @@ static inline int futex_hash_allocate_default(void)
 static inline int futex_hash_free(struct mm_struct *mm) { return 0; }
 static inline int futex_mm_init(struct mm_struct *mm) { return 0; }
 static inline void futex_print_syscall(struct seq_buf *s, int nr_args,
-				       unsigned long *args, u32 *ptr) { }
+				       unsigned long *args, u32 *ptr1,
+				       u32 *ptr2, unsigned long *ts1,
+				       unsigned long *ts2) { }
+static __always_inline bool futex_cmd_has_timeout(u32 cmd) { return false; }
+static __always_inline bool futex_cmd_has_addr2(u32 cmd) { return false; }
 #endif
 
 #endif
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 3ba8ca017c9c..265ec0a236d0 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -158,31 +158,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	return -ENOSYS;
 }
 
-static __always_inline bool futex_cmd_has_timeout(u32 cmd)
-{
-	switch (cmd) {
-	case FUTEX_WAIT:
-	case FUTEX_LOCK_PI:
-	case FUTEX_LOCK_PI2:
-	case FUTEX_WAIT_BITSET:
-	case FUTEX_WAIT_REQUEUE_PI:
-		return true;
-	}
-	return false;
-}
-
-static __always_inline bool futex_cmd_has_addr2(u32 cmd)
-{
-	switch (cmd) {
-	case FUTEX_REQUEUE:
-	case FUTEX_CMP_REQUEUE:
-	case FUTEX_WAKE_OP:
-	case FUTEX_WAIT_REQUEUE_PI:
-		return true;
-	}
-	return false;
-}
-
 static __always_inline int
 futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
 {
@@ -231,7 +206,9 @@ const char * __futex_cmds[] =
 	"FUTEX_LOCK_PI2", NULL
 };
 
-void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u32 *ptr)
+void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args,
+			 u32 *ptr1, u32 *ptr2, unsigned long *ts1,
+			 unsigned long *ts2)
 {
 	unsigned int op, cmd;
 	bool done = false;
@@ -244,8 +221,8 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u3
 		switch (i) {
 		case 0:
 			seq_buf_printf(s, "uaddr: 0x%lx", args[i]);
-			if (ptr) {
-				u32 val = *ptr;
+			if (ptr1) {
+				u32 val = *ptr1;
 				if (val < 10)
 					seq_buf_printf(s, " (%u)", val);
 				else
@@ -287,6 +264,15 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u3
 				continue;
 
 			seq_buf_printf(s, ", timespec: 0x%lx", args[i]);
+
+			if (!ts1 || !ts2)
+				continue;
+
+			if (!*ts1 && !*ts2) {
+				seq_buf_puts(s, " (0)");
+				continue;
+			}
+			seq_buf_printf(s, " (%lu.%09lu)", *ts1, *ts2);
 			continue;
 		case 4:
 			if (!futex_cmd_has_addr2(cmd)) {
@@ -294,6 +280,12 @@ void futex_print_syscall(struct seq_buf *s, int nr_args, unsigned long *args, u3
 				continue;
 			}
 			seq_buf_printf(s, ", uaddr2: 0x%lx", args[i]);
+
+			if (!ptr2)
+				continue;
+
+			seq_buf_printf(s, " (%x)", *ptr2);
+
 			continue;
 		case 5:
 			seq_buf_printf(s, ", val3: %lu", args[i]);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f8213d772f89..0c86986ec7c4 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -239,21 +239,35 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat
 	return trace_handle_return(s);
 }
 
+struct futex_data {
+	u32		val1;
+	u32		val2;
+	unsigned long	ts1;
+	unsigned long	ts2;
+};
+
 static enum print_line_t
 sys_enter_futex_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
 		      struct trace_seq *s, struct trace_event *event, int ent_size)
 {
+	struct futex_data *data;
 	void *end = (void *)trace + ent_size;
-	void *ptr;
+	unsigned long *ts1 = NULL, *ts2 = NULL;
+	u32 *ptr1 = NULL, *ptr2 = NULL;
 
 	/* Set ptr to the user space copied area */
-	ptr = (void *)trace->args + sizeof(unsigned long) * entry->nb_args;
-	if (ptr + 4 > end)
-		ptr = NULL;
+	data = (void *)trace->args + sizeof(unsigned long) * entry->nb_args;
+	if ((void *)data + sizeof(*data) <= end) {
+		ptr1 = &data->val1;
+		ptr2 = &data->val2;
+		ts1 = &data->ts1;
+		ts2 = &data->ts2;
+	}
 
 	trace_seq_printf(s, "%s(", entry->name);
 
-	futex_print_syscall(&s->seq, entry->nb_args, trace->args, ptr);
+	futex_print_syscall(&s->seq, entry->nb_args, trace->args, ptr1, ptr2,
+			    ts1, ts2);
 
 	trace_seq_puts(s, ")\n");
 
@@ -472,9 +486,9 @@ sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
 			"\"uaddr: 0x%%lx (0x%%lx) cmd=%%s%%s%%s");
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
-			"  val: 0x%%x timeout/val2: 0x%%llx");
+			"  val: 0x%%x timeout/val2: 0x%%llx (%%lu.%%lu)");
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
-			" uaddr2: 0x%%lx val3: 0x%%x\", ");
+			" uaddr2: 0x%%lx (0x%%lx) val3: 0x%%x\", ");
 
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
 			" REC->uaddr,");
@@ -499,10 +513,12 @@ sys_enter_futex_print_fmt(struct syscall_metadata *entry, char *buf, int len)
 			FUTEX_CLOCK_REALTIME);
 
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
-			" REC->val, REC->utime,");
+			" REC->val, REC->utime, REC->__ts1, REC->__ts2,");
 
 	pos += snprintf(buf + pos, LEN_OR_ZERO,
-			" REC->uaddr, REC->val3");
+			" REC->uaddr,");
+	pos += snprintf(buf + pos, LEN_OR_ZERO,
+			" REC->__value2, REC->val3");
 	return pos;
 }
 
@@ -605,7 +621,39 @@ static int __init futex_fields(struct trace_event_call *call, int offset)
 	ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0,
 				 FILTER_OTHER);
 	if (ret)
-		kfree(arg);
+		goto free;
+	offset += sizeof(int);
+
+	arg = kstrdup("__value2", GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg))
+		return -ENOMEM;
+	ret = trace_define_field(call, "u32", arg, offset, sizeof(int), 0,
+				 FILTER_OTHER);
+	if (ret)
+		goto free;
+	offset += sizeof(int);
+
+	arg = kstrdup("__ts1", GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg))
+		return -ENOMEM;
+	ret = trace_define_field(call, "unsigned long", arg, offset,
+				 sizeof(unsigned long), 0, FILTER_OTHER);
+	if (ret)
+		goto free;
+	offset += sizeof(long);
+
+	arg = kstrdup("__ts2", GFP_KERNEL);
+	if (WARN_ON_ONCE(!arg))
+		return -ENOMEM;
+	ret = trace_define_field(call, "unsigned long", arg, offset,
+				 sizeof(unsigned long), 0, FILTER_OTHER);
+	if (ret)
+		goto free;
+
+	return 0;
+
+free:
+	kfree(arg);
 	return ret;
 }
 
@@ -778,11 +826,51 @@ static int syscall_copy_user_array(char *buf, const char __user *ptr,
 	return 0;
 }
 
+struct tp_futex_data {
+	u32			cmd;
+	const u32		__user *val1;
+	const u32 		__user *val2;
+	void			__user *timeout;
+};
+
+static int syscall_copy_futex(char *buf, const char __user *ptr,
+			      size_t size, void *data)
+{
+	struct tp_futex_data *tp_data = data;
+	struct futex_data *fdata = (void *)buf;
+	int cmd = tp_data->cmd & FUTEX_CMD_MASK;
+	int ret;
+
+	memset(fdata, 0, sizeof(*fdata));
+
+	if (tp_data->val1) {
+		ret = __copy_from_user(&fdata->val1, tp_data->val1, 4);
+		if (ret)
+			return -1;
+	}
+
+	if (tp_data->val2 && futex_cmd_has_addr2(cmd)) {
+		ret = __copy_from_user(&fdata->val2, tp_data->val2, 4);
+		if (ret)
+			return -1;
+	}
+
+	if (tp_data->timeout && futex_cmd_has_timeout(cmd)) {
+		/* Copies both ts1 and ts2 */
+		ret = __copy_from_user(&fdata->ts1, tp_data->timeout,
+				       sizeof(long) * 2);
+		if (ret)
+			return -1;
+	}
+
+	return 0;
+}
+
 static int
 syscall_get_futex(unsigned long *args, char **buffer, int *size, int buf_size)
 {
 	struct syscall_user_buffer *sbuf;
-	const char __user *ptr;
+	struct tp_futex_data tp_data;
 
 	/* buf_size of zero means user doesn't want user space read */
 	if (!buf_size)
@@ -793,14 +881,18 @@ syscall_get_futex(unsigned long *args, char **buffer, int *size, int buf_size)
 	if (!sbuf)
 		return -1;
 
-	ptr = (char __user *)args[0];
+	tp_data.cmd = args[1];
+	tp_data.val1 = (u32 __user *)args[0];
+	tp_data.val2 = (u32 __user *)args[4];
+	tp_data.timeout = (u64 __user *)args[3];
 
-	*buffer = trace_user_fault_read(&sbuf->buf, ptr, 4, NULL, NULL);
+	*buffer = trace_user_fault_read(&sbuf->buf, NULL, 0,
+					syscall_copy_futex, &tp_data);
 	if (!*buffer)
 		return -1;
 
-	/* Add room for the value */
-	*size += 4;
+	/* Add room for values */
+	*size += sizeof(struct futex_data);
 
 	return 0;
 }
@@ -809,12 +901,13 @@ static void syscall_put_futex(struct syscall_metadata *sys_data,
 			      struct syscall_trace_enter *entry,
 			      char *buffer)
 {
-	u32 *ptr;
+	struct futex_data *fdata = (void *)buffer;
+	struct futex_data *data;
 
 	/* Place the futex key into the storage */
-	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
+	data = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;
 
-	*ptr = *(u32 *)buffer;
+	*data = *fdata;
 }
 
 static char *sys_fault_user(unsigned int buf_size,
-- 
2.51.0



^ permalink raw reply related

* Re: [PATCH 1/2] selftests/tracing: Fix to make --logdir option work again
From: Shuah Khan @ 2026-03-31 19:07 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Steven Rostedt, Shuah Khan, Gabriele Monaco, Mathieu Desnoyers,
	linux-kernel, linux-trace-kernel, linux-kselftest, Shuah Khan
In-Reply-To: <20260304110425.3858532b6ba092d84a31595a@kernel.org>

On 3/3/26 19:04, Masami Hiramatsu (Google) wrote:
> Hi Shuah,
> 
> Could you pick these 2 patches to the selftests tree or should I pick it?
> 
> Thanks,
> 
> On Tue, 10 Feb 2026 18:54:12 +0900
> "Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:
> 
>> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
>>
>
Done. Sorry for the delay on this. I applied it to my next to send
it for Linux 7.1

thanks,
-- Shuah

^ permalink raw reply

* [PATCH] tracing: Allow backup to save persistent ring buffer before it starts
From: Steven Rostedt @ 2026-03-31 20:39 UTC (permalink / raw)
  To: LKML, Linux Trace Kernel; +Cc: Masami Hiramatsu, Mathieu Desnoyers, John Stultz

From: Steven Rostedt <rostedt@goodmis.org>

When the persistent ring buffer was first introduced, it did not make
sense to start tracing for it on the kernel command line. That's because
if there was a crash, the start of events would invalidate the events from
the previous boot that had the crash.

But now that there's a "backup" instance that can take a snapshot of the
persistent ring buffer when boot starts, it is possible to have the
persistent ring buffer start events at boot up and not lose the old events.

Update the code so that the boot events start after all boot time instances
are created. This will allow the backup instance to copy the persistent
ring buffer from the previous boot, and allow the persistent ring buffer
to start tracing new events for the current boot.

  reserve_mem=100M:12M:trace trace_instance=boot_mapped^@trace,sched trace_instance=backup=boot_mapped

The above will create a boot_mapped persistent ring buffer and enable the
scheduler events. If there's a crash, a "backup" instance will be created
holding the events of the persistent ring buffer from the previous boot,
while the persistent ring buffer will once again start tracing scheduler
events of the current boot.

Now the user doesn't have to remember to start the persistent ring buffer.
It will always have the events started at each boot.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 22 ++++++++++++++++++++++
 kernel/trace/trace.h |  5 ++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ef55b48064da..5b46083f6f94 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -10871,9 +10871,31 @@ __init static void enable_instances(void)
 			tr->range_name = no_free_ptr(rname);
 		}
 
+		/*
+		 * Save the events to start and enabled them after all boot instances
+		 * have been created.
+		 */
+		tr->boot_events = curr_str;
+	}
+
+	/* Enable the events after all boot instances have been created */
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+
+		if (!tr->boot_events || !(*tr->boot_events)) {
+			tr->boot_events = NULL;
+			continue;
+		}
+
+		curr_str = tr->boot_events;
+
+		/* Clear the instance if this is a persistent buffer */
+		if (tr->flags & TRACE_ARRAY_FL_LAST_BOOT)
+			update_last_data(tr);
+
 		while ((tok = strsep(&curr_str, ","))) {
 			early_enable_events(tr, tok, true);
 		}
+		tr->boot_events = NULL;
 	}
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b001fbba0881..e68f9c2027eb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -405,7 +405,10 @@ struct trace_array {
 	unsigned char		trace_flags_index[TRACE_FLAGS_MAX_SIZE];
 	unsigned int		flags;
 	raw_spinlock_t		start_lock;
-	const char		*system_names;
+	union {
+		const char	*system_names;
+		char		*boot_events;
+	};
 	struct list_head	err_log;
 	struct dentry		*dir;
 	struct dentry		*options;
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH v9 2/3] tracing: Remove the backup instance automatically after read
From: Steven Rostedt @ 2026-03-31 21:19 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel
In-Reply-To: <177497475349.569199.11513916633426967730.stgit@mhiramat.tok.corp.google.com>

On Wed,  1 Apr 2026 01:32:33 +0900
"Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:

> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 8cec7bd70438..1d73400a01c7 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -539,8 +539,65 @@ void trace_set_ring_buffer_expanded(struct trace_array *tr)
>  	tr->ring_buffer_expanded = true;
>  }
>  
> +static int __remove_instance(struct trace_array *tr);
> +
> +static void trace_array_autoremove(struct work_struct *work)
> +{
> +	struct trace_array *tr = container_of(work, struct trace_array, autoremove_work);
> +
> +	guard(mutex)(&event_mutex);
> +	guard(mutex)(&trace_types_lock);

Hmm, should we do a check if the tr still exists? Couldn't the user delete
this via a rmdir after the last file closed and this was kicked?

  CPU 0							CPU 1
  -----							-----
  open(trace_pipe);
  read(..);
  close(trace_pipe);
     kick the work queue to delete it....
						rmdir();
							[instance deleted]

  __remove_instance();

   [ now the tr is freed, and the remove will crash!]


What would prevent this is to use trace_array_destroy(), which checks
this and also adds the proper locking:

static void trace_array_autoremove(struct work_struct *work)
{
	struct trace_array *tr = container_of(work, struct trace_array, autoremove_work);

	trace_array_destroy(tr);
}


> +
> +	/*
> +	 * This can be fail if someone gets @tr before starting this
> +	 * function, but in that case, this will be kicked again when
> +	 * putting it. So we don't care about the result.
> +	 */
> +	__remove_instance(tr);
> +}
> +
> +static struct workqueue_struct *autoremove_wq;
> +
> +static void trace_array_kick_autoremove(struct trace_array *tr)
> +{
> +	if (autoremove_wq && !work_pending(&tr->autoremove_work))
> +		queue_work(autoremove_wq, &tr->autoremove_work);

Doesn't queue_work() check if it's pending? Do we really need to check it
twice?

> +}
> +
> +static void trace_array_cancel_autoremove(struct trace_array *tr)
> +{
> +	if (autoremove_wq && work_pending(&tr->autoremove_work))
> +		cancel_work(&tr->autoremove_work);

Same here, as can't this be racy?

> +}
> +
> +static void trace_array_init_autoremove(struct trace_array *tr)
> +{
> +	INIT_WORK(&tr->autoremove_work, trace_array_autoremove);
> +}
> +
> +static void trace_array_start_autoremove(void)
> +{
> +	if (autoremove_wq)
> +		return;
> +
> +	autoremove_wq = alloc_workqueue("tr_autoremove_wq",
> +					WQ_UNBOUND | WQ_HIGHPRI, 0);
> +	if (!autoremove_wq)
> +		pr_warn("Unable to allocate tr_autoremove_wq. autoremove disabled.\n");
> +}
> +
>  LIST_HEAD(ftrace_trace_arrays);

-- Steve

^ permalink raw reply

* Re: [PATCH v9 3/3] tracing/Documentation: Add a section about backup instance
From: Steven Rostedt @ 2026-03-31 21:21 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel
In-Reply-To: <177497476117.569199.18085846838539980210.stgit@mhiramat.tok.corp.google.com>

On Wed,  1 Apr 2026 01:32:41 +0900
"Masami Hiramatsu (Google)" <mhiramat@kernel.org> wrote:

> From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> 
> Add a section about backup instance to the debugging.rst.
> 
> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
> ---
>   Changes in v6:
>    - Fix typos.
> ---
>  Documentation/trace/debugging.rst |   19 +++++++++++++++++++
>  1 file changed, 19 insertions(+)
> 
> diff --git a/Documentation/trace/debugging.rst b/Documentation/trace/debugging.rst
> index 4d88c346fc38..15857951b506 100644
> --- a/Documentation/trace/debugging.rst
> +++ b/Documentation/trace/debugging.rst
> @@ -159,3 +159,22 @@ If setting it from the kernel command line, it is recommended to also
>  disable tracing with the "traceoff" flag, and enable tracing after boot up.
>  Otherwise the trace from the most recent boot will be mixed with the trace
>  from the previous boot, and may make it confusing to read.
> +
> +Using a backup instance for keeping previous boot data
> +------------------------------------------------------
> +
> +It is also possible to record trace data at system boot time by specifying
> +events with the persistent ring buffer, but in this case the data before the
> +reboot will be lost before it can be read. This problem can be solved by a
> +backup instance. From the kernel command line::
> +
> +  reserve_mem=12M:4096:trace trace_instance=boot_map@trace,sched,irq trace_instance=backup=boot_map

The above didn't actually work well without my patch to enable events on
the persistent ring buffer with the backup. But keep it, as it will work
with my patch ;-)

-- Steve


> +
> +On boot up, the previous data in the "boot_map" is copied to the "backup"
> +instance, and the "sched:*" and "irq:*" events for the current boot are traced
> +in the "boot_map". Thus the user can read the previous boot data from the "backup"
> +instance without stopping the trace.
> +
> +Note that this "backup" instance is readonly, and will be removed automatically
> +if you clear the trace data or read out all trace data from the "trace_pipe"
> +or the "trace_pipe_raw" files.
> \ No newline at end of file


^ permalink raw reply

* Re: [PATCH RFC v4 25/44] KVM: selftests: Test basic single-page conversion flow
From: Ackerley Tng @ 2026-03-31 22:33 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jroedel, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Jason Gunthorpe,
	Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm
In-Reply-To: <20260326-gmem-inplace-conversion-v4-25-e202fe950ffd@google.com>

Ackerley Tng <ackerleytng@google.com> writes:

> Add a selftest for the guest_memfd memory attribute conversion ioctls.
> The test starts the guest_memfd as all-private (the default state), and
> verifies the basic flow of converting a single page to shared and then back
> to private.
>
> Add infrastructure that supports extensions to other conversion flow
> tests. This infrastructure will be used in upcoming patches for other
> conversion tests.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  tools/testing/selftests/kvm/Makefile.kvm           |   1 +
>  .../selftests/kvm/guest_memfd_conversions_test.c   | 205 +++++++++++++++++++++
>  2 files changed, 206 insertions(+)
>
> diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
> index dc68371f76a33..0e2a9adfca57e 100644
> --- a/tools/testing/selftests/kvm/Makefile.kvm
> +++ b/tools/testing/selftests/kvm/Makefile.kvm
> @@ -147,6 +147,7 @@ TEST_GEN_PROGS_x86 += access_tracking_perf_test
>  TEST_GEN_PROGS_x86 += coalesced_io_test
>  TEST_GEN_PROGS_x86 += dirty_log_perf_test
>  TEST_GEN_PROGS_x86 += guest_memfd_test
> +TEST_GEN_PROGS_x86 += guest_memfd_conversions_test
>  TEST_GEN_PROGS_x86 += hardware_disable_test
>  TEST_GEN_PROGS_x86 += memslot_modification_stress_test
>  TEST_GEN_PROGS_x86 += memslot_perf_test
> diff --git a/tools/testing/selftests/kvm/guest_memfd_conversions_test.c b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
> new file mode 100644
> index 0000000000000..841b2824ae996
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/guest_memfd_conversions_test.c
> @@ -0,0 +1,205 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2024, Google LLC.
> + */
> +#include <sys/mman.h>
> +#include <unistd.h>
> +
> +#include <linux/align.h>
> +#include <linux/kvm.h>
> +#include <linux/sizes.h>
> +
> +#include "kvm_util.h"
> +#include "kselftest_harness.h"
> +#include "test_util.h"
> +#include "ucall_common.h"
> +
> +FIXTURE(gmem_conversions) {
> +	struct kvm_vcpu *vcpu;
> +	int gmem_fd;
> +	/* HVA of the first byte of the memory mmap()-ed from gmem_fd. */
> +	char *mem;
> +};
> +
> +typedef FIXTURE_DATA(gmem_conversions) test_data_t;
> +
> +FIXTURE_SETUP(gmem_conversions) { }
> +
> +static uint64_t page_size;
> +
> +static void guest_do_rmw(void);
> +#define GUEST_MEMFD_SHARING_TEST_GVA 0x90000000ULL
> +
> +/*
> + * Defer setup until the individual test is invoked so that tests can specify
> + * the number of pages and flags for the guest_memfd instance.
> + */
> +static void gmem_conversions_do_setup(test_data_t *t, int nr_pages,
> +				      int gmem_flags)
> +{
> +	const struct vm_shape shape = {
> +		.mode = VM_MODE_DEFAULT,
> +		.type = KVM_X86_SW_PROTECTED_VM,
> +	};
> +	/*
> +	 * Use high GPA above APIC_DEFAULT_PHYS_BASE to avoid clashing with
> +	 * APIC_DEFAULT_PHYS_BASE.
> +	 */
> +	const uint64_t gpa = SZ_4G;
> +	const uint32_t slot = 1;
> +	u64 supported_flags;
> +	struct kvm_vm *vm;
> +
> +	vm = __vm_create_shape_with_one_vcpu(shape, &t->vcpu, nr_pages, guest_do_rmw);
> +
> +	supported_flags = vm_check_cap(vm, KVM_CAP_MEMORY_ATTRIBUTES2_FLAGS);
> +	TEST_REQUIRE(supported_flags & KVM_SET_MEMORY_ATTRIBUTES2_PRESERVE);
> +
> +	vm_mem_add(vm, VM_MEM_SRC_SHMEM, gpa, slot, nr_pages,
> +		   KVM_MEM_GUEST_MEMFD, -1, 0, gmem_flags);
> +
> +	t->gmem_fd = kvm_slot_to_fd(vm, slot);
> +	t->mem = addr_gpa2hva(vm, gpa);
> +	virt_map(vm, GUEST_MEMFD_SHARING_TEST_GVA, gpa, nr_pages);
> +}
> +
> +static void gmem_conversions_do_teardown(test_data_t *t)
> +{
> +	/* No need to close gmem_fd, it's owned by the VM structure. */
> +	kvm_vm_free(t->vcpu->vm);
> +}
> +
> +FIXTURE_TEARDOWN(gmem_conversions)
> +{
> +	gmem_conversions_do_teardown(self);
> +}
> +
> +/*
> + * In these test definition macros, __nr_pages and nr_pages is used to set up
> + * the total number of pages in the guest_memfd under test. This will be
> + * available in the test definitions as nr_pages.
> + */
> +
> +#define __GMEM_CONVERSION_TEST(test, __nr_pages, flags)				\
> +static void __gmem_conversions_##test(test_data_t *t, int nr_pages);		\
> +										\
> +TEST_F(gmem_conversions, test)							\
> +{										\
> +	gmem_conversions_do_setup(self, __nr_pages, flags);			\
> +	__gmem_conversions_##test(self, __nr_pages);				\
> +}										\
> +static void __gmem_conversions_##test(test_data_t *t, int nr_pages)		\
> +
> +#define GMEM_CONVERSION_TEST(test, __nr_pages, flags)				\
> +	__GMEM_CONVERSION_TEST(test, __nr_pages, (flags) | GUEST_MEMFD_FLAG_MMAP)
> +
> +#define __GMEM_CONVERSION_TEST_INIT_PRIVATE(test, __nr_pages)			\
> +	GMEM_CONVERSION_TEST(test, __nr_pages, 0)
> +
> +#define GMEM_CONVERSION_TEST_INIT_PRIVATE(test)					\
> +	__GMEM_CONVERSION_TEST_INIT_PRIVATE(test, 1)
> +
> +struct guest_check_data {
> +	void *mem;
> +	char expected_val;
> +	char write_val;
> +};
> +static struct guest_check_data guest_data;
> +
> +static void guest_do_rmw(void)
> +{
> +	for (;;) {
> +		char *mem = READ_ONCE(guest_data.mem);
> +
> +		GUEST_ASSERT_EQ(READ_ONCE(*mem), READ_ONCE(guest_data.expected_val));
> +		WRITE_ONCE(*mem, READ_ONCE(guest_data.write_val));
> +
> +		GUEST_SYNC(0);
> +	}
> +}
> +
> +static void run_guest_do_rmw(struct kvm_vcpu *vcpu, loff_t pgoff,
> +			     char expected_val, char write_val)
> +{
> +	struct ucall uc;
> +	int r;
> +
> +	guest_data.mem = (void *)GUEST_MEMFD_SHARING_TEST_GVA + pgoff * page_size;
> +	guest_data.expected_val = expected_val;
> +	guest_data.write_val = write_val;
> +	sync_global_to_guest(vcpu->vm, guest_data);
> +
> +	do {
> +		r = __vcpu_run(vcpu);
> +	} while (r == -1 && errno == EINTR);
> +
> +	TEST_ASSERT_EQ(r, 0);

TEST_ASSERT_EQ() ends up calling exit() on failures, which skips
FIXTURE_TEARDOWN().

Other than the explicit assertions not working with the
kselftest_harness, kvm selftest library functions like vm_mem_add() also
call TEST_ASSERT, which doesn't play nice with kselftest_harness.

Any suggestions for this? Should we use the kselftest framework with
these tests?

(I ran into this issue while trying to test something else, where I
needed FIXTURE_TEARDOWN() to clean up system state.)

Or is it "okay" in this case since FIXTURE_TEARDOWN() only cleans up
state that would be released anyway when the program exits?

>
> [...snip...]
>

^ permalink raw reply

* Re: [PATCH RFC v4 08/44] KVM: Introduce KVM_SET_MEMORY_ATTRIBUTES2
From: Michael Roth @ 2026-03-31 22:53 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jroedel, jthoughton, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Jason Gunthorpe, Vlastimil Babka, kvm,
	linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm
In-Reply-To: <20260326-gmem-inplace-conversion-v4-8-e202fe950ffd@google.com>

On Thu, Mar 26, 2026 at 03:24:17PM -0700, Ackerley Tng wrote:
> Introduce a "version 2" of KVM_SET_MEMORY_ATTRIBUTES to support returning
> information back to userspace.

Hi Ackerley,

Not trying to bikeshed below, but I'm working on getting related QEMU
patches cleaned up to post soon and was working through some of the new
uAPI bits, and plumbing some of these capabilities in seems a little
awkward in a couple places so wondering if we should revisit how some of
this API is defined...

> 
> This new ioctl and structure will, in a later patch, be shared as a
> guest_memfd ioctl, where the padding in the new kvm_memory_attributes2
> structure will be for writing the response from the guest_memfd ioctl to
> userspace.
> 
> A new ioctl is necessary for these reasons:
> 
> 1. KVM_SET_MEMORY_ATTRIBUTES is currently a write-only ioctl and does not
>    allow userspace to read fields. There's nothing in code (yet?) that
>    validates this, but using _IOWR for consistency would be prudent.
> 
> 2. KVM_SET_MEMORY_ATTRIBUTES, when used as a guest_memfd ioctl, will need
>    an additional field to provide userspace with more error details.
> 
> Alternatively, a completely new ioctl could be defined, unrelated to
> KVM_SET_MEMORY_ATTRIBUTES, but using the same ioctl number and struct for
> the vm and guest_memfd ioctls streamlines the interface for userspace. In
> addition, any memory attributes, implemented on the vm or guest_memfd
> ioctl, can be easily shared with the other.
> 
> Add KVM_CAP_MEMORY_ATTRIBUTES2 to indicate that struct
> kvm_memory_attributes2 exists and can be used either with
> KVM_SET_MEMORY_ATTRIBUTES2 via the vm or guest_memfd ioctl.

The guest_memfd support for the KVM_SET_MEMORY_ATTRIBUTES2 ioctl isn't
added until patch #10, and to scan for it you sort of need to infer it
via KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES reporting non-zero (i.e.
KVM_MEMORY_ATTRIBUTE_PRIVATE), so it's confusing to state that
KVM_CAP_MEMORY_ATTRIBUTES2 means you can use the struct via a guest_memfd
ioctl.

I think the above is trying to simply say that the corresponding struct
exists, and remain agnostic about how it can be used. But if that were
the case, there would be no way to know when KVM_SET_MEMORY_ATTRIBUTES2 is
available in the first place, so in the case of KVM ioctls at least,
KVM_CAP_MEMORY_ATTRIBUTES2 is advertising both the struct and the ioctl,
whereas for guest_memfd it's only advertising the struct and not saying
anything about whether a similar gmem ioctl is available to use it.

Instead, maybe they should both have the same semantics:

  KVM_CAP_MEMORY_ATTRIBUTES2: *SET_ATTRIBUTES* ioctl exists for KVM that utilizes
    struct kvm_memory_attributes2

  KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES: *SET_ATTRIBUTES* ioctl exists for
    guest_memfd that utilizes struct kvm_memory_attributes2

In which case you would leave out any mention of guest_memfd here as far as
the documentation goes, and then in patch #10 you could modify it to be
something like:

   4.145 KVM_SET_MEMORY_ATTRIBUTES2
   ---------------------------------

  -:Capability: KVM_CAP_MEMORY_ATTRIBUTES2
  +:Capability: KVM_CAP_MEMORY_ATTRIBUTES2, KVM_GUEST_MEMFD_CAP_MEMORY_ATTRIBUTES
  -:Architectures: x86
  +:Architectures: all
  -:Type: vm ioctl
  +:Type: vm, guest_memfd ioctl
   :Parameters: struct kvm_memory_attributes2 (in/out)
   :Returns: 0 on success, <0 on error

and *then* add in your mentions of how the usage/fields differ for
guest_memfd/KVM_GUEST_MEMFD_CAP_MEMORY_ATTRIBUTES case vs. KVM ioctls.

This avoids needing to issue 2 checks for the guest_memfd variant vs. 1
for KVM, but more importantly avoids subtle differences in how these
similarly-named capabilities are used/documented that might cause
unnecessary confusion.

Thanks,

Mike

> 
> Handle KVM_CAP_MEMORY_ATTRIBUTES2 and return the same supported attributes
> as would be returned for KVM_CAP_MEMORY_ATTRIBUTES - the supported
> attributes are the same for now, regardless of the CAP requested.
> 
> Suggested-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> ---
>  Documentation/virt/kvm/api.rst | 32 ++++++++++++++++++++++++++++++++
>  include/uapi/linux/kvm.h       | 12 ++++++++++++
>  virt/kvm/kvm_main.c            | 40 +++++++++++++++++++++++++++++++++++++---
>  3 files changed, 81 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 032516783e962..0b61e2579e1d8 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6359,6 +6359,8 @@ S390:
>  Returns -EINVAL if the VM has the KVM_VM_S390_UCONTROL flag set.
>  Returns -EINVAL if called on a protected VM.
>  
> +.. _KVM_SET_MEMORY_ATTRIBUTES:
> +
>  4.141 KVM_SET_MEMORY_ATTRIBUTES
>  -------------------------------
>  
> @@ -6551,6 +6553,36 @@ KVM_S390_KEYOP_SSKE
>    Sets the storage key for the guest address ``guest_addr`` to the key
>    specified in ``key``, returning the previous value in ``key``.
>  
> +4.145 KVM_SET_MEMORY_ATTRIBUTES2
> +---------------------------------
> +
> +:Capability: KVM_CAP_MEMORY_ATTRIBUTES2
> +:Architectures: x86
> +:Type: vm ioctl
> +:Parameters: struct kvm_memory_attributes2 (in/out)
> +:Returns: 0 on success, <0 on error
> +
> +KVM_SET_MEMORY_ATTRIBUTES2 is an extension to
> +KVM_SET_MEMORY_ATTRIBUTES that supports returning (writing) values to
> +userspace.  The original (pre-extension) fields are shared with
> +KVM_SET_MEMORY_ATTRIBUTES identically.
> +
> +Attribute values are shared with KVM_SET_MEMORY_ATTRIBUTES.
> +
> +::
> +
> +  struct kvm_memory_attributes2 {
> +	__u64 address;
> +	__u64 size;
> +	__u64 attributes;
> +	__u64 flags;
> +	__u64 reserved[12];
> +  };
> +
> +  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
> +
> +See also: :ref: `KVM_SET_MEMORY_ATTRIBUTES`.
> +
>  .. _kvm_run:
>  
>  5. The kvm_run structure
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 80364d4dbebb0..16567d4a769e5 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -989,6 +989,7 @@ struct kvm_enable_cap {
>  #define KVM_CAP_ARM_SEA_TO_USER 245
>  #define KVM_CAP_S390_USER_OPEREXEC 246
>  #define KVM_CAP_S390_KEYOP 247
> +#define KVM_CAP_MEMORY_ATTRIBUTES2 248
>  
>  struct kvm_irq_routing_irqchip {
>  	__u32 irqchip;
> @@ -1637,6 +1638,17 @@ struct kvm_memory_attributes {
>  	__u64 flags;
>  };
>  
> +/* Available with KVM_CAP_MEMORY_ATTRIBUTES2 */
> +#define KVM_SET_MEMORY_ATTRIBUTES2              _IOWR(KVMIO,  0xd2, struct kvm_memory_attributes2)
> +
> +struct kvm_memory_attributes2 {
> +	__u64 address;
> +	__u64 size;
> +	__u64 attributes;
> +	__u64 flags;
> +	__u64 reserved[12];
> +};
> +
>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>  
>  #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 70b594dafc5cc..3c261904322f0 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2621,9 +2621,10 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
>  	return r;
>  }
>  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> -					   struct kvm_memory_attributes *attrs)
> +					   struct kvm_memory_attributes2 *attrs)
>  {
>  	gfn_t start, end;
> +	int i;
>  
>  	/* flags is currently not used. */
>  	if (attrs->flags)
> @@ -2634,6 +2635,10 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
>  		return -EINVAL;
>  	if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
>  		return -EINVAL;
> +	for (i = 0; i < ARRAY_SIZE(attrs->reserved); i++) {
> +		if (attrs->reserved[i])
> +			return -EINVAL;
> +	}
>  
>  	start = attrs->address >> PAGE_SHIFT;
>  	end = (attrs->address + attrs->size) >> PAGE_SHIFT;
> @@ -4966,6 +4971,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
>  	case KVM_CAP_DEVICE_CTRL:
>  		return 1;
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +	case KVM_CAP_MEMORY_ATTRIBUTES2:
>  	case KVM_CAP_MEMORY_ATTRIBUTES:
>  		if (!vm_memory_attributes)
>  			return 0;
> @@ -5191,6 +5197,14 @@ do {										\
>  		     sizeof_field(struct kvm_userspace_memory_region2, field));	\
>  } while (0)
>  
> +#define SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(field)				\
> +do {										\
> +	BUILD_BUG_ON(offsetof(struct kvm_memory_attributes, field) !=		\
> +		     offsetof(struct kvm_memory_attributes2, field));		\
> +	BUILD_BUG_ON(sizeof_field(struct kvm_memory_attributes, field) !=	\
> +		     sizeof_field(struct kvm_memory_attributes2, field));	\
> +} while (0)
> +
>  static long kvm_vm_ioctl(struct file *filp,
>  			   unsigned int ioctl, unsigned long arg)
>  {
> @@ -5373,15 +5387,35 @@ static long kvm_vm_ioctl(struct file *filp,
>  	}
>  #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
>  #ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> +	case KVM_SET_MEMORY_ATTRIBUTES2:
>  	case KVM_SET_MEMORY_ATTRIBUTES: {
> -		struct kvm_memory_attributes attrs;
> +		struct kvm_memory_attributes2 attrs;
> +		unsigned long size;
> +
> +		if (ioctl == KVM_SET_MEMORY_ATTRIBUTES) {
> +			/*
> +			 * Fields beyond struct kvm_memory_attributes shouldn't
> +			 * be accessed, but avoid leaking kernel memory in case
> +			 * of a bug.
> +			 */
> +			memset(&attrs, 0, sizeof(attrs));
> +			size = sizeof(struct kvm_memory_attributes);
> +		} else {
> +			size = sizeof(struct kvm_memory_attributes2);
> +		}
> +
> +		/* Ensure the common parts of the two structs are identical. */
> +		SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(address);
> +		SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(size);
> +		SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(attributes);
> +		SANITY_CHECK_MEMORY_ATTRIBUTES_FIELD(flags);
>  
>  		r = -ENOTTY;
>  		if (!vm_memory_attributes)
>  			goto out;
>  
>  		r = -EFAULT;
> -		if (copy_from_user(&attrs, argp, sizeof(attrs)))
> +		if (copy_from_user(&attrs, argp, size))
>  			goto out;
>  
>  		r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
> 
> -- 
> 2.53.0.1018.g2bb0e51243-goog
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox