Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* Re: [PATCH 2/3] rv/rtapp/sleep: Update nanosleep rule
From: Gabriele Monaco @ 2026-05-19 12:58 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <a2175a6647bb88797ad8275eddb4b20b749474ec.1779176466.git.namcao@linutronix.de>

On Tue, 2026-05-19 at 09:49 +0200, Nam Cao wrote:
> CLOCK_REALTIME is the only clock that often is misused in real-time
> applications. The other clocks either are safe for real-time uses
> (CLOCK_TAI, CLOCK_MONOTONIC, CLOCK_BOOTTIME) or are unlikely to be misused
> (CLOCK_AUX, CLOCK_PROCESS_CPUTIME_ID).
> 
> The rtapp monitor's purpose is warning people about common mistakes with
> real-time design. However, warning about all clock types generates too much
> false positives.

I'm fine with the change, but are those really false positives?

From what I understand before this change we would report any non-rt-friendly
clock type, now only realtime.
What we are skipping would still be true positives, just so uncommon not to
justify the extra complexity, right?

Or do you mean an RT task using CLOCK_AUX is a false positive because likely the
users weren't even trying to do real-time?

Thanks,
Gabriele

> 
> Update the monitor to only warn about CLOCK_REALTIME.
> 
> Signed-off-by: Nam Cao <namcao@linutronix.de>
> ---
>  kernel/trace/rv/monitors/sleep/sleep.c    | 10 ++---
>  kernel/trace/rv/monitors/sleep/sleep.h    | 52 +++++++++++------------
>  tools/verification/models/rtapp/sleep.ltl |  2 +-
>  3 files changed, 28 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
> b/kernel/trace/rv/monitors/sleep/sleep.c
> index 0a36f5519e6b..e01ac56b3f4a 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.c
> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
> @@ -43,9 +43,7 @@ static void ltl_atoms_init(struct task_struct *task, struct
> ltl_monitor *mon, bo
>  	ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false);
>  
>  	if (task_creation) {
> -		ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
> -		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
> -		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
> +		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, false);
>  		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
>  		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
>  		ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
> @@ -136,8 +134,7 @@ static void handle_sys_enter(void *data, struct pt_regs
> *regs, long id)
>  	case __NR_clock_nanosleep_time64:
>  #endif
>  		syscall_get_arguments(current, regs, args);
> -		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] ==
> CLOCK_MONOTONIC);
> -		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] ==
> CLOCK_TAI);
> +		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, args[0] ==
> CLOCK_REALTIME);
>  		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] ==
> TIMER_ABSTIME);
>  		ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
>  		break;
> @@ -178,8 +175,7 @@ static void handle_sys_exit(void *data, struct pt_regs
> *regs, long ret)
>  
>  	ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
>  	ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
> -	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
> -	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
> +	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
>  	ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  	ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.h
> b/kernel/trace/rv/monitors/sleep/sleep.h
> index 95dc2727c059..ed1ac7ad008e 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.h
> +++ b/kernel/trace/rv/monitors/sleep/sleep.h
> @@ -20,8 +20,7 @@ enum ltl_atom {
>  	LTL_FUTEX_WAIT,
>  	LTL_KERNEL_THREAD,
>  	LTL_KTHREAD_SHOULD_STOP,
> -	LTL_NANOSLEEP_CLOCK_MONOTONIC,
> -	LTL_NANOSLEEP_CLOCK_TAI,
> +	LTL_NANOSLEEP_CLOCK_REALTIME,
>  	LTL_NANOSLEEP_TIMER_ABSTIME,
>  	LTL_RT,
>  	LTL_SLEEP,
> @@ -46,8 +45,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
>  		"fu_wa",
>  		"ker_th",
>  		"kth_sh_st",
> -		"na_cl_mo",
> -		"na_cl_ta",
> +		"na_cl_re",
>  		"na_ti_ab",
>  		"rt",
>  		"sl",
> @@ -87,8 +85,7 @@ static void ltl_start(struct task_struct *task, struct
> ltl_monitor *mon)
>  	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
> -	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> +	bool nanosleep_clock_realtime =
> test_bit(LTL_NANOSLEEP_CLOCK_REALTIME, mon->atoms);
>  	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
>  	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> @@ -97,17 +94,17 @@ static void ltl_start(struct task_struct *task, struct
> ltl_monitor *mon)
>  	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
>  	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
>  	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val42 = task_is_rcu || task_is_migration;
> -	bool val43 = futex_lock_pi || val42;
> -	bool val5 = block_on_rt_mutex || val43;
> -	bool val34 = abort_sleep || kthread_should_stop;
> -	bool val35 = woken_by_nmi || val34;
> -	bool val36 = woken_by_hardirq || val35;
> -	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val41 = task_is_rcu || task_is_migration;
> +	bool val42 = futex_lock_pi || val41;
> +	bool val5 = block_on_rt_mutex || val42;
> +	bool val33 = abort_sleep || kthread_should_stop;
> +	bool val34 = woken_by_nmi || val33;
> +	bool val35 = woken_by_hardirq || val34;
> +	bool val14 = woken_by_equal_or_higher_prio || val35;
>  	bool val13 = !wake;
> -	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool val27 = nanosleep_timer_abstime && val26;
> -	bool val18 = clock_nanosleep && val27;
> +	bool val25 = !nanosleep_clock_realtime;
> +	bool val26 = nanosleep_timer_abstime && val25;
> +	bool val18 = clock_nanosleep && val26;
>  	bool val20 = val18 || epoll_wait;
>  	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> @@ -138,8 +135,7 @@ ltl_possible_next_states(struct ltl_monitor *mon, unsigned
> int state, unsigned l
>  	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
> -	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> +	bool nanosleep_clock_realtime =
> test_bit(LTL_NANOSLEEP_CLOCK_REALTIME, mon->atoms);
>  	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
>  	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> @@ -148,17 +144,17 @@ ltl_possible_next_states(struct ltl_monitor *mon,
> unsigned int state, unsigned l
>  	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
>  	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
>  	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val42 = task_is_rcu || task_is_migration;
> -	bool val43 = futex_lock_pi || val42;
> -	bool val5 = block_on_rt_mutex || val43;
> -	bool val34 = abort_sleep || kthread_should_stop;
> -	bool val35 = woken_by_nmi || val34;
> -	bool val36 = woken_by_hardirq || val35;
> -	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val41 = task_is_rcu || task_is_migration;
> +	bool val42 = futex_lock_pi || val41;
> +	bool val5 = block_on_rt_mutex || val42;
> +	bool val33 = abort_sleep || kthread_should_stop;
> +	bool val34 = woken_by_nmi || val33;
> +	bool val35 = woken_by_hardirq || val34;
> +	bool val14 = woken_by_equal_or_higher_prio || val35;
>  	bool val13 = !wake;
> -	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool val27 = nanosleep_timer_abstime && val26;
> -	bool val18 = clock_nanosleep && val27;
> +	bool val25 = !nanosleep_clock_realtime;
> +	bool val26 = nanosleep_timer_abstime && val25;
> +	bool val18 = clock_nanosleep && val26;
>  	bool val20 = val18 || epoll_wait;
>  	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> diff --git a/tools/verification/models/rtapp/sleep.ltl
> b/tools/verification/models/rtapp/sleep.ltl
> index 6f26c4810f78..2637bc48a620 100644
> --- a/tools/verification/models/rtapp/sleep.ltl
> +++ b/tools/verification/models/rtapp/sleep.ltl
> @@ -9,7 +9,7 @@ RT_VALID_SLEEP_REASON = FUTEX_WAIT
>  
>  RT_FRIENDLY_NANOSLEEP = CLOCK_NANOSLEEP
>                      and NANOSLEEP_TIMER_ABSTIME
> -                    and (NANOSLEEP_CLOCK_MONOTONIC or NANOSLEEP_CLOCK_TAI)
> +                    and not NANOSLEEP_CLOCK_REALTIME
>  
>  RT_FRIENDLY_WAKE = WOKEN_BY_EQUAL_OR_HIGHER_PRIO
>                  or WOKEN_BY_HARDIRQ


^ permalink raw reply

* Re: [PATCH v2 09/14] verification/rvgen: Add selftests
From: Nam Cao @ 2026-05-19 12:59 UTC (permalink / raw)
  To: Gabriele Monaco, linux-kernel, linux-trace-kernel, Steven Rostedt,
	Gabriele Monaco
  Cc: Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang
In-Reply-To: <20260514152055.229162-10-gmonaco@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> The rvgen code generator needs validation to ensure it produces correct
> monitor implementations from input specifications.
>
> Add selftests with golden reference outputs covering all monitor classes
> (DA, HA, LTL) and types (global, per_cpu, per_task, per_obj), including
> optional features like descriptions and parent monitors. Container
> generation and error handling (missing files, invalid specifications,
> missing arguments) are also validated against expected output.
>
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>

Acked-by: Nam Cao <namcao@linutronix.de>

^ permalink raw reply

* Re: [PATCH v2 10/14] rv: Add KUnit stub to rv_react() and rv_*_task_monitor_slot()
From: Nam Cao @ 2026-05-19 13:04 UTC (permalink / raw)
  To: Gabriele Monaco, linux-kernel, linux-trace-kernel, Steven Rostedt,
	Gabriele Monaco, Masami Hiramatsu
  Cc: Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang
In-Reply-To: <20260514152055.229162-11-gmonaco@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> Add KUNIT_STATIC_STUB_REDIRECT to allow those functions to be stubbed in
> a KUnit test. This is useful to catch reaction without creating a custom
> reactor and going through the effort of setting it from a test.
> rv_{get/put}_task_monitor_slot() rely on a lock, but this isn't
> necessary during a unit test, so simply skip the calls.
>
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>

Reviewed-by: Nam Cao <namcao@linutronix.de>

^ permalink raw reply

* Re: [PATCH v2 11/14] rv: Add KUnit tests for some DA/HA monitors
From: Nam Cao @ 2026-05-19 13:18 UTC (permalink / raw)
  To: Gabriele Monaco, linux-kernel, linux-trace-kernel, Steven Rostedt,
	Gabriele Monaco, Masami Hiramatsu
  Cc: Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang
In-Reply-To: <20260514152055.229162-12-gmonaco@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> +#define RV_KUNIT_EXPECT_REACTION_HERE(test, ctx)                             \
> +	for (int __done = ({ RV_KUNIT_EXPECT_NO_REACTION(test, ctx); 0; });  \
> +	     !__done;                                                        \
> +	     __done = ({ RV_KUNIT_EXPECT_REACTION(test, ctx); 1; }))

Cool trick.

> +static void rv_test_stub(struct kunit *test)
> +{
> +	kunit_skip(test, "Monitor not enabled\n");
> +}
> +
> +#define DECLARE_RV_TEST(name) \
> +	void name(struct kunit *test) __weak __alias(rv_test_stub)
> +
> +DECLARE_RV_TEST(rv_test_sco);
> +DECLARE_RV_TEST(rv_test_sssw);
> +DECLARE_RV_TEST(rv_test_sts);
> +DECLARE_RV_TEST(rv_test_opid);
> +DECLARE_RV_TEST(rv_test_nomiss);

Good to know this one. I think I can make use of this trick for some LTL improvement..

Reviewed-by: Nam Cao <namcao@linutronix.de>

^ permalink raw reply

* Re: [PATCH v2 13/14] rv: Prevent unintentional tracepoints during KUnit tests
From: Nam Cao @ 2026-05-19 13:22 UTC (permalink / raw)
  To: Gabriele Monaco, linux-kernel, linux-trace-kernel, Steven Rostedt,
	Gabriele Monaco, Masami Hiramatsu
  Cc: Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang
In-Reply-To: <20260514152055.229162-14-gmonaco@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> Monitor initialisation also called during KUnit tests may register some
> tracepoints, this can lead to issues since we don't expect real monitor
> events running during KUnit tests.
>
> Prevent tracepoint registration if an RV KUnit test is running.
>
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>

Reviewed-by: Nam Cao <namcao@linutronix.de>

^ permalink raw reply

* Re: [PATCH v2 14/14] rv: Add KUnit tests for some LTL monitors
From: Nam Cao @ 2026-05-19 13:24 UTC (permalink / raw)
  To: Gabriele Monaco, linux-kernel, linux-trace-kernel, Steven Rostedt,
	Gabriele Monaco, Masami Hiramatsu
  Cc: Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang
In-Reply-To: <20260514152055.229162-15-gmonaco@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> Validate the functionality of LTL monitors by injecting events in a
> controlled environment (KUnit) and expecting reactions, just like it is
> done in DA monitors.
>
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>

Reviewed-by: Nam Cao <namcao@linutronix.de>

^ permalink raw reply

* Re: [PATCH 2/3] rv/rtapp/sleep: Update nanosleep rule
From: Nam Cao @ 2026-05-19 14:51 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <5af6afbfa32ede2db7f21412c8c21831f0eefd04.camel@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:

> On Tue, 2026-05-19 at 09:49 +0200, Nam Cao wrote:
>> CLOCK_REALTIME is the only clock that often is misused in real-time
>> applications. The other clocks either are safe for real-time uses
>> (CLOCK_TAI, CLOCK_MONOTONIC, CLOCK_BOOTTIME) or are unlikely to be misused
>> (CLOCK_AUX, CLOCK_PROCESS_CPUTIME_ID).
>> 
>> The rtapp monitor's purpose is warning people about common mistakes with
>> real-time design. However, warning about all clock types generates too much
>> false positives.
>
> I'm fine with the change, but are those really false positives?
>
> From what I understand before this change we would report any non-rt-friendly
> clock type, now only realtime.
> What we are skipping would still be true positives, just so uncommon not to
> justify the extra complexity, right?
>
> Or do you mean an RT task using CLOCK_AUX is a false positive because likely the
> users weren't even trying to do real-time?

CLOCK_BOOTTIME is fine for RT.

CLOCK_AUX is created for real-time use cases, so the monitor shouldn't
warn about it. But this clock can also be arbitrarily set, which can be
unsafe (e.g. you want to deploy the air bag in 10us, but someone changes
the clock and you have to wait 10s).

We probably can detect if a clock used by an RT task is set, but I am
not sure how complicated that is, if even possible. For now we are
assuming that user won't do such thing, then CLOCK_AUX is fine for RT
uses.

CLOCK_PROCESS_CPUTIME_ID is a strange one and shouldn't be used for
real-time. But I think it's obvious from the description that you cannot
do real-time with that.

Only CLOCK_REALTIME is a mistake people often make, probably has
something to do with the deceptive name.

Nam

^ permalink raw reply

* Re: [PATCH v2 12/14] rv: Add KUnit stubs for current and smp_processor_id()
From: Nam Cao @ 2026-05-19 14:53 UTC (permalink / raw)
  To: Gabriele Monaco, linux-kernel, linux-trace-kernel, Steven Rostedt,
	Gabriele Monaco, Masami Hiramatsu
  Cc: Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang
In-Reply-To: <20260514152055.229162-13-gmonaco@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> Some monitors do not only rely on tracepoint arguments but also on the
> currently executing task and current processor.
> This makes it more challenging to mock events in KUnit.
>
> Define wrapper functions around current and smp_processor_id(), the
> functionality is stubbed only during KUnit, however the additional
> function call is necessary whenever the KUnit tests are built in.
>
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>

Reviewed-by: Nam Cao <namcao@linutronix.de>

^ permalink raw reply

* Re: [PATCH] tools/bootconfig: Fix null pointer when free buf
From: Masami Hiramatsu @ 2026-05-19 15:04 UTC (permalink / raw)
  To: lihongtao; +Cc: linux-kernel, linux-trace-kernel
In-Reply-To: <20260519031458.141050-1-lihongtao@kylinos.cn>

On Tue, 19 May 2026 11:14:58 +0800
lihongtao <lihongtao@kylinos.cn> wrote:

> In show_xbc() and delete_xbc(), if load_xbc_from_initrd failed,
> the buf may be NULL.

NACK, because free() can handle NULL correctly. See free(3)

   free()
       The free() function frees the memory space pointed to by ptr, which must have been
       returned by a previous call to malloc() or related functions.   Otherwise,  or  if
       ptr  has already been freed, undefined behavior occurs.  If ptr is NULL, no opera‐
       tion is performed.

Thanks,

> 
> Fixes: 950313ebf79c ("tools: bootconfig: Add bootconfig command")
> Signed-off-by: lihongtao <lihongtao@kylinos.cn>
> ---
>  tools/bootconfig/main.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
> index ddabde20585f..417d07a46f92 100644
> --- a/tools/bootconfig/main.c
> +++ b/tools/bootconfig/main.c
> @@ -328,7 +328,8 @@ static int show_xbc(const char *path, bool list)
>  		xbc_show_compact_tree();
>  	ret = 0;
>  out:
> -	free(buf);
> +	if (buf)
> +		free(buf);
>  
>  	return ret;
>  }
> @@ -360,7 +361,8 @@ static int delete_xbc(const char *path)
>  	} /* Ignore if there is no boot config in initrd */
>  
>  	close(fd);
> -	free(buf);
> +	if (buf)
> +		free(buf);
>  
>  	return ret;
>  }
> -- 
> 2.25.1
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH 1/3] rv/rtapp/sleep: Make the error more informative for user
From: Gabriele Monaco @ 2026-05-19 15:07 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <14242f444f7d2396c0c5a345f0099665d09a0ec5.1779176466.git.namcao@linutronix.de>

On Tue, 2026-05-19 at 09:49 +0200, Nam Cao wrote:
> The rtapp/sleep monitor detects real-time tasks which go to sleep in an
> real-time-unsafe manner. If this happen, the monitor triggers a trace event
> in the sched_wakeup tracepoint's handler.
> 

Ok so here WAKE is no longer tied to the wakeup event but to the end of the task
switch.

So what happens if a task was not sleeping but just got preempted? Wouldn't that
trigger WAKE (though that isn't a real wakeup) without RT_FRIENDLY_WAKE ?

Thanks,
Gabriele

> However, the invoking context of that trace event is not the most
> informative, because of the stack trace of that event is the wakeup's code
> path which is not very helpful:
> 
> 74.669317: rv:error_sleep: condvar[254]: violation detected
>     ltl_validate+0x345 ([kernel.kallsyms])
>     handle_sched_wakeup+0x34 ([kernel.kallsyms])
>     ttwu_do_activate+0xff ([kernel.kallsyms])
>     sched_ttwu_pending+0x104 ([kernel.kallsyms])
>     __flush_smp_call_function_queue+0x15b ([kernel.kallsyms])
>     __sysvec_call_function_single+0x18 ([kernel.kallsyms])
>     sysvec_call_function_single+0x66 ([kernel.kallsyms])
>     asm_sysvec_call_function_single+0x1a ([kernel.kallsyms])
>     pv_native_safe_halt+0xf ([kernel.kallsyms])
>     default_idle+0x9 ([kernel.kallsyms])
>     default_idle_call+0x33 ([kernel.kallsyms])
>     do_idle+0x234 ([kernel.kallsyms])
>     cpu_startup_entry+0x24 ([kernel.kallsyms])
>     start_secondary+0xf8 ([kernel.kallsyms])
>     common_startup_64+0x13e ([kernel.kallsyms])
> 
> What would be much more valuable is the stack trace of the task itself.
> 
> Change the update of WAKEUP from being in sched_wakeup trace point's
> handler to sched_exit trace point's handler. This makes the event happen in
> the task's context, making the stack trace far more informative for user:
> 
> rv:error_sleep: condvar[254]: violation detected
>     ltl_validate+0x345 ([kernel.kallsyms])
>     handle_sched_exit+0x39 ([kernel.kallsyms])
>     __schedule+0x80f ([kernel.kallsyms])
>     schedule+0x22 ([kernel.kallsyms])
>     futex_do_wait+0x33 ([kernel.kallsyms])
>     __futex_wait+0x8c ([kernel.kallsyms])
>     futex_wait+0x73 ([kernel.kallsyms])
>     do_futex+0xc6 ([kernel.kallsyms])
>     __x64_sys_futex+0x121 ([kernel.kallsyms])
>     do_syscall_64+0xf3 ([kernel.kallsyms])
>     entry_SYSCALL_64_after_hwframe+0x77 ([kernel.kallsyms])
>     __futex_abstimed_wait_common64+0xc6 (inlined)
>     __futex_abstimed_wait_common+0xc6 (/usr/lib/x86_64-linux-gnu/libc.so.6)
> 
> Signed-off-by: Nam Cao <namcao@linutronix.de>
> ---
>  kernel/trace/rv/monitors/sleep/sleep.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
> b/kernel/trace/rv/monitors/sleep/sleep.c
> index 8dfe5ec13e19..0a36f5519e6b 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.c
> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
> @@ -92,9 +92,9 @@ static void handle_sched_set_state(void *data, struct
> task_struct *task, int sta
>  		ltl_atom_pulse(task, LTL_ABORT_SLEEP, true);
>  }
>  
> -static void handle_sched_wakeup(void *data, struct task_struct *task)
> +static void handle_sched_exit(void *data, bool is_switch)
>  {
> -	ltl_atom_pulse(task, LTL_WAKE, true);
> +	ltl_atom_pulse(current, LTL_WAKE, true);
>  }
>  
>  static void handle_sched_waking(void *data, struct task_struct *task)
> @@ -200,7 +200,7 @@ static int enable_sleep(void)
>  		return retval;
>  
>  	rv_attach_trace_probe("rtapp_sleep", sched_waking,
> handle_sched_waking);
> -	rv_attach_trace_probe("rtapp_sleep", sched_wakeup,
> handle_sched_wakeup);
> +	rv_attach_trace_probe("rtapp_sleep", sched_exit_tp,
> handle_sched_exit);
>  	rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp,
> handle_sched_set_state);
>  	rv_attach_trace_probe("rtapp_sleep", contention_begin,
> handle_contention_begin);
>  	rv_attach_trace_probe("rtapp_sleep", contention_end,
> handle_contention_end);
> @@ -213,7 +213,7 @@ static int enable_sleep(void)
>  static void disable_sleep(void)
>  {
>  	rv_detach_trace_probe("rtapp_sleep", sched_waking,
> handle_sched_waking);
> -	rv_detach_trace_probe("rtapp_sleep", sched_wakeup,
> handle_sched_wakeup);
> +	rv_detach_trace_probe("rtapp_sleep", sched_exit_tp,
> handle_sched_exit);
>  	rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp,
> handle_sched_set_state);
>  	rv_detach_trace_probe("rtapp_sleep", contention_begin,
> handle_contention_begin);
>  	rv_detach_trace_probe("rtapp_sleep", contention_end,
> handle_contention_end);


^ permalink raw reply

* Re: [PATCH] tools/bootconfig: Fix buf leaks in apply_xbc
From: Masami Hiramatsu @ 2026-05-19 15:12 UTC (permalink / raw)
  To: lihongtao; +Cc: linux-kernel, linux-trace-kernel
In-Reply-To: <20260519031255.139036-1-lihongtao@kylinos.cn>

On Tue, 19 May 2026 11:12:55 +0800
lihongtao <lihongtao@kylinos.cn> wrote:

> If data calloc failed, free the buf before return.
> 

OK, this should be a real bug.

> Fixes: 950313ebf79c ("tools: bootconfig: Add bootconfig command")
> Signed-off-by: lihongtao <lihongtao@kylinos.cn>

BTW, according to the DCO, can you use your real name here(Lihong Tao?)

https://docs.kernel.org/process/submitting-patches.html#sign-your-work-the-developer-s-certificate-of-origin

Thanks,

> ---
>  tools/bootconfig/main.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c
> index 643f707b8f1d..ddabde20585f 100644
> --- a/tools/bootconfig/main.c
> +++ b/tools/bootconfig/main.c
> @@ -390,8 +390,10 @@ static int apply_xbc(const char *path, const char *xbc_path)
>  
>  	/* Backup the bootconfig data */
>  	data = calloc(size + BOOTCONFIG_ALIGN + BOOTCONFIG_FOOTER_SIZE, 1);
> -	if (!data)
> +	if (!data) {
> +		free(buf);
>  		return -ENOMEM;
> +	}
>  	memcpy(data, buf, size);
>  
>  	/* Check the data format */
> -- 
> 2.25.1
> 
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH 1/3] rv/rtapp/sleep: Make the error more informative for user
From: Nam Cao @ 2026-05-19 15:24 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <19cffa1267c6538e6310b0149b0367807af76ac0.camel@redhat.com>

Gabriele Monaco <gmonaco@redhat.com> writes:
> On Tue, 2026-05-19 at 09:49 +0200, Nam Cao wrote:
>> The rtapp/sleep monitor detects real-time tasks which go to sleep in an
>> real-time-unsafe manner. If this happen, the monitor triggers a trace event
>> in the sched_wakeup tracepoint's handler.
>
> Ok so here WAKE is no longer tied to the wakeup event but to the end of the task
> switch.
>
> So what happens if a task was not sleeping but just got preempted? Wouldn't that
> trigger WAKE (though that isn't a real wakeup) without RT_FRIENDLY_WAKE ?

The monitor's rule says that when a task sleeps, it will not wake
without rt-friendly wake. If it "wakes" without initial sleep, it is
simply ignored.

I can change the name to be SCHEDULE_IN or something like that to make
it clearer.

Nam

^ permalink raw reply

* Re: [PATCH 1/3] rv/rtapp/sleep: Make the error more informative for user
From: Gabriele Monaco @ 2026-05-19 15:26 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <87y0hf2zj9.fsf@yellow.woof>

On Tue, 2026-05-19 at 17:24 +0200, Nam Cao wrote:
> Gabriele Monaco <gmonaco@redhat.com> writes:
> > On Tue, 2026-05-19 at 09:49 +0200, Nam Cao wrote:
> > > The rtapp/sleep monitor detects real-time tasks which go to sleep in an
> > > real-time-unsafe manner. If this happen, the monitor triggers a trace
> > > event
> > > in the sched_wakeup tracepoint's handler.
> > 
> > Ok so here WAKE is no longer tied to the wakeup event but to the end of the
> > task
> > switch.
> > 
> > So what happens if a task was not sleeping but just got preempted? Wouldn't
> > that
> > trigger WAKE (though that isn't a real wakeup) without RT_FRIENDLY_WAKE ?
> 
> The monitor's rule says that when a task sleeps, it will not wake
> without rt-friendly wake. If it "wakes" without initial sleep, it is
> simply ignored.

Right makes sense.

> I can change the name to be SCHEDULE_IN or something like that to make
> it clearer.
> 

Yeah that'd be clearer, or at least a comment stating this somewhere.

Thanks,
Gabriele


^ permalink raw reply

* Re: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: Masami Hiramatsu @ 2026-05-19 15:26 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: LKML, Linux Trace Kernel, bpf, Mathieu Desnoyers, Mark Rutland,
	Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
	Tom Zanussi, Andrew Morton, Thomas Gleixner, Ian Rogers,
	Jiri Olsa
In-Reply-To: <20260519083128.5795c6af@gandalf.local.home>

On Tue, 19 May 2026 08:31:28 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> > 
> > To access @data, simple casting does not work. Thus we need a
> > new syntax:
> > 
> > 	(STRUCT)(PTR,ASSIGN)->FIELD
> > 
> > So the above case, we can do:
> > 
> > 	data=(foo)(foo_list,list)->data
> 
> Hmm, it may be better to make it one parenthesis?
> 
>        (STRUCT,PTR,ASSIGN)->FIELD
> 
>        data=(foo,foo_list,list)->data

OK, but I don't like this 3 parameters conversion. I want to
make it a kind of type casting with an option.

	(STRUCT,ASSIGN)PTR->FIELD

	data=(foo,list)foo_list->data

The second parenthesis will be eventually needed for nested casting,
for example, in above case, if the data is a pointer to another data
structure:

struct bar {
	int	value;
	...
};

	value=(bar)((foo,list)foo_list->data)->value


> 
> That would make it easier to differentiate between a simple "typecast" and
> a container_of() by checking if the content between the parenthesis has a
> comma.
> 
> Maybe even reorder it to:
> 
>        (PTR,STRUCT,ASSIGN)->FIELD
> 
>        data=(foo_list,foo,list)->data
> 
> to match the order of container_of():
> 
>       data = container_of(foo_list, struct foo, list)->data;
> 
> ?

This doesn't seem to conform to the rule here of using parentheses for
type casting, so I personally don't like it.


> > > diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> > > index e0d3a0da26af..b0829eb1cb52 100644
> > > --- a/kernel/trace/trace_probe.c
> > > +++ b/kernel/trace/trace_probe.c
> > > @@ -464,6 +464,26 @@ static const char *fetch_type_from_btf_type(struct btf *btf,
> > >  	return NULL;
> > >  }
> > >  
> > > +static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
> > > +{
> > > +	int id;
> > > +
> > > +	if (!ctx->btf) {
> > > +		struct btf *btf;  
> > 
> > This needs an empty line here.
> 
> Sure.
> 
> For conditional blocks, I don't always add a newline, but this is your code
> and I'll follow your suggestions.

Ah, this is just for fixing checkpatch.pl warning :-)

> 
> > 
> > > +		id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf);
> > > +		if (id < 0)
> > > +			return -EINVAL;  
> > 
> > Why don't you return id (it has corresponding errno)?
> 
> Because I forgot to ;-)
> 
> > 
> > > +		ctx->btf = btf;
> > > +	} else {
> > > +		id = btf_find_by_name_kind(ctx->btf, sname, BTF_KIND_STRUCT);
> > > +		if (id < 0)
> > > +			return -EINVAL;  
> > 
> > Ditto.
> > 
> > > +	}
> > > +
> > > +	ctx->last_struct = btf_type_by_id(ctx->btf, id);
> > > +	return 0;
> > > +}
> > > +
> > >  static int query_btf_context(struct traceprobe_parse_context *ctx)
> > >  {
> > >  	const struct btf_param *param;
> > > @@ -471,12 +491,12 @@ static int query_btf_context(struct traceprobe_parse_context *ctx)
> > >  	struct btf *btf;
> > >  	s32 nr;
> > >  
> > > -	if (ctx->btf)
> > > -		return 0;
> > > -
> > >  	if (!ctx->funcname)
> > >  		return -EINVAL;
> > >  
> > > +	if (ctx->btf)
> > > +		return 0;
> > > +  
> > 
> > Could you tell me why this order is changed?
> > I think this type casting will allow us to skip checking funcname
> > because btf context is already specified.
> 
> I wanted this to fail if btf was already set but funcname wasn't, because
> this should only be called for functions.

Hmm, OK. Then, can you make a separate patch for this?

> 
> > 
> > Ah, BTW, we may need to use a special struct btf* for type
> > casting. If the target function is in a module and the
> > casting type is defined in vmlinux, those are stored in
> > the different places...
> 
> OK, I'll make a separate btf for it then. I'll have to make sure the btf
> used for parsing knows which one to use. Shouldn't be too hard if we check
> for the STRUCT flag in the ctx->flags.

Yeah, and personally, I think that flag should be the TYPECAST flag.
 
Thank you!

-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19 16:28 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: LKML, Linux Trace Kernel, bpf, Mathieu Desnoyers, Mark Rutland,
	Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
	Tom Zanussi, Andrew Morton, Thomas Gleixner, Ian Rogers,
	Jiri Olsa
In-Reply-To: <20260520002640.d372bd044ceba9364b1f168f@kernel.org>

On Wed, 20 May 2026 00:26:40 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:
> > Hmm, it may be better to make it one parenthesis?
> > 
> >        (STRUCT,PTR,ASSIGN)->FIELD
> > 
> >        data=(foo,foo_list,list)->data  
> 
> OK, but I don't like this 3 parameters conversion. I want to
> make it a kind of type casting with an option.
> 
> 	(STRUCT,ASSIGN)PTR->FIELD
> 
> 	data=(foo,list)foo_list->data
> 
> The second parenthesis will be eventually needed for nested casting,
> for example, in above case, if the data is a pointer to another data
> structure:
> 
> struct bar {
> 	int	value;
> 	...
> };
> 
> 	value=(bar)((foo,list)foo_list->data)->value

Have fun with the parenthesis parsing ;-)

> 
> 
> > 
> > That would make it easier to differentiate between a simple "typecast" and
> > a container_of() by checking if the content between the parenthesis has a
> > comma.
> > 
> > Maybe even reorder it to:
> > 
> >        (PTR,STRUCT,ASSIGN)->FIELD
> > 
> >        data=(foo_list,foo,list)->data
> > 
> > to match the order of container_of():
> > 
> >       data = container_of(foo_list, struct foo, list)->data;
> > 
> > ?  
> 
> This doesn't seem to conform to the rule here of using parentheses for
> type casting, so I personally don't like it.

OK, as long as it's intuitive and is easy to remember. I hate having to
look up the documents every time I have to do some probe argument
parsing :-(

> 
> 
> > > > diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> > > > index e0d3a0da26af..b0829eb1cb52 100644
> > > > --- a/kernel/trace/trace_probe.c
> > > > +++ b/kernel/trace/trace_probe.c
> > > > @@ -464,6 +464,26 @@ static const char *fetch_type_from_btf_type(struct btf *btf,
> > > >  	return NULL;
> > > >  }
> > > >  
> > > > +static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
> > > > +{
> > > > +	int id;
> > > > +
> > > > +	if (!ctx->btf) {
> > > > +		struct btf *btf;    
> > > 
> > > This needs an empty line here.  
> > 
> > Sure.
> > 
> > For conditional blocks, I don't always add a newline, but this is your code
> > and I'll follow your suggestions.  
> 
> Ah, this is just for fixing checkpatch.pl warning :-)

I added it to keep your checkpatch happy.


> > > > @@ -471,12 +491,12 @@ static int query_btf_context(struct traceprobe_parse_context *ctx)
> > > >  	struct btf *btf;
> > > >  	s32 nr;
> > > >  
> > > > -	if (ctx->btf)
> > > > -		return 0;
> > > > -
> > > >  	if (!ctx->funcname)
> > > >  		return -EINVAL;
> > > >  
> > > > +	if (ctx->btf)
> > > > +		return 0;
> > > > +    
> > > 
> > > Could you tell me why this order is changed?
> > > I think this type casting will allow us to skip checking funcname
> > > because btf context is already specified.  
> > 
> > I wanted this to fail if btf was already set but funcname wasn't, because
> > this should only be called for functions.  
> 
> Hmm, OK. Then, can you make a separate patch for this?

I added a struct_btf (I can change it to typecast_btf) and have a
helper function that is:

static struct btf *ctx_btf(struct traceprobe_parse_context *ctx)
{
	return ctx->flags & TPARG_FL_STRUCT ?
		ctx->struct_btf : ctx->btf;
}

And have all functions get their btf from that. This way we can keep
the two allocations separate.

> 
> >   
> > > 
> > > Ah, BTW, we may need to use a special struct btf* for type
> > > casting. If the target function is in a module and the
> > > casting type is defined in vmlinux, those are stored in
> > > the different places...  
> > 
> > OK, I'll make a separate btf for it then. I'll have to make sure the btf
> > used for parsing knows which one to use. Shouldn't be too hard if we check
> > for the STRUCT flag in the ctx->flags.  
> 
> Yeah, and personally, I think that flag should be the TYPECAST flag.

I'll update it.

Thanks,

-- Steve

^ permalink raw reply

* Re: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19 16:38 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: LKML, Linux Trace Kernel, bpf, Mathieu Desnoyers, Mark Rutland,
	Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
	Tom Zanussi, Andrew Morton, Thomas Gleixner, Ian Rogers,
	Jiri Olsa
In-Reply-To: <20260519122836.0bfded70@fedora>

On Tue, 19 May 2026 12:28:36 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> I added a struct_btf (I can change it to typecast_btf) and have a
> helper function that is:

Actually, I'm going to keep it as struct_btf as it is a btf for a
structure. I did change the flag to be TYPECAST though.

-- Steve


> 
> static struct btf *ctx_btf(struct traceprobe_parse_context *ctx)
> {
> 	return ctx->flags & TPARG_FL_STRUCT ?
> 		ctx->struct_btf : ctx->btf;
> }


^ permalink raw reply

* Re: [PATCH 6/9] rv: Ensure synchronous cleanup for HA monitors
From: Wen Yang @ 2026-05-19 16:48 UTC (permalink / raw)
  To: Gabriele Monaco; +Cc: linux-kernel, Steven Rostedt, Nam Cao, linux-trace-kernel
In-Reply-To: <88a6fc5c08d18e3c1f6d29dc106db80fa688bf87.camel@redhat.com>



On 5/18/26 19:54, Gabriele Monaco wrote:
> On Sun, 2026-05-17 at 17:12 +0800, Wen Yang wrote:
>> The guard(rcu)() + synchronize_rcu() mechanism for HA timer callbacks
>> is correct.
>>
>> One concern: TOCTOU between the pre-check and guard(rcu)().
> 
> Yes, this could happen, but I'm not sure it's really a big issue:
> 
>>
>> da_monitor_reset() calls reset_hook BEFORE clearing monitoring:
>>
>>     da_monitor_reset_hook(da_mon);        /* ha_cancel_timer [async]   */
>>     WRITE_ONCE(da_mon->monitoring, 0);    /* cleared AFTER reset_hook  */
>>     da_mon->curr_state = model_get_initial_state();
>>
>> This may creates a window where the callback pre-check passes but the
>> monitor is reset before guard(rcu)() is acquired:
> 
> If a callback is running, there was a violation because the timer expired, so it
> isn't wrong to report, although we are unloading the monitor.
> 
>>
>>     /* __ha_monitor_timer_callback() */
>>     if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
>>         return;
>>
>>     /* passes: monitoring=1
>>      *
>>      * WINDOW ─ CPU A runs da_monitor_reset_all() here:
>>      *   ha_cancel_timer()  [returns: callback is running, cannot cancel]
>>      *   WRITE_ONCE(monitoring, 0)
>>      *   curr_state = model_get_initial_state()
>>      */
>>     guard(rcu)();
>>     curr_state = READ_ONCE(ha_mon->da_mon.curr_state);  /* initial_state */
>>     /* no second da_monitoring() check */
>>     ha_react(curr_state, EVENT_NONE, env_string.buffer); /* spurious call */
>>     ha_trace_error_env(ha_mon, ...);                     /* fires
>> unconditionally */
>>
>> Result: spurious ha_trace_error_env() for initial_state.  For existing
>> monitors (stall/nomiss/opid), model_should_send_event_env(initial, NONE)
>> returns false, so no false-positive reaction, but the trace event fires.
>> Monitors where initial_state carries a constraint would produce a false
>> positive.
> 
> I'm not sure what you mean here, if I understand the situation correctly: the
> callback is running (so we should react), da_monitor_reset() is too late to stop
> it but somehow manages to reset curr_state on time for the callback to see it
> change: react reports the wrong state in an otherwise valid reaction.
> 
>>
>> Proposed fix : re-check inside the RCU critical section:
>>
>>     guard(rcu)();
>>     if (unlikely(!da_monitoring(&ha_mon->da_mon)))  /* re-check here */
>>         return;
>>     curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
> 
> I'm not sure that's going to fix it anyway, RCU cannot synchronise readers,
> checking again would at most (mildly) reduce the race window, not remove it.
> 
> What we could do is to play with barriersin for the callback to either:
> * see monitoring = 1 AND the old curr_state
> * see monitoring = 0 AND the new curr_state
> 
> Something like:
> 
> void __ha_monitor_timer_callback() {
> 	guard(rcu)(); //this is only for waiters, let them wait more
> 
> 	if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
> 		return;
> 	smp_rmb();
> 	curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
> 	...
> }
> 
> void da_monitor_reset() {
> 	da_monitor_reset_hook(da_mon);
> 	WRITE_ONCE(da_mon->monitoring, 0);
> 	smp_wmb();
> 	WRITE_ONCE(da_mon->curr_state, model_get_initial_state());
> }
> 
> Coupled with your patch [1] adding more atomic accesses to da_mon->monitoring
> should probably do the trick.
> 
> Am I missing anything?
> 

The goal is right.  One thing worth double-checking is the load order
in the callback against the "SMP BARRIER PAIRING" section of
Documentation/memory-barriers.txt, which states:

   [!] Note that the stores before the write barrier would normally be
   expected to match the loads after the read barrier or the
   address-dependency barrier, and vice versa ...

So, we should to swap the read order in the callback so that it matches
the standard pattern:

   void __ha_monitor_timer_callback() {
         guard(rcu)();
         curr_state = READ_ONCE(ha_mon->da_mon.curr_state);  /* B: 
before rmb */
         smp_rmb();
         if (unlikely(!da_monitoring(&ha_mon->da_mon)))       /* A: 
after rmb */
                 return;
         /*
          * Reached here: monitoring = 1 (old_A).
          * Standard wmb/rmb guarantee: curr_state (read before rmb) is also
          * old, i.e. not initial_state.
          */
         ha_react(curr_state, EVENT_NONE, env_string.buffer);
         ...
   }

   void da_monitor_reset() {
         da_monitor_reset_hook(da_mon);
         WRITE_ONCE(da_mon->monitoring, 0);   /* A: before wmb */
         smp_wmb();
         WRITE_ONCE(da_mon->curr_state, model_get_initial_state());  /* 
B: after wmb */
   }



--
Best wishes,
Wen

> 
> [1] -
> https://lore.kernel.org/lkml/8af5ba4bd93d2acb8a546e8e47ced974a87c1eb8.1778522945.git.wen.yang@linux.dev
> 
>>
>>
>> --
>> Best wishes,
>> Wen
>>
>>
>> On 5/12/26 22:02, Gabriele Monaco wrote:
>>> HA monitors may start timers, all cleanup functions currently stop the
>>> timers asynchronously to avoid sleeping in the wrong context.
>>> Nothing makes sure running callbacks terminate on cleanup.
>>>
>>> Run the entire HA timer callback in an RCU read-side critical section,
>>> this way we can simply synchronize_rcu() with any pending timer and are
>>> sure any cleanup using kfree_rcu() runs after callbacks terminated.
>>> Additionally make sure any unlikely callback running late won't run any
>>> code if the monitor is marked as disabled.
>>>
>>> Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
>>> Fixes: 4a24127bd6cb ("rv: Add support for per-object monitors in DA/HA")
>>> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
>>> ---
>>>    include/rv/da_monitor.h | 23 +++++++++++++++++++----
>>>    include/rv/ha_monitor.h | 18 ++++++++++++++++--
>>>    2 files changed, 35 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
>>> index a4a13b62d1a4..402d3b935c08 100644
>>> --- a/include/rv/da_monitor.h
>>> +++ b/include/rv/da_monitor.h
>>> @@ -57,6 +57,15 @@ static struct rv_monitor rv_this;
>>>    #define da_monitor_reset_hook(da_mon)
>>>    #endif
>>>    
>>> +/*
>>> + * Hook to allow the implementation of hybrid automata: define it with a
>>> + * function that waits for the termination of all monitors background
>>> + * activities (e.g. all timers). This hook can sleep.
>>> + */
>>> +#ifndef da_monitor_sync_hook
>>> +#define da_monitor_sync_hook()
>>> +#endif
>>> +
>>>    /*
>>>     * Type for the target id, default to int but can be overridden.
>>>     * A long type can work as hash table key (PER_OBJ) but will be downgraded
>>> to
>>> @@ -179,6 +188,7 @@ static inline int da_monitor_init(void)
>>>    static inline void da_monitor_destroy(void)
>>>    {
>>>    	da_monitor_reset_all();
>>> +	da_monitor_sync_hook();
>>>    }
>>>    
>>>    #ifndef da_implicit_guard
>>> @@ -232,6 +242,7 @@ static inline int da_monitor_init(void)
>>>    static inline void da_monitor_destroy(void)
>>>    {
>>>    	da_monitor_reset_all();
>>> +	da_monitor_sync_hook();
>>>    }
>>>    
>>>    #ifndef da_implicit_guard
>>> @@ -319,6 +330,7 @@ static inline void da_monitor_destroy(void)
>>>    	}
>>>    
>>>    	da_monitor_reset_all();
>>> +	da_monitor_sync_hook();
>>>    
>>>    	rv_put_task_monitor_slot(task_mon_slot);
>>>    	task_mon_slot = RV_PER_TASK_MONITOR_INIT;
>>> @@ -497,10 +509,9 @@ static void da_monitor_reset_all(void)
>>>    	struct da_monitor_storage *mon_storage;
>>>    	int bkt;
>>>    
>>> -	rcu_read_lock();
>>> +	guard(rcu)();
>>>    	hash_for_each_rcu(da_monitor_ht, bkt, mon_storage, node)
>>>    		da_monitor_reset(&mon_storage->rv.da_mon);
>>> -	rcu_read_unlock();
>>>    }
>>>    
>>>    static inline int da_monitor_init(void)
>>> @@ -516,13 +527,17 @@ static inline void da_monitor_destroy(void)
>>>    	int bkt;
>>>    
>>>    	tracepoint_synchronize_unregister();
>>> +	scoped_guard(rcu) {
>>> +		hash_for_each_rcu(da_monitor_ht, bkt, mon_storage, node) {
>>> +			da_monitor_reset_hook(&mon_storage->rv.da_mon);
>>> +		}
>>> +	}
>>> +	da_monitor_sync_hook();
>>>    	/*
>>>    	 * This function is called after all probes are disabled and no
>>> longer
>>>    	 * pending, we can safely assume no concurrent user.
>>>    	 */
>>> -	synchronize_rcu();
>>>    	hash_for_each_safe(da_monitor_ht, bkt, tmp, mon_storage, node) {
>>> -		da_monitor_reset_hook(&mon_storage->rv.da_mon);
>>>    		hash_del_rcu(&mon_storage->node);
>>>    		kfree(mon_storage);
>>>    	}
>>> diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
>>> index d59507e8cb30..47ff1a41febe 100644
>>> --- a/include/rv/ha_monitor.h
>>> +++ b/include/rv/ha_monitor.h
>>> @@ -36,6 +36,7 @@ static bool ha_monitor_handle_constraint(struct da_monitor
>>> *da_mon,
>>>    #define da_monitor_event_hook ha_monitor_handle_constraint
>>>    #define da_monitor_init_hook ha_monitor_init_env
>>>    #define da_monitor_reset_hook ha_monitor_reset_env
>>> +#define da_monitor_sync_hook() synchronize_rcu()
>>>    
>>>    #include <rv/da_monitor.h>
>>>    #include <linux/seq_buf.h>
>>> @@ -237,12 +238,25 @@ static bool ha_monitor_handle_constraint(struct
>>> da_monitor *da_mon,
>>>    	return false;
>>>    }
>>>    
>>> +/*
>>> + * __ha_monitor_timer_callback - generic callback representation
>>> + *
>>> + * This callback runs in an RCU read-side critical section to allow the
>>> + * destruction sequence to easily synchronize_rcu() with all pending timer
>>> + * after asynchronously disabling them.
>>> + */
>>>    static inline void __ha_monitor_timer_callback(struct ha_monitor *ha_mon)
>>>    {
>>> -	enum states curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
>>>    	DECLARE_SEQ_BUF(env_string, ENV_BUFFER_SIZE);
>>> -	u64 time_ns = ha_get_ns();
>>> +	enum states curr_state;
>>> +	u64 time_ns;
>>> +
>>> +	if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
>>> +		return;
>>>    
>>> +	guard(rcu)();
>>> +	curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
>>> +	time_ns = ha_get_ns();
>>>    	ha_get_env_string(&env_string, ha_mon, time_ns);
>>>    	ha_react(curr_state, EVENT_NONE, env_string.buffer);
>>>    	ha_trace_error_env(ha_mon, model_get_state_name(curr_state),
> 

^ permalink raw reply

* [PATCH v5] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19 17:01 UTC (permalink / raw)
  To: LKML, Linux trace kernel, bpf
  Cc: Masami Hiramatsu, Mathieu Desnoyers, Mark Rutland, Peter Zijlstra,
	Namhyung Kim, Takaya Saeki, Douglas Raillard, Tom Zanussi,
	Andrew Morton, Thomas Gleixner, Ian Rogers, Jiri Olsa

From: Steven Rostedt <rostedt@goodmis.org>

Add syntax to the parsing of eprobes to be able to typecast a trace event
field that is a pointer to a structure.

Currently, a dereference must be a number, where the user has to figure
out manually the offset of a member of a structure that they want to
dereference.

But for event probes that records a field that happens to be a pointer to
a structure, it cannot dereference these values with BTF naming, but
must use numerical offsets.

For example, to find out what device a sk_buff is pointing to in the
net_dev_xmit trace event, one must first use gdb to find the offsets of the
members of the structures:

 (gdb) p &((struct sk_buff *)0)->dev
 $1 = (struct net_device **) 0x10
 (gdb) p &((struct net_device *)0)->name
 $2 = (char (*)[16]) 0x118

And then use the raw numbers to dereference:

  # echo 'e:xmit net.net_dev_xmit +0x118(+0x10($skbaddr)):string' >> dynamic_events

If BTF is in the kernel, then instead, the skbaddr can be typecast to
sk_buff and use the normal dereference logic.

  # echo 'e:xmit net.net_dev_xmit (sk_buff)skbaddr->dev->name:string' >> dynamic_events
  # echo 1 > events/eprobes/xmit/enable
  # cat trace
[..]
    sshd-session-1022    [000] b..2.   860.249343: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.250061: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.250142: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.263553: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.283820: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.302716: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.322905: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.342828: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.362268: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.382335: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.400856: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.419893: xmit: (net.net_dev_xmit) arg1="enp7s0"

The syntax is simply: (STRUCT)(FIELD)->MEMBER[->MEMBER..]

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes from v4: https://patch.msgid.link/20260518232312.0c78f055@gandalf.local.home

- Only describe this for eprobes, as that is currently the only user of
  this. Kprobes registers can come later. (Sashiko)

- Update the documentation to only be in the eprobetrace.rst.

- Make the btf value separate for structures and functions. As a function
  may use a BTF from a module, it can point to a structure that is in
  vmlinux. Make them separate and add a ctx_btf() helper function to
  figure out which one to use. (Masami Hiramatsu)

- Remove updating query_btf_context() by having it check funcname first.
  The above ctx_btf() helper fixes this.

- Rename TPARG_FL_STRUCT to TPARG_FL_TYPECAST (Masami Hiramatsu)

- Have the typecast code call parse_btf_arg() directly, instead of going
  through parse_probe_vars(). This fixes issues with updating the pcode
  value, and simplifies the formatting and code (Sashiko).

  Added a parse_trace_event() helper to duplicate the handling of
  TPARG_FL_TEVENT.

- Use "goto found_type;" instead of relying on type being NULL in
  parse_btf_arg(). (Masami Hiramatsu)

- Add query_btf_struct() stub function when CONFIG_PROBE_EVENTS_BTF_ARGS
  is not defined. (kernel test robot)

- Remove need for '*' in typecast (Masami Hiramatsu)

- Have the event fields not use '$' when the typecast is used.
  (Masami Hiramatsu)

 Documentation/trace/eprobetrace.rst |   4 +
 kernel/trace/trace_probe.c          | 158 ++++++++++++++++++++++------
 kernel/trace/trace_probe.h          |   4 +
 3 files changed, 135 insertions(+), 31 deletions(-)

diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index 89b5157cfab8..fe3602540569 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -46,6 +46,10 @@ Synopsis of eprobe_events
 		  (x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char",
                   "string", "ustring", "symbol", "symstr" and "bitfield" are
                   supported.
+  (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+                  a pointer to STRUCT and then derference the pointer defined by
+                  ->MEMBER. Note that when this is used, the FIELD name does not
+                  need to be prefixed with a '$'.
 
 Types
 -----
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e0d3a0da26af..cd54deb985f4 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -376,11 +376,17 @@ static bool btf_type_is_char_array(struct btf *btf, const struct btf_type *type)
 		&& BTF_INT_BITS(intdata) == 8;
 }
 
+static struct btf *ctx_btf(struct traceprobe_parse_context *ctx)
+{
+	return ctx->flags & TPARG_FL_TYPECAST ?
+		ctx->struct_btf : ctx->btf;
+}
+
 static int check_prepare_btf_string_fetch(char *typename,
 				struct fetch_insn **pcode,
 				struct traceprobe_parse_context *ctx)
 {
-	struct btf *btf = ctx->btf;
+	struct btf *btf = ctx_btf(ctx);
 
 	if (!btf || !ctx->last_type)
 		return 0;
@@ -464,6 +470,27 @@ static const char *fetch_type_from_btf_type(struct btf *btf,
 	return NULL;
 }
 
+static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
+{
+	int id;
+
+	if (!ctx->struct_btf) {
+		struct btf *btf;
+
+		id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf);
+		if (id < 0)
+			return id;
+		ctx->struct_btf = btf;
+	} else {
+		id = btf_find_by_name_kind(ctx->struct_btf, sname, BTF_KIND_STRUCT);
+		if (id < 0)
+			return id;
+	}
+
+	ctx->last_struct = btf_type_by_id(ctx->struct_btf, id);
+	return 0;
+}
+
 static int query_btf_context(struct traceprobe_parse_context *ctx)
 {
 	const struct btf_param *param;
@@ -515,6 +542,10 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
 		ctx->params = NULL;
 		ctx->nr_params = 0;
 	}
+	if (ctx->struct_btf) {
+		btf_put(ctx->struct_btf);
+		ctx->last_struct = NULL;
+	}
 }
 
 /* Return 1 if the field separator is arrow operator ('->') */
@@ -554,22 +585,29 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
 	struct fetch_insn *code = *pcode;
 	const struct btf_member *field;
 	u32 bitoffs, anon_offs;
+	bool is_struct = ctx->flags & TPARG_FL_TYPECAST;
+	struct btf *btf = ctx_btf(ctx);
 	char *next;
 	int is_ptr;
 	s32 tid;
 
 	do {
-		/* Outer loop for solving arrow operator ('->') */
-		if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
-			trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
-			return -EINVAL;
-		}
-		/* Convert a struct pointer type to a struct type */
-		type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
-		if (!type) {
-			trace_probe_log_err(ctx->offset, BAD_BTF_TID);
-			return -EINVAL;
+		if (!is_struct) {
+			/* Outer loop for solving arrow operator ('->') */
+			if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
+				trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+				return -EINVAL;
+			}
+
+			/* Convert a struct pointer type to a struct type */
+			type = btf_type_skip_modifiers(btf, type->type, &tid);
+			if (!type) {
+				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+				return -EINVAL;
+			}
 		}
+		/* Only the first type can skip being a pointer */
+		is_struct = false;
 
 		bitoffs = 0;
 		do {
@@ -580,7 +618,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
 				return is_ptr;
 
 			anon_offs = 0;
-			field = btf_find_struct_member(ctx->btf, type, fieldname,
+			field = btf_find_struct_member(btf, type, fieldname,
 						       &anon_offs);
 			if (IS_ERR(field)) {
 				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
@@ -602,7 +640,7 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
 				ctx->last_bitsize = 0;
 			}
 
-			type = btf_type_skip_modifiers(ctx->btf, field->type, &tid);
+			type = btf_type_skip_modifiers(btf, field->type, &tid);
 			if (!type) {
 				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
 				return -EINVAL;
@@ -627,6 +665,26 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
 	return 0;
 }
 
+
+static int parse_trace_event(char *arg, struct fetch_insn *code,
+			     struct traceprobe_parse_context *ctx)
+{
+	int ret;
+
+	if (code->data)
+		return -EFAULT;
+	ret = parse_trace_event_arg(arg, code, ctx);
+	if (!ret)
+		return 0;
+	if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
+		code->op = FETCH_OP_COMM;
+		return 0;
+	}
+	/* backward compatibility */
+	ctx->offset = 0;
+	return -EINVAL;
+}
+
 static int __store_entry_arg(struct trace_probe *tp, int argnum);
 
 static int parse_btf_arg(char *varname,
@@ -636,11 +694,12 @@ static int parse_btf_arg(char *varname,
 	struct fetch_insn *code = *pcode;
 	const struct btf_param *params;
 	const struct btf_type *type;
+	struct btf *btf = ctx_btf(ctx);
 	char *field = NULL;
 	int i, is_ptr, ret;
 	u32 tid;
 
-	if (WARN_ON_ONCE(!ctx->funcname))
+	if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TYPECAST)))
 		return -EINVAL;
 
 	is_ptr = split_next_field(varname, &field, ctx);
@@ -653,6 +712,14 @@ static int parse_btf_arg(char *varname,
 		return -EOPNOTSUPP;
 	}
 
+	if (ctx->flags & TPARG_FL_TEVENT) {
+		int ret;
+
+		ret = parse_trace_event(varname, code, ctx);
+		if (ret < 0)
+			return ret;
+	}
+
 	if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
 		code->op = FETCH_OP_RETVAL;
 		/* Check whether the function return type is not void */
@@ -672,7 +739,7 @@ static int parse_btf_arg(char *varname,
 		return 0;
 	}
 
-	if (!ctx->btf) {
+	if (!btf) {
 		ret = query_btf_context(ctx);
 		if (ret < 0 || ctx->nr_params == 0) {
 			trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
@@ -682,7 +749,7 @@ static int parse_btf_arg(char *varname,
 	params = ctx->params;
 
 	for (i = 0; i < ctx->nr_params; i++) {
-		const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
+		const char *name = btf_name_by_offset(btf, params[i].name_off);
 
 		if (name && !strcmp(name, varname)) {
 			if (tparg_is_function_entry(ctx->flags)) {
@@ -704,15 +771,22 @@ static int parse_btf_arg(char *varname,
 			goto found;
 		}
 	}
+
+	if (ctx->flags & TPARG_FL_TYPECAST) {
+		type = ctx->last_struct;
+		goto found_type;
+	}
+
 	trace_probe_log_err(ctx->offset, NO_BTFARG);
 	return -ENOENT;
 
 found:
-	type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+	type = btf_type_skip_modifiers(btf, tid, &tid);
 	if (!type) {
 		trace_probe_log_err(ctx->offset, BAD_BTF_TID);
 		return -EINVAL;
 	}
+found_type:
 	/* Initialize the last type information */
 	ctx->last_type = type;
 	ctx->last_bitoffs = 0;
@@ -727,7 +801,7 @@ static int parse_btf_arg(char *varname,
 static const struct fetch_type *find_fetch_type_from_btf_type(
 					struct traceprobe_parse_context *ctx)
 {
-	struct btf *btf = ctx->btf;
+	struct btf *btf = ctx_btf(ctx);
 	const char *typestr = NULL;
 
 	if (btf && ctx->last_type)
@@ -764,6 +838,11 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
 	ctx->btf = NULL;
 }
 
+static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
+{
+	return -EOPNOTSUPP;
+}
+
 static int query_btf_context(struct traceprobe_parse_context *ctx)
 {
 	return -EOPNOTSUPP;
@@ -953,18 +1032,9 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
 	int len;
 
 	if (ctx->flags & TPARG_FL_TEVENT) {
-		if (code->data)
-			return -EFAULT;
-		ret = parse_trace_event_arg(arg, code, ctx);
-		if (!ret)
-			return 0;
-		if (strcmp(arg, "comm") == 0 || strcmp(arg, "COMM") == 0) {
-			code->op = FETCH_OP_COMM;
-			return 0;
-		}
-		/* backward compatibility */
-		ctx->offset = 0;
-		goto inval;
+		if (parse_trace_event(arg, code, ctx) < 0)
+			goto inval;
+		return 0;
 	}
 
 	if (str_has_prefix(arg, "retval")) {
@@ -1231,6 +1301,30 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 				code->op = FETCH_OP_IMM;
 		}
 		break;
+	case '(':
+		tmp = strchr(arg, ')');
+		if (!tmp) {
+			trace_probe_log_err(ctx->offset + strlen(arg),
+					    DEREF_OPEN_BRACE);
+			return -EINVAL;
+		}
+		*tmp = '\0';
+		ret = query_btf_struct(arg + 1, ctx);
+		*tmp = ')';
+
+		if (ret < 0) {
+			trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
+			return -EINVAL;
+		}
+
+		ctx->flags |= TPARG_FL_TYPECAST;
+		tmp++;
+
+		ctx->offset += tmp - arg;
+		ret = parse_btf_arg(tmp, pcode, end, ctx);
+		ctx->flags &= ~TPARG_FL_TYPECAST;
+		ctx->last_struct = NULL;
+		break;
 	default:
 		if (isalpha(arg[0]) || arg[0] == '_') {	/* BTF variable */
 			if (!tparg_is_function_entry(ctx->flags) &&
@@ -1504,6 +1598,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 	code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
 
 	ctx->last_type = NULL;
+	ctx->last_struct = NULL;
 	ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
 			      ctx);
 	if (ret < 0)
@@ -1705,13 +1800,14 @@ static int sprint_nth_btf_arg(int idx, const char *type,
 			      struct traceprobe_parse_context *ctx)
 {
 	const char *name;
+	struct btf *btf = ctx_btf(ctx);
 	int ret;
 
 	if (idx >= ctx->nr_params) {
 		trace_probe_log_err(0, NO_BTFARG);
 		return -ENOENT;
 	}
-	name = btf_name_by_offset(ctx->btf, ctx->params[idx].name_off);
+	name = btf_name_by_offset(btf, ctx->params[idx].name_off);
 	if (!name) {
 		trace_probe_log_err(0, NO_BTF_ENTRY);
 		return -ENOENT;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 262d8707a3df..6735b8b2432b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -394,6 +394,7 @@ static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
  * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive.
  * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with
  * TPARG_FL_KERNEL.
+ * TPARG_FL_TYPECAST is set if an argument was typecast to a structure.
  */
 #define TPARG_FL_RETURN BIT(0)
 #define TPARG_FL_KERNEL BIT(1)
@@ -402,6 +403,7 @@ static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
 #define TPARG_FL_USER   BIT(4)
 #define TPARG_FL_FPROBE BIT(5)
 #define TPARG_FL_TPOINT BIT(6)
+#define TPARG_FL_TYPECAST BIT(7)
 #define TPARG_FL_LOC_MASK	GENMASK(4, 0)
 
 static inline bool tparg_is_function_entry(unsigned int flags)
@@ -422,7 +424,9 @@ struct traceprobe_parse_context {
 	const struct btf_param *params;	/* Parameter of the function */
 	s32 nr_params;			/* The number of the parameters */
 	struct btf *btf;		/* The BTF to be used */
+	struct btf *struct_btf;		/* The BTF to be used for structs */
 	const struct btf_type *last_type;	/* Saved type */
+	const struct btf_type *last_struct;	/* Saved structure */
 	u32 last_bitoffs;		/* Saved bitoffs */
 	u32 last_bitsize;		/* Saved bitsize */
 	struct trace_probe *tp;
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v5] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19 17:28 UTC (permalink / raw)
  To: LKML, Linux trace kernel, bpf
  Cc: Masami Hiramatsu, Mathieu Desnoyers, Mark Rutland, Peter Zijlstra,
	Namhyung Kim, Takaya Saeki, Douglas Raillard, Tom Zanussi,
	Andrew Morton, Thomas Gleixner, Ian Rogers, Jiri Olsa
In-Reply-To: <20260519130144.40e71a00@fedora>

On Tue, 19 May 2026 13:01:44 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> @@ -636,11 +694,12 @@ static int parse_btf_arg(char *varname,
>  	struct fetch_insn *code = *pcode;
>  	const struct btf_param *params;
>  	const struct btf_type *type;
> +	struct btf *btf = ctx_btf(ctx);
>  	char *field = NULL;
>  	int i, is_ptr, ret;
>  	u32 tid;
>  
> -	if (WARN_ON_ONCE(!ctx->funcname))
> +	if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TYPECAST)))
>  		return -EINVAL;
>  
>  	is_ptr = split_next_field(varname, &field, ctx);
> @@ -653,6 +712,14 @@ static int parse_btf_arg(char *varname,
>  		return -EOPNOTSUPP;
>  	}
>  
> +	if (ctx->flags & TPARG_FL_TEVENT) {
> +		int ret;
> +
> +		ret = parse_trace_event(varname, code, ctx);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
>  	if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
>  		code->op = FETCH_OP_RETVAL;
>  		/* Check whether the function return type is not void */
> @@ -672,7 +739,7 @@ static int parse_btf_arg(char *varname,
>  		return 0;
>  	}
>  
> -	if (!ctx->btf) {
> +	if (!btf) {
>  		ret = query_btf_context(ctx);

Oops, need a:

		btf = ctx->btf;

here!

-- Steve

>  		if (ret < 0 || ctx->nr_params == 0) {
>  			trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);

^ permalink raw reply

* Re: [PATCH v5] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19 17:37 UTC (permalink / raw)
  To: LKML, Linux trace kernel, bpf
  Cc: Masami Hiramatsu, Mathieu Desnoyers, Mark Rutland, Peter Zijlstra,
	Namhyung Kim, Takaya Saeki, Douglas Raillard, Tom Zanussi,
	Andrew Morton, Thomas Gleixner, Ian Rogers, Jiri Olsa
In-Reply-To: <20260519132830.3011a194@fedora>

On Tue, 19 May 2026 13:28:30 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 19 May 2026 13:01:44 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > @@ -636,11 +694,12 @@ static int parse_btf_arg(char *varname,
> >  	struct fetch_insn *code = *pcode;
> >  	const struct btf_param *params;
> >  	const struct btf_type *type;
> > +	struct btf *btf = ctx_btf(ctx);
> >  	char *field = NULL;
> >  	int i, is_ptr, ret;
> >  	u32 tid;
> >  
> > -	if (WARN_ON_ONCE(!ctx->funcname))
> > +	if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_TYPECAST)))
> >  		return -EINVAL;
> >  
> >  	is_ptr = split_next_field(varname, &field, ctx);
> > @@ -653,6 +712,14 @@ static int parse_btf_arg(char *varname,
> >  		return -EOPNOTSUPP;
> >  	}
> >  
> > +	if (ctx->flags & TPARG_FL_TEVENT) {
> > +		int ret;
> > +
> > +		ret = parse_trace_event(varname, code, ctx);
> > +		if (ret < 0)
> > +			return ret;

Actually, since typecasting is currently only for eprobes, I'm going to
shuffle the code around a bit to only have the TYPECAST affect TEVENT,
and do the jump to found_type from here.

-- Steve


> > +	}
> > +
> >  	if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
> >  		code->op = FETCH_OP_RETVAL;
> >  		/* Check whether the function return type is not void */
> > @@ -672,7 +739,7 @@ static int parse_btf_arg(char *varname,
> >  		return 0;
> >  	}
> >  
> > -	if (!ctx->btf) {
> > +	if (!btf) {
> >  		ret = query_btf_context(ctx);  
> 
> Oops, need a:
> 
> 		btf = ctx->btf;
> 
> here!
> 
> -- Steve
> 
> >  		if (ret < 0 || ctx->nr_params == 0) {
> >  			trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);  


^ permalink raw reply

* Re: [PATCH v3 1/2] spi: qcom-geni: trace: Add trace events for Qualcomm GENI SPI
From: Steven Rostedt @ 2026-05-19 17:54 UTC (permalink / raw)
  To: Praveen Talari
  Cc: Masami Hiramatsu, Mathieu Desnoyers, Mark Brown, linux-kernel,
	linux-trace-kernel, linux-arm-msm, linux-spi, mukesh.savaliya,
	aniket.randive, chandana.chiluveru, jyothi.seerapu, Konrad Dybcio
In-Reply-To: <20260518-add-tracepoints-for-qcom-geni-spi-v3-1-7928f6810a79@oss.qualcomm.com>

On Mon, 18 May 2026 22:30:51 +0530
Praveen Talari <praveen.talari@oss.qualcomm.com> wrote:

> @@ -0,0 +1,103 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM qcom_geni_spi
> +
> +#if !defined(_TRACE_QCOM_GENI_SPI_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_QCOM_GENI_SPI_H
> +
> +#include <linux/tracepoint.h>
> +
> +TRACE_EVENT(geni_spi_setup_params,
> +	    TP_PROTO(struct device *dev, u8 cs, u32 mode,
> +		     u32 mode_changed, bool cs_changed),
> +	    TP_ARGS(dev, cs, mode, mode_changed, cs_changed),
> +
> +	    TP_STRUCT__entry(__string(name, dev_name(dev))
> +			     __field(u8, cs)

A u8 followed by a u32 will create a 3 byte hole in the structure
layout that gets recorded onto the ring buffer. Best to move that field
to after the bool cs_changed, for better compaction.

> +			     __field(u32, mode)
> +			     __field(u32, mode_changed)
> +			     __field(bool, cs_changed)
> +	    ),
> +
> +	    TP_fast_assign(__assign_str(name);
> +			   __entry->cs = cs;
> +			   __entry->mode = mode;
> +			   __entry->mode_changed = mode_changed;
> +			   __entry->cs_changed = cs_changed;
> +	    ),
> +
> +	    TP_printk("%s: cs=%u mode=0x%08x mode_changed=0x%08x cs_changed=%d",
> +		      __get_str(name), __entry->cs, __entry->mode,
> +		      __entry->mode_changed, __entry->cs_changed)
> +);
> +
> +TRACE_EVENT(geni_spi_clk_cfg,
> +	    TP_PROTO(struct device *dev, unsigned long req_hz,
> +		     unsigned long sclk_hz, unsigned int clk_idx,
> +		     unsigned int clk_div, unsigned int bpw),
> +	    TP_ARGS(dev, req_hz, sclk_hz, clk_idx, clk_div, bpw),
> +
> +	    TP_STRUCT__entry(__string(name, dev_name(dev))

__string items inject a 4 byte meta data so they are basically the same
as a u32 item on the structure. Move this to the end or after the long
words so that on 64 bit architectures you don't create a 4 byte hole
here.

-- Steve

> +			     __field(unsigned long, req_hz)
> +			     __field(unsigned long, sclk_hz)
> +			     __field(unsigned int, clk_idx)
> +			     __field(unsigned int, clk_div)
> +			     __field(unsigned int, bpw)
> +	    ),
> +
> +	    TP_fast_assign(__assign_str(name);
> +			   __entry->req_hz = req_hz;
> +			   __entry->sclk_hz = sclk_hz;
> +			   __entry->clk_idx = clk_idx;
> +			   __entry->clk_div = clk_div;
> +			   __entry->bpw = bpw;
> +	    ),
> +
> +	    TP_printk("%s: req_hz=%lu sclk_hz=%lu clk_idx=%u clk_div=%u bpw=%u",
> +		      __get_str(name), __entry->req_hz, __entry->sclk_hz,
> +		      __entry->clk_idx, __entry->clk_div, __entry->bpw)
> +);

^ permalink raw reply

* Re: [PATCH] gpu: host1x: trace: fix string fields in host1x traces
From: Steven Rostedt @ 2026-05-19 18:10 UTC (permalink / raw)
  To: Artur Kowalski
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, linux-tegra, Thierry Reding, Mikko Perttunen,
	David Airlie, Simona Vetter
In-Reply-To: <20260519-host1x-tracing-v1-1-55afb8cbd186@gmail.com>

On Tue, 19 May 2026 12:16:43 +0200
Artur Kowalski <arturkow2000@gmail.com> wrote:

> Use __assign_str and __get_str as required by tracing subsystem. Fixes
> string fields being rejected by the verifier and unreadable from
> userspace.

Does anyone use these tracepoints? The fact that they have been broken
for 5 years and nobody noticed makes me think they are useless.

I rather remove them than fix them, but if someone thinks that these
are still useful then by all means apply this patch.

Acked-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve


> 
> Tested on v6.18.21.
> 
> Signed-off-by: Artur Kowalski <arturkow2000@gmail.com>
> ---
>  include/trace/events/host1x.h | 50 ++++++++++++++++++++++---------------------
>  1 file changed, 26 insertions(+), 24 deletions(-)
> 
> diff --git a/include/trace/events/host1x.h b/include/trace/events/host1x.h
> index 1ba84b738e46..1b6aeb7b177b 100644
> --- a/include/trace/events/host1x.h
> +++ b/include/trace/events/host1x.h
> @@ -21,9 +21,11 @@ struct host1x_bo;
>  DECLARE_EVENT_CLASS(host1x,
>  	TP_PROTO(const char *name),
>  	TP_ARGS(name),
> -	TP_STRUCT__entry(__field(const char *, name)),
> -	TP_fast_assign(__entry->name = name;),
> -	TP_printk("name=%s", __entry->name)
> +	TP_STRUCT__entry(__string(name, name)),
> +	TP_fast_assign(
> +		__assign_str(name);
> +	),
> +	TP_printk("name=%s", __get_str(name))
>  );
>  
>  DEFINE_EVENT(host1x, host1x_channel_open,
> @@ -52,19 +54,19 @@ TRACE_EVENT(host1x_cdma_push,
>  	TP_ARGS(name, op1, op2),
>  
>  	TP_STRUCT__entry(
> -		__field(const char *, name)
> +		__string(name, name)
>  		__field(u32, op1)
>  		__field(u32, op2)
>  	),
>  
>  	TP_fast_assign(
> -		__entry->name = name;
> +		__assign_str(name);
>  		__entry->op1 = op1;
>  		__entry->op2 = op2;
>  	),
>  
>  	TP_printk("name=%s, op1=%08x, op2=%08x",
> -		__entry->name, __entry->op1, __entry->op2)
> +		__get_str(name), __entry->op1, __entry->op2)
>  );
>  
>  TRACE_EVENT(host1x_cdma_push_wide,
> @@ -73,7 +75,7 @@ TRACE_EVENT(host1x_cdma_push_wide,
>  	TP_ARGS(name, op1, op2, op3, op4),
>  
>  	TP_STRUCT__entry(
> -		__field(const char *, name)
> +		__string(name, name)
>  		__field(u32, op1)
>  		__field(u32, op2)
>  		__field(u32, op3)
> @@ -81,7 +83,7 @@ TRACE_EVENT(host1x_cdma_push_wide,
>  	),
>  
>  	TP_fast_assign(
> -		__entry->name = name;
> +		__assign_str(name);
>  		__entry->op1 = op1;
>  		__entry->op2 = op2;
>  		__entry->op3 = op3;
> @@ -89,7 +91,7 @@ TRACE_EVENT(host1x_cdma_push_wide,
>  	),
>  
>  	TP_printk("name=%s, op1=%08x, op2=%08x, op3=%08x op4=%08x",
> -		__entry->name, __entry->op1, __entry->op2, __entry->op3,
> +		__get_str(name), __entry->op1, __entry->op2, __entry->op3,
>  		__entry->op4)
>  );
>  
> @@ -100,7 +102,7 @@ TRACE_EVENT(host1x_cdma_push_gather,
>  	TP_ARGS(name, bo, words, offset, cmdbuf),
>  
>  	TP_STRUCT__entry(
> -		__field(const char *, name)
> +		__string(name, name)
>  		__field(struct host1x_bo *, bo)
>  		__field(u32, words)
>  		__field(u32, offset)
> @@ -114,14 +116,14 @@ TRACE_EVENT(host1x_cdma_push_gather,
>  					words * sizeof(u32));
>  		}
>  		__entry->cmdbuf = cmdbuf;
> -		__entry->name = name;
> +		__assign_str(name);
>  		__entry->bo = bo;
>  		__entry->words = words;
>  		__entry->offset = offset;
>  	),
>  
>  	TP_printk("name=%s, bo=%p, words=%u, offset=%d, contents=[%s]",
> -	  __entry->name, __entry->bo,
> +	  __get_str(name), __entry->bo,
>  	  __entry->words, __entry->offset,
>  	  __print_hex(__get_dynamic_array(cmdbuf),
>  		  __entry->cmdbuf ? __entry->words * 4 : 0))
> @@ -134,7 +136,7 @@ TRACE_EVENT(host1x_channel_submit,
>  	TP_ARGS(name, cmdbufs, relocs, syncpt_id, syncpt_incrs),
>  
>  	TP_STRUCT__entry(
> -		__field(const char *, name)
> +		__string(name, name)
>  		__field(u32, cmdbufs)
>  		__field(u32, relocs)
>  		__field(u32, syncpt_id)
> @@ -142,7 +144,7 @@ TRACE_EVENT(host1x_channel_submit,
>  	),
>  
>  	TP_fast_assign(
> -		__entry->name = name;
> +		__assign_str(name);
>  		__entry->cmdbufs = cmdbufs;
>  		__entry->relocs = relocs;
>  		__entry->syncpt_id = syncpt_id;
> @@ -151,7 +153,7 @@ TRACE_EVENT(host1x_channel_submit,
>  
>  	TP_printk("name=%s, cmdbufs=%u, relocs=%u, syncpt_id=%u, "
>  		  "syncpt_incrs=%u",
> -		  __entry->name, __entry->cmdbufs, __entry->relocs,
> +		  __get_str(name), __entry->cmdbufs, __entry->relocs,
>  		  __entry->syncpt_id, __entry->syncpt_incrs)
>  );
>  
> @@ -161,19 +163,19 @@ TRACE_EVENT(host1x_channel_submitted,
>  	TP_ARGS(name, syncpt_base, syncpt_max),
>  
>  	TP_STRUCT__entry(
> -		__field(const char *, name)
> +		__string(name, name)
>  		__field(u32, syncpt_base)
>  		__field(u32, syncpt_max)
>  	),
>  
>  	TP_fast_assign(
> -		__entry->name = name;
> +		__assign_str(name);
>  		__entry->syncpt_base = syncpt_base;
>  		__entry->syncpt_max = syncpt_max;
>  	),
>  
>  	TP_printk("name=%s, syncpt_base=%d, syncpt_max=%d",
> -		__entry->name, __entry->syncpt_base, __entry->syncpt_max)
> +		__get_str(name), __entry->syncpt_base, __entry->syncpt_max)
>  );
>  
>  TRACE_EVENT(host1x_channel_submit_complete,
> @@ -182,19 +184,19 @@ TRACE_EVENT(host1x_channel_submit_complete,
>  	TP_ARGS(name, count, thresh),
>  
>  	TP_STRUCT__entry(
> -		__field(const char *, name)
> +		__string(name, name)
>  		__field(int, count)
>  		__field(u32, thresh)
>  	),
>  
>  	TP_fast_assign(
> -		__entry->name = name;
> +		__assign_str(name);
>  		__entry->count = count;
>  		__entry->thresh = thresh;
>  	),
>  
>  	TP_printk("name=%s, count=%d, thresh=%d",
> -		__entry->name, __entry->count, __entry->thresh)
> +		__get_str(name), __entry->count, __entry->thresh)
>  );
>  
>  TRACE_EVENT(host1x_wait_cdma,
> @@ -203,16 +205,16 @@ TRACE_EVENT(host1x_wait_cdma,
>  	TP_ARGS(name, eventid),
>  
>  	TP_STRUCT__entry(
> -		__field(const char *, name)
> +		__string(name, name)
>  		__field(u32, eventid)
>  	),
>  
>  	TP_fast_assign(
> -		__entry->name = name;
> +		__assign_str(name);
>  		__entry->eventid = eventid;
>  	),
>  
> -	TP_printk("name=%s, event=%d", __entry->name, __entry->eventid)
> +	TP_printk("name=%s, event=%d", __get_str(name), __entry->eventid)
>  );
>  
>  TRACE_EVENT(host1x_syncpt_load_min,
> 
> ---
> base-commit: ab5fce87a778cb780a05984a2ca448f2b41aafbf
> change-id: 20260519-host1x-tracing-e2d608ec5e37
> 
> Best regards,
> --  
> Artur Kowalski <arturkow2000@gmail.com>


^ permalink raw reply

* Re: [PATCH v5] tracing/eprobes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19 18:17 UTC (permalink / raw)
  To: sashiko-bot
  Cc: sashiko-reviews, bpf, LKML, Linux trace kernel, Masami Hiramatsu
In-Reply-To: <20260519174848.176A6C2BCB3@smtp.kernel.org>

On Tue, 19 May 2026 17:48:47 +0000
sashiko-bot@kernel.org wrote:

> Thank you for your contribution! Sashiko AI review found 5 potential issue(s) to consider:
> - [High] Using `(STRUCT)` typecasts for non-eprobes leaves the base fetch instruction uninitialized (`FETCH_OP_NOP`), silently breaking the probe.
> - [High] Prematurely clearing `TPARG_FL_TYPECAST` causes cross-BTF type confusion and breaks string fetches.
> - [Medium] Erroneous caching of `struct_btf` restricts a single trace probe to using typecasts from only one BTF object, preventing cross-module lookups.
> - [Medium] Cross-BTF string table access in `parse_btf_arg()` causes invalid string comparisons.
> - [Low] `clear_btf_context()` fails to clear `ctx->struct_btf` after dropping its reference.
> --
> 
> commit de4b6287a7069fde24b0e0f679a545a06fb9ae6c
> Author: Steven Rostedt <rostedt@goodmis.org>
> 
> tracing/eprobes: Allow use of BTF names to dereference pointers
> 
> Add syntax to the parsing of eprobes to be able to typecast a trace event
> field that is a pointer to a structure.
> 
> > diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> > index e1c73065dae51..2bc9f4ca8aa7e 100644
> > --- a/kernel/trace/trace_probe.c
> > +++ b/kernel/trace/trace_probe.c
> > @@ -464,6 +470,27 @@ static const char *fetch_type_from_btf_type(struct btf *btf,
> >  	return NULL;
> >  }
> >  
> > +static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
> > +{
> > +	int id;
> > +
> > +	if (!ctx->struct_btf) {
> > +		struct btf *btf;
> > +
> > +		id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf);
> > +		if (id < 0)
> > +			return id;
> > +		ctx->struct_btf = btf;
> > +	} else {
> > +		id = btf_find_by_name_kind(ctx->struct_btf, sname, BTF_KIND_STRUCT);  
> 
> [Severity: Medium]
> Does caching struct_btf here restrict a single trace probe event to using
> typecasts from only one BTF object?
> 
> If a probe attempts to typecast structures from two different kernel
> modules, wouldn't the lookup for the second module's struct fail because
> it only searches the cached BTF object of the first module (or vmlinux)?

Hmm, good point. I guess I should clear the struct_btf at the end of
the '(' switch case.

> 
> > +		if (id < 0)
> > +			return id;
> > +	}
> > +
> > +	ctx->last_struct = btf_type_by_id(ctx->struct_btf, id);
> > +	return 0;
> > +}  
> 
> [ ... ]
> 
> > @@ -515,6 +542,10 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
> >  		ctx->params = NULL;
> >  		ctx->nr_params = 0;
> >  	}
> > +	if (ctx->struct_btf) {
> > +		btf_put(ctx->struct_btf);
> > +		ctx->last_struct = NULL;  
> 
> [Severity: Low]
> Should ctx->struct_btf be explicitly set to NULL after btf_put() drops
> the reference?

I'm thinking of dropping it in the '(' switch case.

> 
> While the traceprobe_parse_context might be destroyed shortly after
> parsing completes, leaving a dangling pointer could potentially lead to a
> use-after-free and double-free if any future refactoring reuses the
> parsing context across multiple trace events.
> 
> > +	}
> >  }  
> 
> [ ... ]
> 
> > @@ -672,7 +739,7 @@ static int parse_btf_arg(char *varname,
> >  		return 0;
> >  	}
> >  
> > -	if (!ctx->btf) {
> > +	if (!btf) {
> >  		ret = query_btf_context(ctx);
> >  		if (ret < 0 || ctx->nr_params == 0) {
> >  			trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
> > @@ -682,7 +749,7 @@ static int parse_btf_arg(char *varname,
> >  	params = ctx->params;
> >  
> >  	for (i = 0; i < ctx->nr_params; i++) {
> > -		const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
> > +		const char *name = btf_name_by_offset(btf, params[i].name_off);  
> 
> [Severity: Medium]
> When parsing a typecast argument, btf evaluates to ctx->struct_btf. The
> parameter matching loop then calls btf_name_by_offset() using that btf.
> 
> Since params[i].name_off originates from the function's BTF (ctx->btf,
> populated by previous non-typecast arguments), could this lead to reading
> from the wrong string table if ctx->struct_btf is a different kernel
> module?

The new patch will skip this for TYPECAST arguments. Well, at least for
trace events.

> 
> This might result in invalid string comparisons or returning
> (invalid-name-offset) if it reads out of bounds.
> 
> >  
> >  		if (name && !strcmp(name, varname)) {
> >  			if (tparg_is_function_entry(ctx->flags)) {  
> 
> [ ... ]
> 
> > @@ -1231,6 +1301,30 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
> >  				code->op = FETCH_OP_IMM;
> >  		}
> >  		break;
> > +	case '(':
> > +		tmp = strchr(arg, ')');
> > +		if (!tmp) {
> > +			trace_probe_log_err(ctx->offset + strlen(arg),
> > +					    DEREF_OPEN_BRACE);
> > +			return -EINVAL;
> > +		}
> > +		*tmp = '\0';
> > +		ret = query_btf_struct(arg + 1, ctx);
> > +		*tmp = ')';
> > +
> > +		if (ret < 0) {
> > +			trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
> > +			return -EINVAL;
> > +		}
> > +
> > +		ctx->flags |= TPARG_FL_TYPECAST;
> > +		tmp++;
> > +
> > +		ctx->offset += tmp - arg;
> > +		ret = parse_btf_arg(tmp, pcode, end, ctx);  
> 
> [Severity: High]
> When parse_probe_arg() evaluates the (STRUCT) syntax for kprobes or
> fprobes, it sets TPARG_FL_TYPECAST and calls parse_btf_arg(). Inside
> parse_btf_arg(), ctx_btf() evaluates to ctx->struct_btf, so it bypasses
> query_btf_context() and jumps to found_type.
> 
> However, it doesn't seem to initialize code->op with a valid base fetch
> instruction like FETCH_OP_ARG or FETCH_OP_REG.
> 
> Could this leave the base instruction as FETCH_OP_NOP (0)? If so, it
> seems the base address would be 0, causing subsequent field dereferences
> to safely fail on 0 + offset due to copy_from_kernel_nofault(). This might
> allow unsupported syntax like (sk_buff)%di->dev to parse but silently
> fetch nothing.

Yeah, I may have to force this not to work with function types.

> 
> > +		ctx->flags &= ~TPARG_FL_TYPECAST;  
> 
> [Severity: High]
> The TPARG_FL_TYPECAST flag is cleared here, but ctx->last_type still
> points to a type originating from ctx->struct_btf.
> 
> Later in traceprobe_parse_probe_arg_body(),
> check_prepare_btf_string_fetch() is called. Since the flag is cleared,
> ctx_btf() will incorrectly return ctx->btf (vmlinux for kprobes, or
> NULL for eprobes).
> 
> Does this cause check_prepare_btf_string_fetch() to return 0 for eprobes
> and skip adding the necessary dummy FETCH_OP_DEREF for string fetches?
> Without this dummy dereference, string fetches on character pointers
> in eprobes might fail by reading the raw pointer value itself as a string.
> 
> Also, for kprobes, could this cause type confusion if
> btf_type_skip_modifiers() looks up a module's type ID inside the vmlinux
> BTF?

As mentioned above, I need to put and clear the last_struct here.

Thanks,

-- Steve


> 
> > +		ctx->last_struct = NULL;
> > +		break;  
> 


^ permalink raw reply

* Re: [PATCH mm-unstable v17 03/14] mm/khugepaged: rework max_ptes_* handling with helper functions
From: Nico Pache @ 2026-05-19 18:21 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
	mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe, Usama Arif
In-Reply-To: <4566170a-7e3d-49e4-baab-ba2790c198db@kernel.org>

On Tue, May 12, 2026 at 1:30 AM David Hildenbrand (Arm)
<david@kernel.org> wrote:
>
> On 5/11/26 20:58, Nico Pache wrote:
> > The following cleanup reworks all the max_ptes_* handling into helper
> > functions. This increases the code readability and will later be used to
> > implement the mTHP handling of these variables.
> >
> > With these changes we abstract all the madvise_collapse() special casing
> > (dont respect the sysctls) away from the functions that utilize them. And
> > will be used later in this series to cleanly restrict the mTHP collapse
> > behavior.
> >
> > No functional change is intended; however, we are now only reading the
> > sysfs variables once per scan, whereas before these variables were being
> > read on each loop iteration.
> >
> > Suggested-by: David Hildenbrand <david@kernel.org>
> > Acked-by: David Hildenbrand (Arm) <david@kernel.org>
>
> Some nits when re-reading:
>
> > Acked-by: Usama Arif <usama.arif@linux.dev>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >  mm/khugepaged.c | 118 +++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 82 insertions(+), 36 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index f0e29d5c7b1f..f68853b3caa7 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -348,6 +348,62 @@ static bool pte_none_or_zero(pte_t pte)
> >       return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
> >  }
> >
> > +/**
> > + * collapse_max_ptes_none - Calculate maximum allowed none-page or zero-page
>
> I know, it's painful, but ...
>
> There is no "none-page".

Yeah I think you mentioned that last review... sorry!

>
> Calculate maximum allowed empty PTEs or PTEs mapping the shared zeropage ... ?
>
> > + * PTEs for the given collapse operation.
>
> We usually indent here (second line of subject), I think. Same applies to the
> other doc below.

Hmm tbh I couldn't find a example of what you meant here. There are
some that put a space between the first sentence and the @ list.

>
> > + * @cc: The collapse control struct
> > + * @vma: The vma to check for userfaultfd
> > + *
> > + * Return: Maximum number of none-page or zero-page PTEs allowed for the
> > + * collapse operation.
>
> Same here.
>
> > + */
> > +static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
> > +             struct vm_area_struct *vma)
> > +{
> > +     // If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
>
> Lance commented on the comment style.
>
> Is this comment really required? It's pretty self-documenting already.

Dropped it, thanks.

>
> > +     if (vma && userfaultfd_armed(vma))
> > +             return 0;
> > +     // for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
> > +     if (!cc->is_khugepaged)
> > +             return HPAGE_PMD_NR;
> > +     // For all other cases repect the user defined maximum.
> > +     return khugepaged_max_ptes_none;
> > +}
> > +
> --
> Cheers,
>
> David
>


^ permalink raw reply

* Re: [PATCH mm-unstable v17 03/14] mm/khugepaged: rework max_ptes_* handling with helper functions
From: Nico Pache @ 2026-05-19 18:21 UTC (permalink / raw)
  To: Lance Yang
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, aarcange,
	akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat, mhocko,
	peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
	rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
	surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe, usama.arif
In-Reply-To: <20260512044444.71798-1-lance.yang@linux.dev>

On Mon, May 11, 2026 at 10:45 PM Lance Yang <lance.yang@linux.dev> wrote:
>
>
> On Mon, May 11, 2026 at 12:58:03PM -0600, Nico Pache wrote:
> >The following cleanup reworks all the max_ptes_* handling into helper
> >functions. This increases the code readability and will later be used to
> >implement the mTHP handling of these variables.
> >
> >With these changes we abstract all the madvise_collapse() special casing
> >(dont respect the sysctls) away from the functions that utilize them. And
>
> Nit: s/dont/do not/
>
> >will be used later in this series to cleanly restrict the mTHP collapse
> >behavior.
> >
> >No functional change is intended; however, we are now only reading the
> >sysfs variables once per scan, whereas before these variables were being
> >read on each loop iteration.
> >
> >Suggested-by: David Hildenbrand <david@kernel.org>
> >Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> >Acked-by: Usama Arif <usama.arif@linux.dev>
> >Signed-off-by: Nico Pache <npache@redhat.com>
> >---
> > mm/khugepaged.c | 118 +++++++++++++++++++++++++++++++++---------------
> > 1 file changed, 82 insertions(+), 36 deletions(-)
> >
> >diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> >index f0e29d5c7b1f..f68853b3caa7 100644
> >--- a/mm/khugepaged.c
> >+++ b/mm/khugepaged.c
> >@@ -348,6 +348,62 @@ static bool pte_none_or_zero(pte_t pte)
> >       return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
> > }
> >
> >+/**
> >+ * collapse_max_ptes_none - Calculate maximum allowed none-page or zero-page
> >+ * PTEs for the given collapse operation.
> >+ * @cc: The collapse control struct
> >+ * @vma: The vma to check for userfaultfd
> >+ *
> >+ * Return: Maximum number of none-page or zero-page PTEs allowed for the
> >+ * collapse operation.
> >+ */
> >+static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
> >+              struct vm_area_struct *vma)
> >+{
> >+      // If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
> >+      if (vma && userfaultfd_armed(vma))
> >+              return 0;
> >+      // for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
> >+      if (!cc->is_khugepaged)
> >+              return HPAGE_PMD_NR;
> >+      // For all other cases repect the user defined maximum.
> >+      return khugepaged_max_ptes_none;
>
> Nit: kernel code usually uses C-style comments. This could be:
>
> /* For all other cases, respect the user-defined maximum. */
>
> Also, s/repect/respect/.
>
> >+}
> >+
> >+/**
> >+ * collapse_max_ptes_shared - Calculate maximum allowed PTEs that map shared
> >+ * anonymous pages for the given collapse operation.
> >+ * @cc: The collapse control struct
> >+ *
> >+ * Return: Maximum number of PTEs that map shared anonymous pages for the
> >+ * collapse operation
> >+ */
> >+static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
> >+{
> >+      // for MADV_COLLAPSE, do not restrict the number of PTEs that map shared
> >+      // anonymous pages.
>
> Ditto.
>
> >+      if (!cc->is_khugepaged)
> >+              return HPAGE_PMD_NR;
> >+      return khugepaged_max_ptes_shared;
> >+}
> >+
> >+/**
> >+ * collapse_max_ptes_swap - Calculate the maximum allowed non-present PTEs or the
> >+ * maximum allowed non-present pagecache entries for the given collapse operation.
> >+ * @cc: The collapse control struct
> >+ *
> >+ * Return: Maximum number of non-present PTEs or the maximum allowed non-present
> >+ * pagecache entries for the collapse operation.
> >+ */
> >+static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
> >+{
> >+      // for MADV_COLLAPSE, do not restrict the number PTEs entries or
> >+      // pagecache entries that are non-present.
>
> Same here.
>
> >+      if (!cc->is_khugepaged)
> >+              return HPAGE_PMD_NR;
> >+      return khugepaged_max_ptes_swap;
> >+}
> >+
> > int hugepage_madvise(struct vm_area_struct *vma,
> >                    vm_flags_t *vm_flags, int advice)
> > {
> >@@ -546,21 +602,19 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >       pte_t *_pte;
> >       int none_or_zero = 0, shared = 0, referenced = 0;
> >       enum scan_result result = SCAN_FAIL;
> >+      unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> >+      unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
>
> Nit: could these be const, as David suggested earlier?
>
> Nothing else jumped out at me. LGTM!
>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>

Ack on all the above thank you !

>


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox