Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19  3:23 UTC (permalink / raw)
  To: LKML, Linux Trace Kernel, bpf
  Cc: Masami Hiramatsu, Mathieu Desnoyers, Mark Rutland, Peter Zijlstra,
	Namhyung Kim, Takaya Saeki, Douglas Raillard, Tom Zanussi,
	Andrew Morton, Thomas Gleixner, Ian Rogers, Jiri Olsa,
	"Subject:[PATCH  v2]", tracing/pr

From: Steven Rostedt <rostedt@goodmis.org>

Add syntax to the FETCHARGS parsing of probes to be able to typecast a
value to a pointer to a structure.

Currently, a dereference must be a number, where the user has to figure
out manually the offset of a member of a structure that they want to
dereference, unless the member is a function parameter that BTF already has
information about what structure the argument is pointing to.

But for event probes, or generic kprobes that record a register that
happens to be a pointer to a structure, they cannot dereference these
values with BTF naming, but must use numerical offsets.

For example, to find out what device a sk_buff is pointing to in the
net_dev_xmit trace event, one must first use gdb to find the offsets of the
members of the structures:

 (gdb) p &((struct sk_buff *)0)->dev
 $1 = (struct net_device **) 0x10
 (gdb) p &((struct net_device *)0)->name
 $2 = (char (*)[16]) 0x118

And then use the raw numbers to dereference:

  # echo 'e:xmit net.net_dev_xmit +0x118(+0x10($skbaddr)):string' >> dynamic_events

If BTF is in the kernel, then instead, the $skbaddr can be typecast to
sk_buff and use the normal dereference logic.

  # echo 'e:xmit net.net_dev_xmit (sk_buff*)$skbaddr->dev->name:string' >> dynamic_events
  # echo 1 > events/eprobes/xmit/enable
  # cat trace
[..]
    sshd-session-1022    [000] b..2.   860.249343: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.250061: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.250142: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.263553: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.283820: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.302716: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.322905: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.342828: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.362268: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.382335: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.400856: xmit: (net.net_dev_xmit) arg1="enp7s0"
    sshd-session-1022    [000] b..2.   860.419893: xmit: (net.net_dev_xmit) arg1="enp7s0"

The syntax is simply: ([STRUCT]*)(VAR)->FIELD[->FIELD..]

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v3: https://patch.msgid.link/20260518095832.52659a3a@gandalf.local.home

 *** COMPLETE REWRITE FROM V3 ***

- Rewrote it to use typecasting instead of simply replacing BTF names with
  offsets.

 Documentation/trace/kprobetrace.rst |   3 +
 kernel/trace/trace_probe.c          | 110 ++++++++++++++++++++++++----
 kernel/trace/trace_probe.h          |   3 +
 3 files changed, 100 insertions(+), 16 deletions(-)

diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 3b6791c17e9b..450ac646fe4c 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -54,6 +54,9 @@ Synopsis of kprobe_events
   $retval	: Fetch return value.(\*2)
   $comm		: Fetch current task comm.
   +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
+  (STRUCT*)FETCHARG->FIELD[->FIELD] : If BTF is supported, typecast FETCHARG to
+                  a pointer to STRUCT and then dereference the pointer defined by
+                  ->FIELD.
   \IMM		: Store an immediate value to the argument.
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e0d3a0da26af..b0829eb1cb52 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -464,6 +464,26 @@ static const char *fetch_type_from_btf_type(struct btf *btf,
 	return NULL;
 }
 
+static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
+{
+	int id;
+
+	if (!ctx->btf) {
+		struct btf *btf;
+		id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf);
+		if (id < 0)
+			return -EINVAL;
+		ctx->btf = btf;
+	} else {
+		id = btf_find_by_name_kind(ctx->btf, sname, BTF_KIND_STRUCT);
+		if (id < 0)
+			return -EINVAL;
+	}
+
+	ctx->last_struct = btf_type_by_id(ctx->btf, id);
+	return 0;
+}
+
 static int query_btf_context(struct traceprobe_parse_context *ctx)
 {
 	const struct btf_param *param;
@@ -471,12 +491,12 @@ static int query_btf_context(struct traceprobe_parse_context *ctx)
 	struct btf *btf;
 	s32 nr;
 
-	if (ctx->btf)
-		return 0;
-
 	if (!ctx->funcname)
 		return -EINVAL;
 
+	if (ctx->btf)
+		return 0;
+
 	type = btf_find_func_proto(ctx->funcname, &btf);
 	if (!type)
 		return -ENOENT;
@@ -514,6 +534,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
 		ctx->proto = NULL;
 		ctx->params = NULL;
 		ctx->nr_params = 0;
+		ctx->last_struct = NULL;
 	}
 }
 
@@ -554,22 +575,28 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
 	struct fetch_insn *code = *pcode;
 	const struct btf_member *field;
 	u32 bitoffs, anon_offs;
+	bool is_struct = ctx->flags & TPARG_FL_STRUCT;
 	char *next;
 	int is_ptr;
 	s32 tid;
 
 	do {
-		/* Outer loop for solving arrow operator ('->') */
-		if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
-			trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
-			return -EINVAL;
-		}
-		/* Convert a struct pointer type to a struct type */
-		type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
-		if (!type) {
-			trace_probe_log_err(ctx->offset, BAD_BTF_TID);
-			return -EINVAL;
+		if (!is_struct) {
+			/* Outer loop for solving arrow operator ('->') */
+			if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
+				trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+				return -EINVAL;
+			}
+
+			/* Convert a struct pointer type to a struct type */
+			type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
+			if (!type) {
+				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+				return -EINVAL;
+			}
 		}
+		/* Only the first type can skip being a pointer */
+		is_struct = false;
 
 		bitoffs = 0;
 		do {
@@ -635,12 +662,12 @@ static int parse_btf_arg(char *varname,
 {
 	struct fetch_insn *code = *pcode;
 	const struct btf_param *params;
-	const struct btf_type *type;
+	const struct btf_type *type = NULL;
 	char *field = NULL;
 	int i, is_ptr, ret;
 	u32 tid;
 
-	if (WARN_ON_ONCE(!ctx->funcname))
+	if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_STRUCT)))
 		return -EINVAL;
 
 	is_ptr = split_next_field(varname, &field, ctx);
@@ -704,11 +731,18 @@ static int parse_btf_arg(char *varname,
 			goto found;
 		}
 	}
+
+	if (ctx->flags & TPARG_FL_STRUCT) {
+		type = ctx->last_struct;
+		goto found;
+	}
+
 	trace_probe_log_err(ctx->offset, NO_BTFARG);
 	return -ENOENT;
 
 found:
-	type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+	if (!type)
+		type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
 	if (!type) {
 		trace_probe_log_err(ctx->offset, BAD_BTF_TID);
 		return -EINVAL;
@@ -952,6 +986,12 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
 	int ret = 0;
 	int len;
 
+	if (ctx->flags & TPARG_FL_STRUCT) {
+		ret = parse_btf_arg(orig_arg, pcode, end, ctx);
+		if (ret < 0)
+			return ret;
+	}
+
 	if (ctx->flags & TPARG_FL_TEVENT) {
 		if (code->data)
 			return -EFAULT;
@@ -1231,6 +1271,43 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 				code->op = FETCH_OP_IMM;
 		}
 		break;
+	case '(':
+		tmp = strrchr(arg, ')');
+		if (!tmp) {
+			trace_probe_log_err(ctx->offset + strlen(arg),
+					    DEREF_OPEN_BRACE);
+			return -EINVAL;
+		}
+
+		tmp--;
+		if (*tmp != '*') {
+			trace_probe_log_err(ctx->offset + (tmp - arg),
+					    NO_PTR_STRCT);
+			return -EINVAL;
+		}
+		*tmp = '\0';
+		ret = query_btf_struct(arg + 1, ctx);
+		*tmp = '*';
+
+		if (ret < 0) {
+			trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
+			return -EINVAL;
+		}
+
+		ctx->flags |= TPARG_FL_STRUCT;
+		tmp += 2;
+
+		if (*tmp != '$') {
+			trace_probe_log_err(ctx->offset + (tmp - arg),
+					    BAD_VAR);
+			return -EINVAL;
+		}
+
+		ctx->offset += tmp - arg;
+		ret = parse_probe_vars(tmp, type, pcode, end, ctx);
+		ctx->flags &= ~TPARG_FL_STRUCT;
+		ctx->last_struct = NULL;
+		break;
 	default:
 		if (isalpha(arg[0]) || arg[0] == '_') {	/* BTF variable */
 			if (!tparg_is_function_entry(ctx->flags) &&
@@ -1504,6 +1581,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
 	code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
 
 	ctx->last_type = NULL;
+	ctx->last_struct = NULL;
 	ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
 			      ctx);
 	if (ret < 0)
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 262d8707a3df..88ab9f6da591 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -394,6 +394,7 @@ static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
  * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive.
  * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with
  * TPARG_FL_KERNEL.
+ * TPARG_FL_STRUCT is set if an argument was typecast to a structure.
  */
 #define TPARG_FL_RETURN BIT(0)
 #define TPARG_FL_KERNEL BIT(1)
@@ -402,6 +403,7 @@ static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
 #define TPARG_FL_USER   BIT(4)
 #define TPARG_FL_FPROBE BIT(5)
 #define TPARG_FL_TPOINT BIT(6)
+#define TPARG_FL_STRUCT BIT(7)
 #define TPARG_FL_LOC_MASK	GENMASK(4, 0)
 
 static inline bool tparg_is_function_entry(unsigned int flags)
@@ -423,6 +425,7 @@ struct traceprobe_parse_context {
 	s32 nr_params;			/* The number of the parameters */
 	struct btf *btf;		/* The BTF to be used */
 	const struct btf_type *last_type;	/* Saved type */
+	const struct btf_type *last_struct;	/* Saved structure */
 	u32 last_bitoffs;		/* Saved bitoffs */
 	u32 last_bitsize;		/* Saved bitsize */
 	struct trace_probe *tp;
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3 06/11] drm: Use trace_call__##name() at guarded tracepoint call sites
From: Philipp Stanner @ 2026-05-19  7:23 UTC (permalink / raw)
  To: Vineeth Remanan Pillai, phasta
  Cc: Alex Deucher, Christian König, David Airlie, Simona Vetter,
	Harry Wentland, Leo Li, Matthew Brost, Danilo Krummrich,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, amd-gfx,
	dri-devel, Steven Rostedt, linux-trace-kernel, Peter Zijlstra
In-Reply-To: <CAO7JXPhMBd0xgDRO-gZ2HpSTnrj1OD67c39jrXWEKaowNc9GEA@mail.gmail.com>

On Mon, 2026-05-18 at 19:20 -0400, Vineeth Remanan Pillai wrote:
> On Mon, May 18, 2026 at 11:01 AM Philipp Stanner <phasta@mailbox.org> wrote:
> > 
> > On Fri, 2026-05-15 at 09:59 -0400, Vineeth Pillai (Google) wrote:
> > > From: Vineeth Pillai <vineeth@bitbyteword.org>
> > > 
> > > Replace trace_foo() with the new trace_call__foo() at sites already
> > > guarded by trace_foo_enabled(), avoiding a redundant
> > > static_branch_unlikely() re-evaluation inside the tracepoint.
> > > trace_call__foo() calls the tracepoint callbacks directly without
> > > utilizing the static branch again.
> > 
> > The "foo" terminology is unusual I think? I always wrote it with regex,
> > like "trace_*()".
> > 
> Sorry about the terminology. Part of the patches got merged this way,
> so is it okay to continue the terminology to have consistency?

Sure, no big deal.

> 
> > 
> > 
> > > 
> > > Original v2 series:
> > > https://lore.kernel.org/linux-trace-kernel/20260323160052.17528-1-vineeth@bitbyteword.org/
> > 
> > I'd put this in a Link: tag section below.
> > 
> Makes sense, will do. Steve also suggested to put this whole section
> after "---" because it isn't relevant to the changes. Will fix this in
> next iteration.

I agree with Steve. We have had some tendency in the past to have all
sorts of versioning information in the git log, which IMO is not useful
to anyone a few months after merging. The commit message should detail
the why, how and what, not the history.

> 
> > > 
> > > Parts of the original v2 series have already been merged in mainline.
> > > This patch is being reposted as a follow-up cleanup for the remaining
> > > unmerged pieces.
> > 
> > So this v3 series as a whole is a followup to that v2?
> > 
> v3 is a follow up to remaining patches that were not merged with the
> previous cycle. The core api and couple of patches went in the
> previous cycle, so this is for rest of it.
> 
> The intention was to send this v3 as a direct patch to individual
> subsystem maintainers but forgot to remove the numbering and hence
> there might be a confusion. Will remove the numbering and send it  as
> stand alone patch in the next iteration.

OK, cool.

Thanks,
Philipp


> 
> > > 
> > > Suggested-by: Steven Rostedt <rostedt@goodmis.org>
> > > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > > Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> > > Assisted-by: Claude:claude-sonnet-4-6
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c            |  2 +-
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c            |  4 ++--
> > >  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 10 +++++-----
> > >  drivers/gpu/drm/scheduler/sched_entity.c          |  5 +++--
> > >  4 files changed, 11 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > > index b24d5d21be5f..cb0b5cb07d57 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > > @@ -1004,7 +1004,7 @@ static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
> > >               struct amdgpu_job *job = p->jobs[i];
> > > 
> > >               for (j = 0; j < job->num_ibs; ++j)
> > > -                     trace_amdgpu_cs(p, job, &job->ibs[j]);
> > > +                     trace_call__amdgpu_cs(p, job, &job->ibs[j]);
> > >       }
> > >  }
> > > 
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > index 9ba9de16a27a..a36ae94c425f 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > > @@ -1415,7 +1415,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
> > > 
> > >       if (trace_amdgpu_vm_bo_mapping_enabled()) {
> > >               list_for_each_entry(mapping, &bo_va->valids, list)
> > > -                     trace_amdgpu_vm_bo_mapping(mapping);
> > > +                     trace_call__amdgpu_vm_bo_mapping(mapping);
> > >       }
> > > 
> > >  error_free:
> > > @@ -2183,7 +2183,7 @@ void amdgpu_vm_bo_trace_cs(struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket)
> > >                               continue;
> > >               }
> > > 
> > > -             trace_amdgpu_vm_bo_cs(mapping);
> > > +             trace_call__amdgpu_vm_bo_cs(mapping);
> > >       }
> > >  }
> > > 
> > > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > index 5fc5d5608506..fbdc12cdd6bb 100644
> > > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > @@ -5263,11 +5263,11 @@ static void amdgpu_dm_backlight_set_level(struct amdgpu_display_manager *dm,
> > >       }
> > > 
> > >       if (trace_amdgpu_dm_brightness_enabled()) {
> > > -             trace_amdgpu_dm_brightness(__builtin_return_address(0),
> > > -                                        user_brightness,
> > > -                                        brightness,
> > > -                                        caps->aux_support,
> > > -                                        power_supply_is_system_supplied() > 0);
> > > +             trace_call__amdgpu_dm_brightness(__builtin_return_address(0),
> > > +                                              user_brightness,
> > > +                                              brightness,
> > > +                                              caps->aux_support,
> > > +                                              power_supply_is_system_supplied() > 0);
> > >       }
> > > 
> > >       if (caps->aux_support) {
> > > diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c
> > > index fe174a4857be..185a2636b599 100644
> > > --- a/drivers/gpu/drm/scheduler/sched_entity.c
> > > +++ b/drivers/gpu/drm/scheduler/sched_entity.c
> > > @@ -429,7 +429,8 @@ static bool drm_sched_entity_add_dependency_cb(struct drm_sched_entity *entity,
> > > 
> > >       if (trace_drm_sched_job_unschedulable_enabled() &&
> > >           !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &entity->dependency->flags))
> > > -             trace_drm_sched_job_unschedulable(sched_job, entity->dependency);
> > > +             trace_call__drm_sched_job_unschedulable(sched_job,
> > > +                                                     entity->dependency);
> > 
> > I would be more happy if you sacrifice a bit of space here and keep it
> > a single line since the if condition is already quite convoluted and
> > challenging to read.
> > 
> I understand, will fix it in next iteration.
> 
> Thanks,
> Vineeth


^ permalink raw reply

* Re: [PATCH v2 08/14] verification/rvgen: Add golden and spec folders for tests
From: Gabriele Monaco @ 2026-05-19  7:29 UTC (permalink / raw)
  To: Nam Cao
  Cc: Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang,
	linux-kernel, linux-trace-kernel, Steven Rostedt
In-Reply-To: <87pl2t6qo8.fsf@yellow.woof>

On Mon, 2026-05-18 at 10:57 +0200, Nam Cao wrote:
> Gabriele Monaco <gmonaco@redhat.com> writes:
> > Create reference models specifications and generated files in the golden
> > folder. Those can be used as reference to validate rvgen still generates
> > files as expected in automated tests.
> > 
> > Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> 
> Didn't look at the "golden" files, I presume those are generated.
> 
> Reviewed-by: Nam Cao <namcao@linutronix.de>

Thanks for the review!

Yes the golden are generated, I checked them and had a few AIs run through them
and that's how I spotted the True/true issue.
They aren't guaranteed to be spotless but again, some test is better than no
test.

Thanks,
Gabriele


^ permalink raw reply

* [PATCH 0/3] rv: rtapp monitor update
From: Nam Cao @ 2026-05-19  7:49 UTC (permalink / raw)
  To: Gabriele Monaco, Steven Rostedt, linux-kernel, linux-trace-kernel; +Cc: Nam Cao

Hi,

A couple of minor improvements to the rtapp monitor, making the monitor
more informative to the user and updating the allow list regarding the
clock_nanosleep syscall.

Nam Cao (3):
  rv/rtapp/sleep: Make the error more informative for user
  rv/rtapp/sleep: Update nanosleep rule
  rv/rtapp: Add wakeup monitor

 kernel/trace/rv/Kconfig                       |   1 +
 kernel/trace/rv/Makefile                      |   1 +
 kernel/trace/rv/monitors/sleep/sleep.c        |  18 +-
 kernel/trace/rv/monitors/sleep/sleep.h        |  52 +++---
 kernel/trace/rv/monitors/wakeup/Kconfig       |  17 ++
 kernel/trace/rv/monitors/wakeup/wakeup.c      | 155 ++++++++++++++++++
 kernel/trace/rv/monitors/wakeup/wakeup.h      |  92 +++++++++++
 .../trace/rv/monitors/wakeup/wakeup_trace.h   |  14 ++
 kernel/trace/rv/rv_trace.h                    |   1 +
 tools/verification/models/rtapp/sleep.ltl     |   2 +-
 tools/verification/models/rtapp/wakeup.ltl    |   5 +
 11 files changed, 318 insertions(+), 40 deletions(-)
 create mode 100644 kernel/trace/rv/monitors/wakeup/Kconfig
 create mode 100644 kernel/trace/rv/monitors/wakeup/wakeup.c
 create mode 100644 kernel/trace/rv/monitors/wakeup/wakeup.h
 create mode 100644 kernel/trace/rv/monitors/wakeup/wakeup_trace.h
 create mode 100644 tools/verification/models/rtapp/wakeup.ltl

-- 
2.47.3


^ permalink raw reply

* [PATCH 1/3] rv/rtapp/sleep: Make the error more informative for user
From: Nam Cao @ 2026-05-19  7:49 UTC (permalink / raw)
  To: Gabriele Monaco, Steven Rostedt, linux-kernel, linux-trace-kernel; +Cc: Nam Cao
In-Reply-To: <cover.1779176466.git.namcao@linutronix.de>

The rtapp/sleep monitor detects real-time tasks which go to sleep in a
real-time-unsafe manner. If this happens, the monitor triggers a trace event
in the sched_wakeup tracepoint's handler.

However, the invoking context of that trace event is not the most
informative, because the stack trace of that event is the wakeup's code
path which is not very helpful:

74.669317: rv:error_sleep: condvar[254]: violation detected
    ltl_validate+0x345 ([kernel.kallsyms])
    handle_sched_wakeup+0x34 ([kernel.kallsyms])
    ttwu_do_activate+0xff ([kernel.kallsyms])
    sched_ttwu_pending+0x104 ([kernel.kallsyms])
    __flush_smp_call_function_queue+0x15b ([kernel.kallsyms])
    __sysvec_call_function_single+0x18 ([kernel.kallsyms])
    sysvec_call_function_single+0x66 ([kernel.kallsyms])
    asm_sysvec_call_function_single+0x1a ([kernel.kallsyms])
    pv_native_safe_halt+0xf ([kernel.kallsyms])
    default_idle+0x9 ([kernel.kallsyms])
    default_idle_call+0x33 ([kernel.kallsyms])
    do_idle+0x234 ([kernel.kallsyms])
    cpu_startup_entry+0x24 ([kernel.kallsyms])
    start_secondary+0xf8 ([kernel.kallsyms])
    common_startup_64+0x13e ([kernel.kallsyms])

What would be much more valuable is the stack trace of the task itself.

Change the update of WAKEUP from being in sched_wakeup trace point's
handler to sched_exit trace point's handler. This makes the event happen in
the task's context, making the stack trace far more informative for user:

rv:error_sleep: condvar[254]: violation detected
    ltl_validate+0x345 ([kernel.kallsyms])
    handle_sched_exit+0x39 ([kernel.kallsyms])
    __schedule+0x80f ([kernel.kallsyms])
    schedule+0x22 ([kernel.kallsyms])
    futex_do_wait+0x33 ([kernel.kallsyms])
    __futex_wait+0x8c ([kernel.kallsyms])
    futex_wait+0x73 ([kernel.kallsyms])
    do_futex+0xc6 ([kernel.kallsyms])
    __x64_sys_futex+0x121 ([kernel.kallsyms])
    do_syscall_64+0xf3 ([kernel.kallsyms])
    entry_SYSCALL_64_after_hwframe+0x77 ([kernel.kallsyms])
    __futex_abstimed_wait_common64+0xc6 (inlined)
    __futex_abstimed_wait_common+0xc6 (/usr/lib/x86_64-linux-gnu/libc.so.6)

Signed-off-by: Nam Cao <namcao@linutronix.de>
---
 kernel/trace/rv/monitors/sleep/sleep.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index 8dfe5ec13e19..0a36f5519e6b 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -92,9 +92,9 @@ static void handle_sched_set_state(void *data, struct task_struct *task, int sta
 		ltl_atom_pulse(task, LTL_ABORT_SLEEP, true);
 }
 
-static void handle_sched_wakeup(void *data, struct task_struct *task)
+static void handle_sched_exit(void *data, bool is_switch)
 {
-	ltl_atom_pulse(task, LTL_WAKE, true);
+	ltl_atom_pulse(current, LTL_WAKE, true);
 }
 
 static void handle_sched_waking(void *data, struct task_struct *task)
@@ -200,7 +200,7 @@ static int enable_sleep(void)
 		return retval;
 
 	rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
-	rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup);
+	rv_attach_trace_probe("rtapp_sleep", sched_exit_tp, handle_sched_exit);
 	rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state);
 	rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin);
 	rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end);
@@ -213,7 +213,7 @@ static int enable_sleep(void)
 static void disable_sleep(void)
 {
 	rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
-	rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup);
+	rv_detach_trace_probe("rtapp_sleep", sched_exit_tp, handle_sched_exit);
 	rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state);
 	rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin);
 	rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end);
-- 
2.47.3


^ permalink raw reply related

* [PATCH 2/3] rv/rtapp/sleep: Update nanosleep rule
From: Nam Cao @ 2026-05-19  7:49 UTC (permalink / raw)
  To: Gabriele Monaco, Steven Rostedt, linux-kernel, linux-trace-kernel; +Cc: Nam Cao
In-Reply-To: <cover.1779176466.git.namcao@linutronix.de>

CLOCK_REALTIME is the only clock that often is misused in real-time
applications. The other clocks either are safe for real-time uses
(CLOCK_TAI, CLOCK_MONOTONIC, CLOCK_BOOTTIME) or are unlikely to be misused
(CLOCK_AUX, CLOCK_PROCESS_CPUTIME_ID).

The rtapp monitor's purpose is warning people about common mistakes with
real-time design. However, warning about all clock types generates too many
false positives.

Update the monitor to only warn about CLOCK_REALTIME.

Signed-off-by: Nam Cao <namcao@linutronix.de>
---
 kernel/trace/rv/monitors/sleep/sleep.c    | 10 ++---
 kernel/trace/rv/monitors/sleep/sleep.h    | 52 +++++++++++------------
 tools/verification/models/rtapp/sleep.ltl |  2 +-
 3 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index 0a36f5519e6b..e01ac56b3f4a 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -43,9 +43,7 @@ static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bo
 	ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false);
 
 	if (task_creation) {
-		ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
-		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
-		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, false);
 		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
 		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
 		ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
@@ -136,8 +134,7 @@ static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
 	case __NR_clock_nanosleep_time64:
 #endif
 		syscall_get_arguments(current, regs, args);
-		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC);
-		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI);
+		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, args[0] == CLOCK_REALTIME);
 		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME);
 		ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
 		break;
@@ -178,8 +175,7 @@ static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
 
 	ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
 	ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
-	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
-	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, false);
 	ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
 	ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
 	ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h
index 95dc2727c059..ed1ac7ad008e 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.h
+++ b/kernel/trace/rv/monitors/sleep/sleep.h
@@ -20,8 +20,7 @@ enum ltl_atom {
 	LTL_FUTEX_WAIT,
 	LTL_KERNEL_THREAD,
 	LTL_KTHREAD_SHOULD_STOP,
-	LTL_NANOSLEEP_CLOCK_MONOTONIC,
-	LTL_NANOSLEEP_CLOCK_TAI,
+	LTL_NANOSLEEP_CLOCK_REALTIME,
 	LTL_NANOSLEEP_TIMER_ABSTIME,
 	LTL_RT,
 	LTL_SLEEP,
@@ -46,8 +45,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
 		"fu_wa",
 		"ker_th",
 		"kth_sh_st",
-		"na_cl_mo",
-		"na_cl_ta",
+		"na_cl_re",
 		"na_ti_ab",
 		"rt",
 		"sl",
@@ -87,8 +85,7 @@ static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
 	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
 	bool rt = test_bit(LTL_RT, mon->atoms);
 	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
-	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
-	bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
+	bool nanosleep_clock_realtime = test_bit(LTL_NANOSLEEP_CLOCK_REALTIME, mon->atoms);
 	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
 	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
 	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
@@ -97,17 +94,17 @@ static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
 	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
 	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
 	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
-	bool val42 = task_is_rcu || task_is_migration;
-	bool val43 = futex_lock_pi || val42;
-	bool val5 = block_on_rt_mutex || val43;
-	bool val34 = abort_sleep || kthread_should_stop;
-	bool val35 = woken_by_nmi || val34;
-	bool val36 = woken_by_hardirq || val35;
-	bool val14 = woken_by_equal_or_higher_prio || val36;
+	bool val41 = task_is_rcu || task_is_migration;
+	bool val42 = futex_lock_pi || val41;
+	bool val5 = block_on_rt_mutex || val42;
+	bool val33 = abort_sleep || kthread_should_stop;
+	bool val34 = woken_by_nmi || val33;
+	bool val35 = woken_by_hardirq || val34;
+	bool val14 = woken_by_equal_or_higher_prio || val35;
 	bool val13 = !wake;
-	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
-	bool val27 = nanosleep_timer_abstime && val26;
-	bool val18 = clock_nanosleep && val27;
+	bool val25 = !nanosleep_clock_realtime;
+	bool val26 = nanosleep_timer_abstime && val25;
+	bool val18 = clock_nanosleep && val26;
 	bool val20 = val18 || epoll_wait;
 	bool val9 = futex_wait || val20;
 	bool val11 = val9 || kernel_thread;
@@ -138,8 +135,7 @@ ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned l
 	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
 	bool rt = test_bit(LTL_RT, mon->atoms);
 	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
-	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
-	bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
+	bool nanosleep_clock_realtime = test_bit(LTL_NANOSLEEP_CLOCK_REALTIME, mon->atoms);
 	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
 	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
 	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
@@ -148,17 +144,17 @@ ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned l
 	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
 	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
 	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
-	bool val42 = task_is_rcu || task_is_migration;
-	bool val43 = futex_lock_pi || val42;
-	bool val5 = block_on_rt_mutex || val43;
-	bool val34 = abort_sleep || kthread_should_stop;
-	bool val35 = woken_by_nmi || val34;
-	bool val36 = woken_by_hardirq || val35;
-	bool val14 = woken_by_equal_or_higher_prio || val36;
+	bool val41 = task_is_rcu || task_is_migration;
+	bool val42 = futex_lock_pi || val41;
+	bool val5 = block_on_rt_mutex || val42;
+	bool val33 = abort_sleep || kthread_should_stop;
+	bool val34 = woken_by_nmi || val33;
+	bool val35 = woken_by_hardirq || val34;
+	bool val14 = woken_by_equal_or_higher_prio || val35;
 	bool val13 = !wake;
-	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
-	bool val27 = nanosleep_timer_abstime && val26;
-	bool val18 = clock_nanosleep && val27;
+	bool val25 = !nanosleep_clock_realtime;
+	bool val26 = nanosleep_timer_abstime && val25;
+	bool val18 = clock_nanosleep && val26;
 	bool val20 = val18 || epoll_wait;
 	bool val9 = futex_wait || val20;
 	bool val11 = val9 || kernel_thread;
diff --git a/tools/verification/models/rtapp/sleep.ltl b/tools/verification/models/rtapp/sleep.ltl
index 6f26c4810f78..2637bc48a620 100644
--- a/tools/verification/models/rtapp/sleep.ltl
+++ b/tools/verification/models/rtapp/sleep.ltl
@@ -9,7 +9,7 @@ RT_VALID_SLEEP_REASON = FUTEX_WAIT
 
 RT_FRIENDLY_NANOSLEEP = CLOCK_NANOSLEEP
                     and NANOSLEEP_TIMER_ABSTIME
-                    and (NANOSLEEP_CLOCK_MONOTONIC or NANOSLEEP_CLOCK_TAI)
+                    and not NANOSLEEP_CLOCK_REALTIME
 
 RT_FRIENDLY_WAKE = WOKEN_BY_EQUAL_OR_HIGHER_PRIO
                 or WOKEN_BY_HARDIRQ
-- 
2.47.3


^ permalink raw reply related

* [PATCH 3/3] rv/rtapp: Add wakeup monitor
From: Nam Cao @ 2026-05-19  7:49 UTC (permalink / raw)
  To: Gabriele Monaco, Steven Rostedt, linux-kernel, linux-trace-kernel; +Cc: Nam Cao
In-Reply-To: <cover.1779176466.git.namcao@linutronix.de>

Add a wakeup monitor to detect a lower-priority task waking up a
higher-priority task.

The rtapp/sleep monitor already detects this. However, that monitor
triggers an error in the context of the woken task and the user only gets the
stacktrace of that task. It is also extremely useful to get the stacktrace
of the waking task, which this monitor offers. In other words, this monitor
complements the rtapp/sleep monitor.

Signed-off-by: Nam Cao <namcao@linutronix.de>
---
 kernel/trace/rv/Kconfig                       |   1 +
 kernel/trace/rv/Makefile                      |   1 +
 kernel/trace/rv/monitors/wakeup/Kconfig       |  17 ++
 kernel/trace/rv/monitors/wakeup/wakeup.c      | 155 ++++++++++++++++++
 kernel/trace/rv/monitors/wakeup/wakeup.h      |  92 +++++++++++
 .../trace/rv/monitors/wakeup/wakeup_trace.h   |  14 ++
 kernel/trace/rv/rv_trace.h                    |   1 +
 tools/verification/models/rtapp/wakeup.ltl    |   5 +
 8 files changed, 286 insertions(+)
 create mode 100644 kernel/trace/rv/monitors/wakeup/Kconfig
 create mode 100644 kernel/trace/rv/monitors/wakeup/wakeup.c
 create mode 100644 kernel/trace/rv/monitors/wakeup/wakeup.h
 create mode 100644 kernel/trace/rv/monitors/wakeup/wakeup_trace.h
 create mode 100644 tools/verification/models/rtapp/wakeup.ltl

diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 3884b14df375..4d3a14a0bac2 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -76,6 +76,7 @@ source "kernel/trace/rv/monitors/opid/Kconfig"
 source "kernel/trace/rv/monitors/rtapp/Kconfig"
 source "kernel/trace/rv/monitors/pagefault/Kconfig"
 source "kernel/trace/rv/monitors/sleep/Kconfig"
+source "kernel/trace/rv/monitors/wakeup/Kconfig"
 # Add new rtapp monitors here
 
 source "kernel/trace/rv/monitors/stall/Kconfig"
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 94498da35b37..c2c0e4142eb4 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
 obj-$(CONFIG_RV_MON_STALL) += monitors/stall/stall.o
 obj-$(CONFIG_RV_MON_DEADLINE) += monitors/deadline/deadline.o
 obj-$(CONFIG_RV_MON_NOMISS) += monitors/nomiss/nomiss.o
+obj-$(CONFIG_RV_MON_WAKEUP) += monitors/wakeup/wakeup.o
 # Add new monitors here
 obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
 obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/wakeup/Kconfig b/kernel/trace/rv/monitors/wakeup/Kconfig
new file mode 100644
index 000000000000..3cf11c5cd5f7
--- /dev/null
+++ b/kernel/trace/rv/monitors/wakeup/Kconfig
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_WAKEUP
+	depends on RV
+	depends on RV_MON_RTAPP
+	depends on HAVE_SYSCALL_TRACEPOINTS
+	select TRACE_IRQFLAGS
+	default y
+	select LTL_MON_EVENTS_ID
+	bool "wakeup monitor"
+	help
+	  This monitor detects a lower-priority task waking up a
+	  higher-priority task. The RV_MON_SLEEP monitor already
+	  detects this case, but this monitor detects it in the context
+	  of the waking task instead. This and RV_MON_SLEEP can be
+	  enabled together to get the stacktrace of both the waking
+	  task and the woken task.
diff --git a/kernel/trace/rv/monitors/wakeup/wakeup.c b/kernel/trace/rv/monitors/wakeup/wakeup.c
new file mode 100644
index 000000000000..534997a7b45c
--- /dev/null
+++ b/kernel/trace/rv/monitors/wakeup/wakeup.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "wakeup"
+
+#include <trace/events/syscalls.h>
+#include <trace/events/sched.h>
+#include <trace/events/lock.h>
+#include <uapi/linux/futex.h>
+
+#include <rv_trace.h>
+#include <monitors/rtapp/rtapp.h>
+
+
+#ifndef __NR_futex
+#define __NR_futex (-__COUNTER__)
+#endif
+#ifndef __NR_futex_time64
+#define __NR_futex_time64 (-__COUNTER__)
+#endif
+
+#include "wakeup.h"
+#include <rv/ltl_monitor.h>
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon)
+{
+	/*
+	 * This includes "actual" real-time tasks and also PI-boosted
+	 * tasks. A task being PI-boosted means it is blocking an "actual"
+	 * real-task, therefore it should also obey the monitor's rule,
+	 * otherwise the "actual" real-task may be delayed.
+	 */
+	ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));
+}
+
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation)
+{
+	ltl_atom_set(mon, LTL_WOKEN_BY_LOWER_PRIO, false);
+	ltl_atom_set(mon, LTL_WOKEN_BY_SOFTIRQ, false);
+
+	if (task_creation) {
+		ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
+		ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+	}
+
+	ltl_atom_set(mon, LTL_USER_THREAD, !(task->flags & PF_KTHREAD));
+}
+
+static void handle_sched_waking(void *data, struct task_struct *task)
+{
+	if (this_cpu_read(hardirq_context)) {
+		return;
+	} else if (in_task()) {
+		if (current->prio > task->prio)
+			ltl_atom_pulse(task, LTL_WOKEN_BY_LOWER_PRIO, true);
+	} else if (in_serving_softirq()) {
+		ltl_atom_pulse(task, LTL_WOKEN_BY_SOFTIRQ, true);
+	}
+}
+
+static void handle_contention_begin(void *data, void *lock, unsigned int flags)
+{
+	if (flags & LCB_F_RT)
+		ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true);
+}
+
+static void handle_contention_end(void *data, void *lock, int ret)
+{
+	ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false);
+}
+
+static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
+{
+	unsigned long args[6];
+	int op, cmd;
+
+	switch (id) {
+	case __NR_futex:
+	case __NR_futex_time64:
+		syscall_get_arguments(current, regs, args);
+		op = args[1];
+		cmd = op & FUTEX_CMD_MASK;
+
+		switch (cmd) {
+		case FUTEX_LOCK_PI:
+		case FUTEX_LOCK_PI2:
+			ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true);
+			break;
+		}
+		break;
+	}
+}
+
+static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
+{
+	ltl_atom_update(current, LTL_FUTEX_LOCK_PI, false);
+}
+
+static int enable_wakeup(void)
+{
+	int retval;
+
+	retval = ltl_monitor_init();
+	if (retval)
+		return retval;
+
+	rv_attach_trace_probe("rtapp_wakeup", sched_waking, handle_sched_waking);
+	rv_attach_trace_probe("rtapp_wakeup", contention_begin, handle_contention_begin);
+	rv_attach_trace_probe("rtapp_wakeup", contention_end, handle_contention_end);
+	rv_attach_trace_probe("rtapp_wakeup", sys_enter, handle_sys_enter);
+	rv_attach_trace_probe("rtapp_wakeup", sys_exit, handle_sys_exit);
+
+	return 0;
+}
+
+static void disable_wakeup(void)
+{
+	rv_detach_trace_probe("rtapp_wakeup", sched_waking, handle_sched_waking);
+	rv_detach_trace_probe("rtapp_wakeup", contention_begin, handle_contention_begin);
+	rv_detach_trace_probe("rtapp_wakeup", contention_end, handle_contention_end);
+	rv_detach_trace_probe("rtapp_wakeup", sys_enter, handle_sys_enter);
+	rv_detach_trace_probe("rtapp_wakeup", sys_exit, handle_sys_exit);
+
+	ltl_monitor_destroy();
+}
+
+static struct rv_monitor rv_wakeup = {
+	.name = "wakeup",
+	.description = "Monitor that real-time tasks are not woken by lower-priority tasks",
+	.enable = enable_wakeup,
+	.disable = disable_wakeup,
+};
+
+static int __init register_wakeup(void)
+{
+	return rv_register_monitor(&rv_wakeup, &rv_rtapp);
+}
+
+static void __exit unregister_wakeup(void)
+{
+	rv_unregister_monitor(&rv_wakeup);
+}
+
+module_init(register_wakeup);
+module_exit(unregister_wakeup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("Monitor that real-time tasks are not woken by lower-priority tasks");
diff --git a/kernel/trace/rv/monitors/wakeup/wakeup.h b/kernel/trace/rv/monitors/wakeup/wakeup.h
new file mode 100644
index 000000000000..6f80da64e0e1
--- /dev/null
+++ b/kernel/trace/rv/monitors/wakeup/wakeup.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * C implementation of Buchi automaton, automatically generated by
+ * tools/verification/rvgen from the linear temporal logic specification.
+ * For further information, see kernel documentation:
+ *   Documentation/trace/rv/linear_temporal_logic.rst
+ */
+
+#include <linux/rv.h>
+
+#define MONITOR_NAME wakeup
+
+enum ltl_atom {
+	LTL_BLOCK_ON_RT_MUTEX,
+	LTL_FUTEX_LOCK_PI,
+	LTL_RT,
+	LTL_USER_THREAD,
+	LTL_WOKEN_BY_LOWER_PRIO,
+	LTL_WOKEN_BY_SOFTIRQ,
+	LTL_NUM_ATOM
+};
+static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM);
+
+static const char *ltl_atom_str(enum ltl_atom atom)
+{
+	static const char *const names[] = {
+		"bl_on_rt_mu",
+		"fu_lo_pi",
+		"rt",
+		"us_th",
+		"wo_lo_pr",
+		"wo_so",
+	};
+
+	return names[atom];
+}
+
+enum ltl_buchi_state {
+	S0,
+	RV_NUM_BA_STATES
+};
+static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
+
+static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+	bool woken_by_softirq = test_bit(LTL_WOKEN_BY_SOFTIRQ, mon->atoms);
+	bool woken_by_lower_prio = test_bit(LTL_WOKEN_BY_LOWER_PRIO, mon->atoms);
+	bool user_thread = test_bit(LTL_USER_THREAD, mon->atoms);
+	bool rt = test_bit(LTL_RT, mon->atoms);
+	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+	bool val9 = block_on_rt_mutex || futex_lock_pi;
+	bool val6 = !woken_by_softirq;
+	bool val5 = !woken_by_lower_prio;
+	bool val8 = val5 && val6;
+	bool val10 = val8 || val9;
+	bool val3 = !user_thread;
+	bool val2 = !rt;
+	bool val4 = val2 || val3;
+	bool val11 = val4 || val10;
+
+	if (val11)
+		__set_bit(S0, mon->states);
+}
+
+static void
+ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
+{
+	bool woken_by_softirq = test_bit(LTL_WOKEN_BY_SOFTIRQ, mon->atoms);
+	bool woken_by_lower_prio = test_bit(LTL_WOKEN_BY_LOWER_PRIO, mon->atoms);
+	bool user_thread = test_bit(LTL_USER_THREAD, mon->atoms);
+	bool rt = test_bit(LTL_RT, mon->atoms);
+	bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+	bool val9 = block_on_rt_mutex || futex_lock_pi;
+	bool val6 = !woken_by_softirq;
+	bool val5 = !woken_by_lower_prio;
+	bool val8 = val5 && val6;
+	bool val10 = val8 || val9;
+	bool val3 = !user_thread;
+	bool val2 = !rt;
+	bool val4 = val2 || val3;
+	bool val11 = val4 || val10;
+
+	switch (state) {
+	case S0:
+		if (val11)
+			__set_bit(S0, next);
+		break;
+	}
+}
diff --git a/kernel/trace/rv/monitors/wakeup/wakeup_trace.h b/kernel/trace/rv/monitors/wakeup/wakeup_trace.h
new file mode 100644
index 000000000000..7e056183f920
--- /dev/null
+++ b/kernel/trace/rv/monitors/wakeup/wakeup_trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WAKEUP
+DEFINE_EVENT(event_ltl_monitor_id, event_wakeup,
+	     TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+	     TP_ARGS(task, states, atoms, next));
+DEFINE_EVENT(error_ltl_monitor_id, error_wakeup,
+	     TP_PROTO(struct task_struct *task),
+	     TP_ARGS(task));
+#endif /* CONFIG_RV_MON_WAKEUP */
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 9622c269789c..2f8a932432c9 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -241,6 +241,7 @@ DECLARE_EVENT_CLASS(error_ltl_monitor_id,
 );
 #include <monitors/pagefault/pagefault_trace.h>
 #include <monitors/sleep/sleep_trace.h>
+#include <monitors/wakeup/wakeup_trace.h>
 // Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here
 #endif /* CONFIG_LTL_MON_EVENTS_ID */
 
diff --git a/tools/verification/models/rtapp/wakeup.ltl b/tools/verification/models/rtapp/wakeup.ltl
new file mode 100644
index 000000000000..a5d63ca0811a
--- /dev/null
+++ b/tools/verification/models/rtapp/wakeup.ltl
@@ -0,0 +1,5 @@
+RULE = always (((RT and USER_THREAD) imply
+		(not (WOKEN_BY_LOWER_PRIO or WOKEN_BY_SOFTIRQ)) or ALLOWLIST))
+
+ALLOWLIST = BLOCK_ON_RT_MUTEX
+         or FUTEX_LOCK_PI
-- 
2.47.3


^ permalink raw reply related

* [PATCH] tracing/blktrace: Use sysfs_emit() for sysfs show callbacks
From: Yu Peng @ 2026-05-19  7:50 UTC (permalink / raw)
  To: Jens Axboe, Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, linux-block, linux-kernel, linux-trace-kernel,
	Yu Peng

Use sysfs_emit() and sysfs_emit_at() instead of sprintf() when
formatting blktrace sysfs show output.

No functional change intended.

Signed-off-by: Yu Peng <pengyu@kylinos.cn>
---
 kernel/trace/blktrace.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8cd2520b4c99e..1eda8158883ca 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -2025,11 +2025,11 @@ static ssize_t blk_trace_mask2str(char *buf, int mask)
 
 	for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
 		if (mask & mask_maps[i].mask) {
-			p += sprintf(p, "%s%s",
+			p += sysfs_emit_at(buf, p - buf, "%s%s",
 				    (p == buf) ? "" : ",", mask_maps[i].str);
 		}
 	}
-	*p++ = '\n';
+	p += sysfs_emit_at(buf, p - buf, "\n");
 
 	return p - buf;
 }
@@ -2048,20 +2048,20 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	bt = rcu_dereference_protected(q->blk_trace,
 				       lockdep_is_held(&q->debugfs_mutex));
 	if (attr == &dev_attr_enable) {
-		ret = sprintf(buf, "%u\n", !!bt);
+		ret = sysfs_emit(buf, "%u\n", !!bt);
 		goto out_unlock_bdev;
 	}
 
 	if (bt == NULL)
-		ret = sprintf(buf, "disabled\n");
+		ret = sysfs_emit(buf, "disabled\n");
 	else if (attr == &dev_attr_act_mask)
 		ret = blk_trace_mask2str(buf, bt->act_mask);
 	else if (attr == &dev_attr_pid)
-		ret = sprintf(buf, "%u\n", bt->pid);
+		ret = sysfs_emit(buf, "%u\n", bt->pid);
 	else if (attr == &dev_attr_start_lba)
-		ret = sprintf(buf, "%llu\n", bt->start_lba);
+		ret = sysfs_emit(buf, "%llu\n", bt->start_lba);
 	else if (attr == &dev_attr_end_lba)
-		ret = sprintf(buf, "%llu\n", bt->end_lba);
+		ret = sysfs_emit(buf, "%llu\n", bt->end_lba);
 
 out_unlock_bdev:
 	blk_debugfs_unlock_nomemrestore(q);
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH 07/13] rv: Simply hybrid automata monitors's clock variables
From: Gabriele Monaco @ 2026-05-19  7:58 UTC (permalink / raw)
  To: Nam Cao
  Cc: Steven Rostedt, Wander Lairson Costa, linux-trace-kernel,
	linux-kernel
In-Reply-To: <87h5o588m4.fsf@yellow.woof>

On Mon, 2026-05-18 at 09:44 +0200, Nam Cao wrote:
> Gabriele Monaco <gmonaco@redhat.com> writes:
> > On Mon, 2026-05-11 at 13:55 +0200, Nam Cao wrote:
> > > That can work, but not ideal, because hrtimer will not be usable.
> > 
> > Why not? If we have HA_TIMER_WHEEL , we'd use timer and expire, if we have
> > HA_TIMER_HRTIMER we'd only need hrtimer with its hrtimer_get_expires():
> > 
> >  union {
> >  struct hrtimer hrtimer;
> >  struct {
> >  struct timer_list timer;
> >  u64 expire; /* Explicitly store the armed budget */
> >  };
> > 
> > we already can't use timer and hrtimer interchangeably.
> > What am I missing here?
> 
> Ah, now I understand the trick, thanks.
> 
> We already have an "expires" field in struct timer_list. But I am not
> sure if we are supposed to touch that field. Your proposal looks safer.

Yeah and even if we did, that'd be jiffy-granularity, so not good if the clock
is ns-based.

Let me sketch it out.

Anyway back to the patch, you need to fix the build for HA_TIMER_HRTIMER as well
(too many arguments to function ‘ha_invariant_passed_ns’; expected 3, have 4),
and the title should s/Simply/Simplify/

Thanks,
Gabriele

> 
> > > Looking at the throttle monitor again, is it possible to rewrite
> > > runtime_left_ns() to read .dl_runtime instead of .runtime? I don't know
> > > the deadline schedule very well, but I think .dl_runtime is not changing
> > > like .runtime?
> > 
> > In theory yes, but since the runtime is consumed only when running, we
> > cannot
> > just set the timeout once. We either save how much was consumed somewhere or
> > do
> > some start/pause mechanism.
> > Neither looks simpler to me.
> 
> Understood.
> 
> Nam


^ permalink raw reply

* [PATCH] tracing/branch: Use pr_warn() instead of printk(KERN_WARNING)
From: Yu Peng @ 2026-05-19  8:16 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel, Yu Peng

Use pr_warn() instead of printk(KERN_WARNING ...) for the branch tracer
warning messages.

Keep the message text unchanged. The change only removes the open-coded
log level from these warnings.

Signed-off-by: Yu Peng <pengyu@kylinos.cn>
---
 kernel/trace/trace_branch.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index d1564db95a8f5..d8e97ad798f07 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -181,8 +181,7 @@ __init static int init_branch_tracer(void)
 
 	ret = register_trace_event(&trace_branch_event);
 	if (!ret) {
-		printk(KERN_WARNING "Warning: could not register "
-				    "branch events\n");
+		pr_warn("Warning: could not register branch events\n");
 		return 1;
 	}
 	return register_tracer(&branch_trace);
@@ -374,8 +373,7 @@ __init static int init_annotated_branch_stats(void)
 
 	ret = register_stat_tracer(&annotated_branch_stats);
 	if (ret) {
-		printk(KERN_WARNING "Warning: could not register "
-				    "annotated branches stats\n");
+		pr_warn("Warning: could not register annotated branches stats\n");
 		return ret;
 	}
 	return 0;
@@ -439,8 +437,7 @@ __init static int all_annotated_branch_stats(void)
 
 	ret = register_stat_tracer(&all_branch_stats);
 	if (ret) {
-		printk(KERN_WARNING "Warning: could not register "
-				    "all branches stats\n");
+		pr_warn("Warning: could not register all branches stats\n");
 		return ret;
 	}
 	return 0;
-- 
2.43.0

^ permalink raw reply related

* [PATCH] tracing: Use krealloc_array() for trace option array growth
From: Yu Peng @ 2026-05-19  8:34 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu
  Cc: Mathieu Desnoyers, linux-kernel, linux-trace-kernel, Yu Peng

Use krealloc_array() when growing tr->topts instead of open-coding the
size calculation in krealloc().

This makes the resize path use the helper intended for array allocations
and avoids manual multiplication of the element count and element size.

Signed-off-by: Yu Peng <pengyu@kylinos.cn>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d5..bde22d693d2e4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7928,8 +7928,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer,
 	if (!topts)
 		return 0;
 
-	tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
-			    GFP_KERNEL);
+	tr_topts = krealloc_array(tr->topts, tr->nr_topts + 1, sizeof(*tr->topts),
+				  GFP_KERNEL);
 	if (!tr_topts) {
 		kfree(topts);
 		return -ENOMEM;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH 6/9] rv: Ensure synchronous cleanup for HA monitors
From: Gabriele Monaco @ 2026-05-19  9:31 UTC (permalink / raw)
  To: Wen Yang; +Cc: linux-kernel, Steven Rostedt, Nam Cao, linux-trace-kernel
In-Reply-To: <88a6fc5c08d18e3c1f6d29dc106db80fa688bf87.camel@redhat.com>



On Mon, 2026-05-18 at 13:54 +0200, Gabriele Monaco wrote:
> Something like:
> 
> void __ha_monitor_timer_callback() {
> 	guard(rcu)(); //this is only for waiters, let them wait more
> 
> 	if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
> 		return;
> 	smp_rmb();
> 	curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
> 	...
> }
> 
> void da_monitor_reset() {
> 	da_monitor_reset_hook(da_mon);
> 	WRITE_ONCE(da_mon->monitoring, 0);
> 	smp_wmb();
> 	WRITE_ONCE(da_mon->curr_state, model_get_initial_state());
> }

That's obviously not going to work unless I read curr_state earlier (and use the
acquire/release helpers while at it):

void __ha_monitor_timer_callback() {
	guard(rcu)(); //this is only for waiters, let them wait more

	curr_state = smp_load_acquire(&ha_mon->da_mon.curr_state);
	if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
		return;
	...
}

void da_monitor_reset() {
	da_monitor_reset_hook(da_mon);
	WRITE_ONCE(da_mon->monitoring, 0);
	smp_store_release(&da_mon->curr_state, model_get_initial_state());
}


^ permalink raw reply

* Re: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: kernel test robot @ 2026-05-19  9:34 UTC (permalink / raw)
  To: Steven Rostedt, LKML, Linux Trace Kernel, bpf
  Cc: oe-kbuild-all, Masami Hiramatsu, Mathieu Desnoyers, Mark Rutland,
	Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
	Tom Zanussi, Andrew Morton, Linux Memory Management List,
	Thomas Gleixner, Ian Rogers, Jiri Olsa, Subject:[PATCH v2]
In-Reply-To: <20260518232312.0c78f055@gandalf.local.home>

Hi Steven,

kernel test robot noticed the following build errors:

[auto build test ERROR on trace/for-next]
[also build test ERROR on linus/master v7.1-rc4 next-20260518]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-probes-Allow-use-of-BTF-names-to-dereference-pointers/20260519-121930
base:   https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace for-next
patch link:    https://lore.kernel.org/r/20260518232312.0c78f055%40gandalf.local.home
patch subject: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
config: sh-defconfig (https://download.01.org/0day-ci/archive/20260519/202605191710.jVjifK67-lkp@intel.com/config)
compiler: sh4-linux-gcc (GCC) 15.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260519/202605191710.jVjifK67-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605191710.jVjifK67-lkp@intel.com/

All errors (new ones prefixed by >>):

   kernel/trace/trace_probe.c: In function 'parse_probe_arg':
>> kernel/trace/trace_probe.c:1289:23: error: implicit declaration of function 'query_btf_struct' [-Wimplicit-function-declaration]
    1289 |                 ret = query_btf_struct(arg + 1, ctx);
         |                       ^~~~~~~~~~~~~~~~


vim +/query_btf_struct +1289 kernel/trace/trace_probe.c

  1120	
  1121	/* Recursive argument parser */
  1122	static int
  1123	parse_probe_arg(char *arg, const struct fetch_type *type,
  1124			struct fetch_insn **pcode, struct fetch_insn *end,
  1125			struct traceprobe_parse_context *ctx)
  1126	{
  1127		struct fetch_insn *code = *pcode;
  1128		unsigned long param;
  1129		int deref = FETCH_OP_DEREF;
  1130		long offset = 0;
  1131		char *tmp;
  1132		int ret = 0;
  1133	
  1134		switch (arg[0]) {
  1135		case '$':
  1136			ret = parse_probe_vars(arg, type, pcode, end, ctx);
  1137			break;
  1138	
  1139		case '%':	/* named register */
  1140			if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) {
  1141				/* eprobe and fprobe do not handle registers */
  1142				trace_probe_log_err(ctx->offset, BAD_VAR);
  1143				break;
  1144			}
  1145			ret = regs_query_register_offset(arg + 1);
  1146			if (ret >= 0) {
  1147				code->op = FETCH_OP_REG;
  1148				code->param = (unsigned int)ret;
  1149				ret = 0;
  1150			} else
  1151				trace_probe_log_err(ctx->offset, BAD_REG_NAME);
  1152			break;
  1153	
  1154		case '@':	/* memory, file-offset or symbol */
  1155			if (isdigit(arg[1])) {
  1156				ret = kstrtoul(arg + 1, 0, &param);
  1157				if (ret) {
  1158					trace_probe_log_err(ctx->offset, BAD_MEM_ADDR);
  1159					break;
  1160				}
  1161				/* load address */
  1162				code->op = FETCH_OP_IMM;
  1163				code->immediate = param;
  1164			} else if (arg[1] == '+') {
  1165				/* kprobes don't support file offsets */
  1166				if (ctx->flags & TPARG_FL_KERNEL) {
  1167					trace_probe_log_err(ctx->offset, FILE_ON_KPROBE);
  1168					return -EINVAL;
  1169				}
  1170				ret = kstrtol(arg + 2, 0, &offset);
  1171				if (ret) {
  1172					trace_probe_log_err(ctx->offset, BAD_FILE_OFFS);
  1173					break;
  1174				}
  1175	
  1176				code->op = FETCH_OP_FOFFS;
  1177				code->immediate = (unsigned long)offset;  // imm64?
  1178			} else {
  1179				/* uprobes don't support symbols */
  1180				if (!(ctx->flags & TPARG_FL_KERNEL)) {
  1181					trace_probe_log_err(ctx->offset, SYM_ON_UPROBE);
  1182					return -EINVAL;
  1183				}
  1184				/* Preserve symbol for updating */
  1185				code->op = FETCH_NOP_SYMBOL;
  1186				code->data = kstrdup(arg + 1, GFP_KERNEL);
  1187				if (!code->data)
  1188					return -ENOMEM;
  1189				if (++code == end) {
  1190					trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
  1191					return -EINVAL;
  1192				}
  1193				code->op = FETCH_OP_IMM;
  1194				code->immediate = 0;
  1195			}
  1196			/* These are fetching from memory */
  1197			if (++code == end) {
  1198				trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
  1199				return -EINVAL;
  1200			}
  1201			*pcode = code;
  1202			code->op = FETCH_OP_DEREF;
  1203			code->offset = offset;
  1204			break;
  1205	
  1206		case '+':	/* deref memory */
  1207		case '-':
  1208			if (arg[1] == 'u') {
  1209				deref = FETCH_OP_UDEREF;
  1210				arg[1] = arg[0];
  1211				arg++;
  1212			}
  1213			if (arg[0] == '+')
  1214				arg++;	/* Skip '+', because kstrtol() rejects it. */
  1215			tmp = strchr(arg, '(');
  1216			if (!tmp) {
  1217				trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE);
  1218				return -EINVAL;
  1219			}
  1220			*tmp = '\0';
  1221			ret = kstrtol(arg, 0, &offset);
  1222			if (ret) {
  1223				trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS);
  1224				break;
  1225			}
  1226			ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
  1227			arg = tmp + 1;
  1228			tmp = strrchr(arg, ')');
  1229			if (!tmp) {
  1230				trace_probe_log_err(ctx->offset + strlen(arg),
  1231						    DEREF_OPEN_BRACE);
  1232				return -EINVAL;
  1233			} else {
  1234				const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags);
  1235				int cur_offs = ctx->offset;
  1236	
  1237				*tmp = '\0';
  1238				ret = parse_probe_arg(arg, t2, &code, end, ctx);
  1239				if (ret)
  1240					break;
  1241				ctx->offset = cur_offs;
  1242				if (code->op == FETCH_OP_COMM ||
  1243				    code->op == FETCH_OP_DATA) {
  1244					trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
  1245					return -EINVAL;
  1246				}
  1247				if (++code == end) {
  1248					trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
  1249					return -EINVAL;
  1250				}
  1251				*pcode = code;
  1252	
  1253				code->op = deref;
  1254				code->offset = offset;
  1255				/* Reset the last type if used */
  1256				ctx->last_type = NULL;
  1257			}
  1258			break;
  1259		case '\\':	/* Immediate value */
  1260			if (arg[1] == '"') {	/* Immediate string */
  1261				ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2);
  1262				if (ret)
  1263					break;
  1264				code->op = FETCH_OP_DATA;
  1265				code->data = tmp;
  1266			} else {
  1267				ret = str_to_immediate(arg + 1, &code->immediate);
  1268				if (ret)
  1269					trace_probe_log_err(ctx->offset + 1, BAD_IMM);
  1270				else
  1271					code->op = FETCH_OP_IMM;
  1272			}
  1273			break;
  1274		case '(':
  1275			tmp = strrchr(arg, ')');
  1276			if (!tmp) {
  1277				trace_probe_log_err(ctx->offset + strlen(arg),
  1278						    DEREF_OPEN_BRACE);
  1279				return -EINVAL;
  1280			}
  1281	
  1282			tmp--;
  1283			if (*tmp != '*') {
  1284				trace_probe_log_err(ctx->offset + (tmp - arg),
  1285						    NO_PTR_STRCT);
  1286				return -EINVAL;
  1287			}
  1288			*tmp = '\0';
> 1289			ret = query_btf_struct(arg + 1, ctx);
  1290			*tmp = '*';
  1291	
  1292			if (ret < 0) {
  1293				trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
  1294				return -EINVAL;
  1295			}
  1296	
  1297			ctx->flags |= TPARG_FL_STRUCT;
  1298			tmp += 2;
  1299	
  1300			if (*tmp != '$') {
  1301				trace_probe_log_err(ctx->offset + (tmp - arg),
  1302						    BAD_VAR);
  1303				return -EINVAL;
  1304			}
  1305	
  1306			ctx->offset += tmp - arg;
  1307			ret = parse_probe_vars(tmp, type, pcode, end, ctx);
  1308			ctx->flags &= ~TPARG_FL_STRUCT;
  1309			ctx->last_struct = NULL;
  1310			break;
  1311		default:
  1312			if (isalpha(arg[0]) || arg[0] == '_') {	/* BTF variable */
  1313				if (!tparg_is_function_entry(ctx->flags) &&
  1314				    !tparg_is_function_return(ctx->flags)) {
  1315					trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
  1316					return -EINVAL;
  1317				}
  1318				ret = parse_btf_arg(arg, pcode, end, ctx);
  1319				break;
  1320			}
  1321		}
  1322		if (!ret && code->op == FETCH_OP_NOP) {
  1323			/* Parsed, but do not find fetch method */
  1324			trace_probe_log_err(ctx->offset, BAD_FETCH_ARG);
  1325			ret = -EINVAL;
  1326		}
  1327		return ret;
  1328	}
  1329	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: Masami Hiramatsu @ 2026-05-19  9:53 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: LKML, Linux Trace Kernel, bpf, Masami Hiramatsu,
	Mathieu Desnoyers, Mark Rutland, Peter Zijlstra, Namhyung Kim,
	Takaya Saeki, Douglas Raillard, Tom Zanussi, Andrew Morton,
	Thomas Gleixner, Ian Rogers, Jiri Olsa,
	"Subject:[PATCH  v2]", tracing/pr
In-Reply-To: <20260518232312.0c78f055@gandalf.local.home>

On Mon, 18 May 2026 23:23:12 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> From: Steven Rostedt <rostedt@goodmis.org>
> 
> Add syntax to the FETCHARGS parsing of probes to be able to typecast a
> value to a pointer to a structure.
> 
> Currently, a dereference must be a number, where the user has to figure
> out manually the offset of a member of a structure that they want to
> dereference, unless the member is a function parameter that BTF already has
> information about what structure the argument is pointing to.
> 
> But for event probes, or generic kprobes that records a register that
> happens to be a pointer to a structure, they cannot dereference these
> values with BTF naming, but must use numerical offsets.

Thanks for updating!

> 
> For example, to find out what device a sk_buff is pointing to in the
> net_dev_xmit trace event, one must first use gdb to find the offsets of the
> members of the structures:
> 
>  (gdb) p &((struct sk_buff *)0)->dev
>  $1 = (struct net_device **) 0x10
>  (gdb) p &((struct net_device *)0)->name
>  $2 = (char (*)[16]) 0x118
> 
> And then use the raw numbers to dereference:
> 
>   # echo 'e:xmit net.net_dev_xmit +0x118(+0x10($skbaddr)):string' >> dynamic_events
> 
> If BTF is in the kernel, then instead, the $skbaddr can be typecast to
> sk_buff and use the normal dereference logic.
> 
>   # echo 'e:xmit net.net_dev_xmit (sk_buff*)$skbaddr->dev->name:string' >> dynamic_events

Ah, eprobes support "$PARAM" to access their parameters by name.
That is a bit complicated. Should we allow users to access
parameters without the '$' prefix for eprobes?

>   # echo 1 > events/eprobes/xmit/enable
>   # cat trace
> [..]
>     sshd-session-1022    [000] b..2.   860.249343: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.250061: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.250142: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.263553: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.283820: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.302716: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.322905: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.342828: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.362268: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.382335: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.400856: xmit: (net.net_dev_xmit) arg1="enp7s0"
>     sshd-session-1022    [000] b..2.   860.419893: xmit: (net.net_dev_xmit) arg1="enp7s0"

Looks very nice!

> 
> The syntax is simply: ([STRUCT]*)(VAR)->FIELD[->FIELD..]

Is the STRUCT optional? (Because [] means optional.) I guess not.

I think it may be possible to skip the '*' (or make it optional)
because this is not C-like typecasting: we don't support the "struct"
reserved word, and whitespace is not supported within a
fetcharg. In that case, (STRUCT)VAR->FIELD should work.

BTW, I'm also considering supporting a new cast syntax, which allows
us to dereference a pointer with "container_of". This is typically
used in the kernel.

We usually see this pattern:

struct foo {
	unsigned long		data;
	struct list_head	list;
};

void callback(struct list_head *foo_list)
{
	unsigned long data = container_of(foo_list, struct foo, list)->data;
	...
}

To access @data, simple casting does not work. Thus we need a
new syntax:

	(STRUCT)(PTR,ASSIGN)->FIELD

So the above case, we can do:

	data=(foo)(foo_list,list)->data

This naturally extends the type casting to support container_of()
equivalent casting.

> 
> Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> ---
> Changes since v3: https://patch.msgid.link/20260518095832.52659a3a@gandalf.local.home
> 
>  *** COMPLETE REWRITE FROM V3 ***
> 
> - Rewrote it to use typecasting instead of simply replacing BTF names with
>   offsets.
> 
>  Documentation/trace/kprobetrace.rst |   3 +
>  kernel/trace/trace_probe.c          | 110 ++++++++++++++++++++++++----
>  kernel/trace/trace_probe.h          |   3 +
>  3 files changed, 100 insertions(+), 16 deletions(-)
> 
> diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
> index 3b6791c17e9b..450ac646fe4c 100644
> --- a/Documentation/trace/kprobetrace.rst
> +++ b/Documentation/trace/kprobetrace.rst
> @@ -54,6 +54,9 @@ Synopsis of kprobe_events
>    $retval	: Fetch return value.(\*2)
>    $comm		: Fetch current task comm.
>    +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
> +  (STRUCT*)FETCHARG->FIELD[->FIELD] : If BTF is supported, typecast FETCHARG to
> +                  a pointer to STRUCT and then dereference the pointer defined by
> +                  ->FIELD.
>    \IMM		: Store an immediate value to the argument.
>    NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
>    FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
> diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> index e0d3a0da26af..b0829eb1cb52 100644
> --- a/kernel/trace/trace_probe.c
> +++ b/kernel/trace/trace_probe.c
> @@ -464,6 +464,26 @@ static const char *fetch_type_from_btf_type(struct btf *btf,
>  	return NULL;
>  }
>  
> +static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
> +{
> +	int id;
> +
> +	if (!ctx->btf) {
> +		struct btf *btf;

This needs an empty line here.

> +		id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf);
> +		if (id < 0)
> +			return -EINVAL;

Why don't you return id (it has corresponding errno)?

> +		ctx->btf = btf;
> +	} else {
> +		id = btf_find_by_name_kind(ctx->btf, sname, BTF_KIND_STRUCT);
> +		if (id < 0)
> +			return -EINVAL;

Ditto.

> +	}
> +
> +	ctx->last_struct = btf_type_by_id(ctx->btf, id);
> +	return 0;
> +}
> +
>  static int query_btf_context(struct traceprobe_parse_context *ctx)
>  {
>  	const struct btf_param *param;
> @@ -471,12 +491,12 @@ static int query_btf_context(struct traceprobe_parse_context *ctx)
>  	struct btf *btf;
>  	s32 nr;
>  
> -	if (ctx->btf)
> -		return 0;
> -
>  	if (!ctx->funcname)
>  		return -EINVAL;
>  
> +	if (ctx->btf)
> +		return 0;
> +

Could you tell me why this order is changed?
I think this type casting will allow us to skip checking funcname
because btf context is already specified.

Ah, BTW, we may need to use a special struct btf* for type
casting. If the target function is in a module and the
casting type is defined in vmlinux, those are stored in
different places...


for example,

 p funcA (foo)$arg1->bar buz

In this case, buz needs to use the BTF that includes funcA.
Maybe we need to introduce ctx->func_btf, and reset ctx->btf to it
in traceprobe_parse_probe_arg_body(), where parse_probe_arg()
is called, e.g.

	ctx->last_type = NULL;
+	if (ctx->btf)
+		btf_put(ctx->btf);
+	ctx->btf = ctx->func_btf;
	ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
			      ctx);


>  	type = btf_find_func_proto(ctx->funcname, &btf);
>  	if (!type)
>  		return -ENOENT;
> @@ -514,6 +534,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
>  		ctx->proto = NULL;
>  		ctx->params = NULL;
>  		ctx->nr_params = 0;
> +		ctx->last_struct = NULL;
>  	}
>  }
>  
> @@ -554,22 +575,28 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
>  	struct fetch_insn *code = *pcode;
>  	const struct btf_member *field;
>  	u32 bitoffs, anon_offs;
> +	bool is_struct = ctx->flags & TPARG_FL_STRUCT;
>  	char *next;
>  	int is_ptr;
>  	s32 tid;
>  
>  	do {
> -		/* Outer loop for solving arrow operator ('->') */
> -		if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
> -			trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
> -			return -EINVAL;
> -		}
> -		/* Convert a struct pointer type to a struct type */
> -		type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
> -		if (!type) {
> -			trace_probe_log_err(ctx->offset, BAD_BTF_TID);
> -			return -EINVAL;
> +		if (!is_struct) {
> +			/* Outer loop for solving arrow operator ('->') */
> +			if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
> +				trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
> +				return -EINVAL;
> +			}
> +
> +			/* Convert a struct pointer type to a struct type */
> +			type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
> +			if (!type) {
> +				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
> +				return -EINVAL;
> +			}
>  		}
> +		/* Only the first type can skip being a pointer */
> +		is_struct = false;
>  
>  		bitoffs = 0;
>  		do {
> @@ -635,12 +662,12 @@ static int parse_btf_arg(char *varname,
>  {
>  	struct fetch_insn *code = *pcode;
>  	const struct btf_param *params;
> -	const struct btf_type *type;
> +	const struct btf_type *type = NULL;
>  	char *field = NULL;
>  	int i, is_ptr, ret;
>  	u32 tid;
>  
> -	if (WARN_ON_ONCE(!ctx->funcname))
> +	if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_STRUCT)))
>  		return -EINVAL;
>  
>  	is_ptr = split_next_field(varname, &field, ctx);
> @@ -704,11 +731,18 @@ static int parse_btf_arg(char *varname,
>  			goto found;
>  		}
>  	}
> +
> +	if (ctx->flags & TPARG_FL_STRUCT) {
> +		type = ctx->last_struct;
> +		goto found;

I would rather jump to a type_found: label instead of
checking !type. (Or, save tid instead of type)

> +	}
> +
>  	trace_probe_log_err(ctx->offset, NO_BTFARG);
>  	return -ENOENT;
>  
>  found:
> -	type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
> +	if (!type)
> +		type = btf_type_skip_modifiers(ctx->btf, tid, &tid);

type_found:

>  	if (!type) {
>  		trace_probe_log_err(ctx->offset, BAD_BTF_TID);
>  		return -EINVAL;
> @@ -952,6 +986,12 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
>  	int ret = 0;
>  	int len;
>  
> +	if (ctx->flags & TPARG_FL_STRUCT) {
> +		ret = parse_btf_arg(orig_arg, pcode, end, ctx);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
>  	if (ctx->flags & TPARG_FL_TEVENT) {
>  		if (code->data)
>  			return -EFAULT;
> @@ -1231,6 +1271,43 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
>  				code->op = FETCH_OP_IMM;
>  		}
>  		break;
> +	case '(':
> +		tmp = strrchr(arg, ')');

OK, in this step, we don't support nested cast etc. so this works.

> +		if (!tmp) {
> +			trace_probe_log_err(ctx->offset + strlen(arg),
> +					    DEREF_OPEN_BRACE);
> +			return -EINVAL;
> +		}
> +
> +		tmp--;
> +		if (*tmp != '*') {
> +			trace_probe_log_err(ctx->offset + (tmp - arg),
> +					    NO_PTR_STRCT);
> +			return -EINVAL;
> +		}

So I think this can be optional, not an error.

> +		*tmp = '\0';
> +		ret = query_btf_struct(arg + 1, ctx);
> +		*tmp = '*';
> +
> +		if (ret < 0) {
> +			trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
> +			return -EINVAL;
> +		}
> +
> +		ctx->flags |= TPARG_FL_STRUCT;
> +		tmp += 2;
> +
> +		if (*tmp != '$') {
> +			trace_probe_log_err(ctx->offset + (tmp - arg),
> +					    BAD_VAR);
> +			return -EINVAL;
> +		}

Ok, this limitation will be removed afterwards.

Thanks,

> +
> +		ctx->offset += tmp - arg;
> +		ret = parse_probe_vars(tmp, type, pcode, end, ctx);
> +		ctx->flags &= ~TPARG_FL_STRUCT;
> +		ctx->last_struct = NULL;
> +		break;
>  	default:
>  		if (isalpha(arg[0]) || arg[0] == '_') {	/* BTF variable */
>  			if (!tparg_is_function_entry(ctx->flags) &&
> @@ -1504,6 +1581,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
>  	code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
>  
>  	ctx->last_type = NULL;
> +	ctx->last_struct = NULL;
>  	ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
>  			      ctx);
>  	if (ret < 0)
> diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
> index 262d8707a3df..88ab9f6da591 100644
> --- a/kernel/trace/trace_probe.h
> +++ b/kernel/trace/trace_probe.h
> @@ -394,6 +394,7 @@ static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
>   * TPARG_FL_KERNEL and TPARG_FL_USER are also mutually exclusive.
>   * TPARG_FL_FPROBE and TPARG_FL_TPOINT are optional but it should be with
>   * TPARG_FL_KERNEL.
> + * TPARG_FL_STRUCT is set if an argument was typecast to a structure.
>   */
>  #define TPARG_FL_RETURN BIT(0)
>  #define TPARG_FL_KERNEL BIT(1)
> @@ -402,6 +403,7 @@ static inline int traceprobe_get_entry_data_size(struct trace_probe *tp)
>  #define TPARG_FL_USER   BIT(4)
>  #define TPARG_FL_FPROBE BIT(5)
>  #define TPARG_FL_TPOINT BIT(6)
> +#define TPARG_FL_STRUCT BIT(7)
>  #define TPARG_FL_LOC_MASK	GENMASK(4, 0)
>  
>  static inline bool tparg_is_function_entry(unsigned int flags)
> @@ -423,6 +425,7 @@ struct traceprobe_parse_context {
>  	s32 nr_params;			/* The number of the parameters */
>  	struct btf *btf;		/* The BTF to be used */
>  	const struct btf_type *last_type;	/* Saved type */
> +	const struct btf_type *last_struct;	/* Saved structure */
>  	u32 last_bitoffs;		/* Saved bitoffs */
>  	u32 last_bitsize;		/* Saved bitsize */
>  	struct trace_probe *tp;
> -- 
> 2.53.0
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: kernel test robot @ 2026-05-19 10:10 UTC (permalink / raw)
  To: Steven Rostedt, LKML, Linux Trace Kernel, bpf
  Cc: llvm, oe-kbuild-all, Masami Hiramatsu, Mathieu Desnoyers,
	Mark Rutland, Peter Zijlstra, Namhyung Kim, Takaya Saeki,
	Douglas Raillard, Tom Zanussi, Andrew Morton,
	Linux Memory Management List, Thomas Gleixner, Ian Rogers,
	Jiri Olsa, Subject:[PATCH v2]
In-Reply-To: <20260518232312.0c78f055@gandalf.local.home>

Hi Steven,

kernel test robot noticed the following build errors:

[auto build test ERROR on trace/for-next]
[also build test ERROR on linus/master v7.1-rc4 next-20260518]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Steven-Rostedt/tracing-probes-Allow-use-of-BTF-names-to-dereference-pointers/20260519-121930
base:   https://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace for-next
patch link:    https://lore.kernel.org/r/20260518232312.0c78f055%40gandalf.local.home
patch subject: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
config: sparc64-defconfig (https://download.01.org/0day-ci/archive/20260519/202605191828.Y3E73pH1-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260519/202605191828.Y3E73pH1-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605191828.Y3E73pH1-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/trace/trace_probe.c:1289:9: error: call to undeclared function 'query_btf_struct'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    1289 |                 ret = query_btf_struct(arg + 1, ctx);
         |                       ^
   1 error generated.


vim +/query_btf_struct +1289 kernel/trace/trace_probe.c

  1120	
  1121	/* Recursive argument parser */
  1122	static int
  1123	parse_probe_arg(char *arg, const struct fetch_type *type,
  1124			struct fetch_insn **pcode, struct fetch_insn *end,
  1125			struct traceprobe_parse_context *ctx)
  1126	{
  1127		struct fetch_insn *code = *pcode;
  1128		unsigned long param;
  1129		int deref = FETCH_OP_DEREF;
  1130		long offset = 0;
  1131		char *tmp;
  1132		int ret = 0;
  1133	
  1134		switch (arg[0]) {
  1135		case '$':
  1136			ret = parse_probe_vars(arg, type, pcode, end, ctx);
  1137			break;
  1138	
  1139		case '%':	/* named register */
  1140			if (ctx->flags & (TPARG_FL_TEVENT | TPARG_FL_FPROBE)) {
  1141				/* eprobe and fprobe do not handle registers */
  1142				trace_probe_log_err(ctx->offset, BAD_VAR);
  1143				break;
  1144			}
  1145			ret = regs_query_register_offset(arg + 1);
  1146			if (ret >= 0) {
  1147				code->op = FETCH_OP_REG;
  1148				code->param = (unsigned int)ret;
  1149				ret = 0;
  1150			} else
  1151				trace_probe_log_err(ctx->offset, BAD_REG_NAME);
  1152			break;
  1153	
  1154		case '@':	/* memory, file-offset or symbol */
  1155			if (isdigit(arg[1])) {
  1156				ret = kstrtoul(arg + 1, 0, &param);
  1157				if (ret) {
  1158					trace_probe_log_err(ctx->offset, BAD_MEM_ADDR);
  1159					break;
  1160				}
  1161				/* load address */
  1162				code->op = FETCH_OP_IMM;
  1163				code->immediate = param;
  1164			} else if (arg[1] == '+') {
  1165				/* kprobes don't support file offsets */
  1166				if (ctx->flags & TPARG_FL_KERNEL) {
  1167					trace_probe_log_err(ctx->offset, FILE_ON_KPROBE);
  1168					return -EINVAL;
  1169				}
  1170				ret = kstrtol(arg + 2, 0, &offset);
  1171				if (ret) {
  1172					trace_probe_log_err(ctx->offset, BAD_FILE_OFFS);
  1173					break;
  1174				}
  1175	
  1176				code->op = FETCH_OP_FOFFS;
  1177				code->immediate = (unsigned long)offset;  // imm64?
  1178			} else {
  1179				/* uprobes don't support symbols */
  1180				if (!(ctx->flags & TPARG_FL_KERNEL)) {
  1181					trace_probe_log_err(ctx->offset, SYM_ON_UPROBE);
  1182					return -EINVAL;
  1183				}
  1184				/* Preserve symbol for updating */
  1185				code->op = FETCH_NOP_SYMBOL;
  1186				code->data = kstrdup(arg + 1, GFP_KERNEL);
  1187				if (!code->data)
  1188					return -ENOMEM;
  1189				if (++code == end) {
  1190					trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
  1191					return -EINVAL;
  1192				}
  1193				code->op = FETCH_OP_IMM;
  1194				code->immediate = 0;
  1195			}
  1196			/* These are fetching from memory */
  1197			if (++code == end) {
  1198				trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
  1199				return -EINVAL;
  1200			}
  1201			*pcode = code;
  1202			code->op = FETCH_OP_DEREF;
  1203			code->offset = offset;
  1204			break;
  1205	
  1206		case '+':	/* deref memory */
  1207		case '-':
  1208			if (arg[1] == 'u') {
  1209				deref = FETCH_OP_UDEREF;
  1210				arg[1] = arg[0];
  1211				arg++;
  1212			}
  1213			if (arg[0] == '+')
  1214				arg++;	/* Skip '+', because kstrtol() rejects it. */
  1215			tmp = strchr(arg, '(');
  1216			if (!tmp) {
  1217				trace_probe_log_err(ctx->offset, DEREF_NEED_BRACE);
  1218				return -EINVAL;
  1219			}
  1220			*tmp = '\0';
  1221			ret = kstrtol(arg, 0, &offset);
  1222			if (ret) {
  1223				trace_probe_log_err(ctx->offset, BAD_DEREF_OFFS);
  1224				break;
  1225			}
  1226			ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
  1227			arg = tmp + 1;
  1228			tmp = strrchr(arg, ')');
  1229			if (!tmp) {
  1230				trace_probe_log_err(ctx->offset + strlen(arg),
  1231						    DEREF_OPEN_BRACE);
  1232				return -EINVAL;
  1233			} else {
  1234				const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags);
  1235				int cur_offs = ctx->offset;
  1236	
  1237				*tmp = '\0';
  1238				ret = parse_probe_arg(arg, t2, &code, end, ctx);
  1239				if (ret)
  1240					break;
  1241				ctx->offset = cur_offs;
  1242				if (code->op == FETCH_OP_COMM ||
  1243				    code->op == FETCH_OP_DATA) {
  1244					trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
  1245					return -EINVAL;
  1246				}
  1247				if (++code == end) {
  1248					trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
  1249					return -EINVAL;
  1250				}
  1251				*pcode = code;
  1252	
  1253				code->op = deref;
  1254				code->offset = offset;
  1255				/* Reset the last type if used */
  1256				ctx->last_type = NULL;
  1257			}
  1258			break;
  1259		case '\\':	/* Immediate value */
  1260			if (arg[1] == '"') {	/* Immediate string */
  1261				ret = __parse_imm_string(arg + 2, &tmp, ctx->offset + 2);
  1262				if (ret)
  1263					break;
  1264				code->op = FETCH_OP_DATA;
  1265				code->data = tmp;
  1266			} else {
  1267				ret = str_to_immediate(arg + 1, &code->immediate);
  1268				if (ret)
  1269					trace_probe_log_err(ctx->offset + 1, BAD_IMM);
  1270				else
  1271					code->op = FETCH_OP_IMM;
  1272			}
  1273			break;
  1274		case '(':
  1275			tmp = strrchr(arg, ')');
  1276			if (!tmp) {
  1277				trace_probe_log_err(ctx->offset + strlen(arg),
  1278						    DEREF_OPEN_BRACE);
  1279				return -EINVAL;
  1280			}
  1281	
  1282			tmp--;
  1283			if (*tmp != '*') {
  1284				trace_probe_log_err(ctx->offset + (tmp - arg),
  1285						    NO_PTR_STRCT);
  1286				return -EINVAL;
  1287			}
  1288			*tmp = '\0';
> 1289			ret = query_btf_struct(arg + 1, ctx);
  1290			*tmp = '*';
  1291	
  1292			if (ret < 0) {
  1293				trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
  1294				return -EINVAL;
  1295			}
  1296	
  1297			ctx->flags |= TPARG_FL_STRUCT;
  1298			tmp += 2;
  1299	
  1300			if (*tmp != '$') {
  1301				trace_probe_log_err(ctx->offset + (tmp - arg),
  1302						    BAD_VAR);
  1303				return -EINVAL;
  1304			}
  1305	
  1306			ctx->offset += tmp - arg;
  1307			ret = parse_probe_vars(tmp, type, pcode, end, ctx);
  1308			ctx->flags &= ~TPARG_FL_STRUCT;
  1309			ctx->last_struct = NULL;
  1310			break;
  1311		default:
  1312			if (isalpha(arg[0]) || arg[0] == '_') {	/* BTF variable */
  1313				if (!tparg_is_function_entry(ctx->flags) &&
  1314				    !tparg_is_function_return(ctx->flags)) {
  1315					trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
  1316					return -EINVAL;
  1317				}
  1318				ret = parse_btf_arg(arg, pcode, end, ctx);
  1319				break;
  1320			}
  1321		}
  1322		if (!ret && code->op == FETCH_OP_NOP) {
  1323			/* Parsed, but do not find fetch method */
  1324			trace_probe_log_err(ctx->offset, BAD_FETCH_ARG);
  1325			ret = -EINVAL;
  1326		}
  1327		return ret;
  1328	}
  1329	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* [PATCH] gpu: host1x: trace: fix string fields in host1x traces
From: Artur Kowalski @ 2026-05-19 10:16 UTC (permalink / raw)
  To: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers
  Cc: linux-kernel, linux-trace-kernel, linux-tegra, Artur Kowalski

Use __assign_str and __get_str as required by the tracing subsystem. Fixes
string fields being rejected by the verifier and unreadable from
userspace.

Tested on v6.18.21.

Signed-off-by: Artur Kowalski <arturkow2000@gmail.com>
---
 include/trace/events/host1x.h | 50 ++++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/include/trace/events/host1x.h b/include/trace/events/host1x.h
index 1ba84b738e46..1b6aeb7b177b 100644
--- a/include/trace/events/host1x.h
+++ b/include/trace/events/host1x.h
@@ -21,9 +21,11 @@ struct host1x_bo;
 DECLARE_EVENT_CLASS(host1x,
 	TP_PROTO(const char *name),
 	TP_ARGS(name),
-	TP_STRUCT__entry(__field(const char *, name)),
-	TP_fast_assign(__entry->name = name;),
-	TP_printk("name=%s", __entry->name)
+	TP_STRUCT__entry(__string(name, name)),
+	TP_fast_assign(
+		__assign_str(name);
+	),
+	TP_printk("name=%s", __get_str(name))
 );
 
 DEFINE_EVENT(host1x, host1x_channel_open,
@@ -52,19 +54,19 @@ TRACE_EVENT(host1x_cdma_push,
 	TP_ARGS(name, op1, op2),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, name)
 		__field(u32, op1)
 		__field(u32, op2)
 	),
 
 	TP_fast_assign(
-		__entry->name = name;
+		__assign_str(name);
 		__entry->op1 = op1;
 		__entry->op2 = op2;
 	),
 
 	TP_printk("name=%s, op1=%08x, op2=%08x",
-		__entry->name, __entry->op1, __entry->op2)
+		__get_str(name), __entry->op1, __entry->op2)
 );
 
 TRACE_EVENT(host1x_cdma_push_wide,
@@ -73,7 +75,7 @@ TRACE_EVENT(host1x_cdma_push_wide,
 	TP_ARGS(name, op1, op2, op3, op4),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, name)
 		__field(u32, op1)
 		__field(u32, op2)
 		__field(u32, op3)
@@ -81,7 +83,7 @@ TRACE_EVENT(host1x_cdma_push_wide,
 	),
 
 	TP_fast_assign(
-		__entry->name = name;
+		__assign_str(name);
 		__entry->op1 = op1;
 		__entry->op2 = op2;
 		__entry->op3 = op3;
@@ -89,7 +91,7 @@ TRACE_EVENT(host1x_cdma_push_wide,
 	),
 
 	TP_printk("name=%s, op1=%08x, op2=%08x, op3=%08x op4=%08x",
-		__entry->name, __entry->op1, __entry->op2, __entry->op3,
+		__get_str(name), __entry->op1, __entry->op2, __entry->op3,
 		__entry->op4)
 );
 
@@ -100,7 +102,7 @@ TRACE_EVENT(host1x_cdma_push_gather,
 	TP_ARGS(name, bo, words, offset, cmdbuf),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, name)
 		__field(struct host1x_bo *, bo)
 		__field(u32, words)
 		__field(u32, offset)
@@ -114,14 +116,14 @@ TRACE_EVENT(host1x_cdma_push_gather,
 					words * sizeof(u32));
 		}
 		__entry->cmdbuf = cmdbuf;
-		__entry->name = name;
+		__assign_str(name);
 		__entry->bo = bo;
 		__entry->words = words;
 		__entry->offset = offset;
 	),
 
 	TP_printk("name=%s, bo=%p, words=%u, offset=%d, contents=[%s]",
-	  __entry->name, __entry->bo,
+	  __get_str(name), __entry->bo,
 	  __entry->words, __entry->offset,
 	  __print_hex(__get_dynamic_array(cmdbuf),
 		  __entry->cmdbuf ? __entry->words * 4 : 0))
@@ -134,7 +136,7 @@ TRACE_EVENT(host1x_channel_submit,
 	TP_ARGS(name, cmdbufs, relocs, syncpt_id, syncpt_incrs),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, name)
 		__field(u32, cmdbufs)
 		__field(u32, relocs)
 		__field(u32, syncpt_id)
@@ -142,7 +144,7 @@ TRACE_EVENT(host1x_channel_submit,
 	),
 
 	TP_fast_assign(
-		__entry->name = name;
+		__assign_str(name);
 		__entry->cmdbufs = cmdbufs;
 		__entry->relocs = relocs;
 		__entry->syncpt_id = syncpt_id;
@@ -151,7 +153,7 @@ TRACE_EVENT(host1x_channel_submit,
 
 	TP_printk("name=%s, cmdbufs=%u, relocs=%u, syncpt_id=%u, "
 		  "syncpt_incrs=%u",
-		  __entry->name, __entry->cmdbufs, __entry->relocs,
+		  __get_str(name), __entry->cmdbufs, __entry->relocs,
 		  __entry->syncpt_id, __entry->syncpt_incrs)
 );
 
@@ -161,19 +163,19 @@ TRACE_EVENT(host1x_channel_submitted,
 	TP_ARGS(name, syncpt_base, syncpt_max),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, name)
 		__field(u32, syncpt_base)
 		__field(u32, syncpt_max)
 	),
 
 	TP_fast_assign(
-		__entry->name = name;
+		__assign_str(name);
 		__entry->syncpt_base = syncpt_base;
 		__entry->syncpt_max = syncpt_max;
 	),
 
 	TP_printk("name=%s, syncpt_base=%d, syncpt_max=%d",
-		__entry->name, __entry->syncpt_base, __entry->syncpt_max)
+		__get_str(name), __entry->syncpt_base, __entry->syncpt_max)
 );
 
 TRACE_EVENT(host1x_channel_submit_complete,
@@ -182,19 +184,19 @@ TRACE_EVENT(host1x_channel_submit_complete,
 	TP_ARGS(name, count, thresh),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, name)
 		__field(int, count)
 		__field(u32, thresh)
 	),
 
 	TP_fast_assign(
-		__entry->name = name;
+		__assign_str(name);
 		__entry->count = count;
 		__entry->thresh = thresh;
 	),
 
 	TP_printk("name=%s, count=%d, thresh=%d",
-		__entry->name, __entry->count, __entry->thresh)
+		__get_str(name), __entry->count, __entry->thresh)
 );
 
 TRACE_EVENT(host1x_wait_cdma,
@@ -203,16 +205,16 @@ TRACE_EVENT(host1x_wait_cdma,
 	TP_ARGS(name, eventid),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, name)
 		__field(u32, eventid)
 	),
 
 	TP_fast_assign(
-		__entry->name = name;
+		__assign_str(name);
 		__entry->eventid = eventid;
 	),
 
-	TP_printk("name=%s, event=%d", __entry->name, __entry->eventid)
+	TP_printk("name=%s, event=%d", __get_str(name), __entry->eventid)
 );
 
 TRACE_EVENT(host1x_syncpt_load_min,

---
base-commit: ab5fce87a778cb780a05984a2ca448f2b41aafbf
change-id: 20260519-host1x-tracing-e2d608ec5e37

Best regards,
--  
Artur Kowalski <arturkow2000@gmail.com>


^ permalink raw reply related

* Re: [PATCH] ftrace: fix race in __modify_ftrace_direct() between tmp_ops registration and direct_functions update
From: Jiri Olsa @ 2026-05-19 10:55 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andrii Kuchmenko, linux-trace-kernel, mhiramat, linux-kernel,
	stable
In-Reply-To: <20260518121906.4eebad77@gandalf.local.home>

On Mon, May 18, 2026 at 12:19:06PM -0400, Steven Rostedt wrote:
> On Sun, 17 May 2026 14:01:53 +0300
> Andrii Kuchmenko <capyenglishlite@gmail.com> wrote:
> 
> > In __modify_ftrace_direct(), register_ftrace_function_nolock() makes
> > tmp_ops visible in ftrace_ops_list before entry->direct is updated
> > under ftrace_lock. During this window any CPU entering the traced
> > function calls call_direct_funcs(), reads the old address from
> > direct_functions via RCU, and jumps to it via
> > arch_ftrace_set_direct_caller(). If the caller freed or invalidated
> > the old trampoline before calling modify_ftrace_direct(), this is a
> > use-after-free in executable code context.
> > 
> > The race window:
> > 
> >   CPU 0 (__modify_ftrace_direct)       CPU 1 (executing traced func)
> >   ──────────────────────────────       ──────────────────────────────
> >   register_ftrace_function_nolock()
> >     -> tmp_ops visible in ops_list  
> >                                         call_direct_funcs()
> >                                           ftrace_find_rec_direct() -> old_addr
> >                                           arch_ftrace_set_direct_caller(old_addr)
> >                                           jump to old_addr  <- UAF if freed
> 
> You do not state where old_addr is freed.
> 
> >   mutex_lock(&ftrace_lock)
> >   entry->direct = addr   <- too late
> >   mutex_unlock(&ftrace_lock)
> > 
> > Fix: update entry->direct under ftrace_lock BEFORE registering tmp_ops.
> > Any CPU that observes tmp_ops in ftrace_ops_list after this point will
> > already see the new address when it calls ftrace_find_rec_direct().
> > Add smp_wmb() between the store and the registration to ensure the
> > write is visible on weakly-ordered architectures before tmp_ops
> > becomes observable via ftrace_ops_list.
> > 
> > On error from register_ftrace_function_nolock(), restore entry->direct
> > to old_addr since tmp_ops never became visible to other CPUs.
> 
> The above statement is incorrect. The tmp_ops hash entries are also
> *shared* with the ops that is being updated. That is, by changing the entry->direct, you 
> 
> > 
> > This affects all callers of __modify_ftrace_direct(), including:
> >   - modify_ftrace_direct() used by kernel modules and live patching
> >   - modify_ftrace_direct_nolock() used by BPF trampolines
> >     (kernel/bpf/trampoline.c) reachable with CAP_BPF + CAP_PERFMON
> > 
> > Fixes: 0567d6809440 ("ftrace: Add modify_ftrace_direct()")
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Cc: Masami Hiramatsu <mhiramat@kernel.org>
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Andrii Kuchmenko <capyenglishlite@gmail.com>
> > ---
> >  kernel/trace/ftrace.c | 35 +++++++++++++++++++++++++----------
> >  1 file changed, 25 insertions(+), 10 deletions(-)
> > 
> > diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
> > index a1b2c3d4e5f6..b7c8d9e0f1a2 100644
> > --- a/kernel/trace/ftrace.c
> > +++ b/kernel/trace/ftrace.c
> > @@ -5950,6 +5950,7 @@ static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
> >  	struct ftrace_func_entry *entry;
> >  	struct ftrace_ops tmp_ops;
> > +	unsigned long old_addr;
> >  	int err;
> >  
> >  	lockdep_assert_held(&direct_mutex);
> > @@ -5960,22 +5961,36 @@ static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
> >  	if (!entry)
> >  		return -ENODEV;
> >  
> > -	/*
> > -	 * tmp_ops is registered into ftrace_ops_list here, making it
> > -	 * visible to all CPUs executing the traced function. However,
> > -	 * entry->direct is not updated until after this call returns,
> > -	 * leaving a window where CPUs read the stale (possibly freed)
> > -	 * direct call address via ftrace_find_rec_direct().
> > -	 */
> 
> Are you posting patches on top of your own patches that are not public?

hi,

right, the original email states 5.15 is affected, but I don't see
__modify_ftrace_direct in stable version v5.15.207 .. what kernel
version is the patch for?

> 
> > -	err = register_ftrace_function_nolock(&tmp_ops);
> > -	if (err)
> > -		return err;
> > -
> > +	/* Save old address in case we need to roll back on error. */
> > +	old_addr = entry->direct;
> > +
> > +	/*
> > +	 * Update entry->direct BEFORE registering tmp_ops into
> > +	 * ftrace_ops_list. This closes the race window where a CPU
> > +	 * executing the traced function could read the old (potentially
> > +	 * freed) direct call address between tmp_ops becoming visible
> > +	 * and entry->direct being updated.
> > +	 *
> > +	 * Any CPU that observes tmp_ops in ftrace_ops_list after the
> > +	 * smp_wmb() below is guaranteed to see the new address when
> > +	 * it calls ftrace_find_rec_direct().
> > +	 */
> >  	mutex_lock(&ftrace_lock);
> >  	entry->direct = addr;
> >  	mutex_unlock(&ftrace_lock);
> >  
> > +	/*
> > +	 * Ensure entry->direct store is ordered before tmp_ops
> > +	 * becomes visible via ftrace_ops_list on weakly-ordered archs.
> > +	 */
> > +	smp_wmb();
> 
> You do realize that register_ftrace_function_nolock() is itself a full
> memory barrier? It's doing code modification which requires lots of
> barriers to work.
> 
> Still, the only bug I see that is possible is that the caller may need to
> do some synchronize RCU calls before freeing an old trampoline.
> 
> Can you show a path that doesn't do that?

+1 

jirka

^ permalink raw reply

* Re: [PATCH v3 1/2] spi: qcom-geni: trace: Add trace events for Qualcomm GENI SPI
From: Mukesh Savaliya @ 2026-05-19 10:56 UTC (permalink / raw)
  To: Praveen Talari, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Mark Brown
  Cc: linux-kernel, linux-trace-kernel, linux-arm-msm, linux-spi,
	aniket.randive, chandana.chiluveru, jyothi.seerapu, Konrad Dybcio
In-Reply-To: <20260518-add-tracepoints-for-qcom-geni-spi-v3-1-7928f6810a79@oss.qualcomm.com>



On 5/18/2026 10:30 PM, Praveen Talari wrote:
> Add tracepoint support to the Qualcomm GENI SPI driver to provide
> runtime visibility into driver behavior without requiring invasive debug
> patches.
> 
> The trace events cover clock and setup parameter configuration,
> transfer metadata, and interrupt status, making it easier to diagnose
> communication issues in the field.
> 
> Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
> Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
> ---
> v2->v3:
> - Renamed geni_spi_fifo_params to geni_spi_setup_params trace event.
> - Updated commit text.
> 
Reviewed-by: Mukesh Kumar Savaliya <mukesh.savaliya@oss.qualcomm.com>

^ permalink raw reply

* Re: [PATCH v3 2/2] spi: qcom-geni: Add trace events for Qualcomm GENI SPI driver
From: Mukesh Savaliya @ 2026-05-19 10:59 UTC (permalink / raw)
  To: Praveen Talari, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Mark Brown
  Cc: linux-kernel, linux-trace-kernel, linux-arm-msm, linux-spi,
	aniket.randive, chandana.chiluveru, jyothi.seerapu, Konrad Dybcio
In-Reply-To: <20260518-add-tracepoints-for-qcom-geni-spi-v3-2-7928f6810a79@oss.qualcomm.com>

Hi Praveen, one question below.

On 5/18/2026 10:30 PM, Praveen Talari wrote:
> Add tracepoints to the Qualcomm GENI (Generic Interface) SPI driver.
> These trace events enable runtime debugging and performance analysis
> of SPI operations.
> 
> The trace events capture SPI clock configuration, setup parameters,
> transfer details, interrupt status.
Don't you need trace logs around PM operations, i.e. runtime and system PM?
> 
> Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
> Signed-off-by: Praveen Talari <praveen.talari@oss.qualcomm.com>
> ---

[...]

^ permalink raw reply

* Re: [PATCH] Re: Re: [RFC PATCH v2 04/10] rv/da: add pre-allocated storage pool for per-object monitors
From: Gabriele Monaco @ 2026-05-19 11:14 UTC (permalink / raw)
  To: Wen Yang; +Cc: linux-kernel, linux-trace-kernel, rostedt
In-Reply-To: <6d2f3490-5e30-4966-a3cd-372a34e10ba2@linux.dev>

Hi Wen,

On Mon, 2026-05-18 at 01:13 +0800, Wen Yang wrote:
> 
> Yes.  The ftracetest check_requires logic calls `command -v <binary>` to
> satisfy `requires: <name>:program` directives.  Without the script's
> directory in PATH those checks evaluate to exit_unsupported and test cases
> are skipped rather than run.  The make path avoids this only because make
> sets OUTDIR and the runner appends it to PATH internally.
> 

So you're overriding PATH so the selftest's binaries can be found from
the test, right?

Wouldn't it be simpler to just put the absolute paths in the tests and
not touch PATH?

If the selftests are run via makefile, it ensures the required binaries
are built and available, so there's no need to go through the `requires:
<name>:program` infrastructure (that's more about what's installed on
the system).

Or if you don't want anything hardcoded, you could pass the $OUTDIR from
the environment and use that in scripts, whatever looks cleaner.

Does it make sense to you?

> 
> -- Patch 04: pre-allocated storage pool
> 
>  > Since you're using spinlocks, isn't that going to sleep on PREEMPT_RT?
> 
> User-mode uprobe handlers run with preempt_count == 0, fully preemptible on
> both PREEMPT_RT and non-PREEMPT_RT.  The strongest evidence is in tlob
> itself: tlob_start_task() takes a mutex and calls kmem_cache_zalloc(...,
> GFP_KERNEL) from the uprobe entry handler; both are illegal in atomic
> context and would trigger lockdep splats immediately.
> 
> On PREEMPT_RT, spinlock_t becoming a sleeping lock in the uprobe handler
> is fine: both call sites (da_create_or_get_pool() from the handler and
> da_pool_return_cb() from the rcuc kthread) are in sleepable task context.
> 

Yeah exactly, the uprobe is fine with anything (even the automatic
`kmalloc_nolock`), but sure preallocation at least guarantees the slots
are there.

>  > We can have a macro DA_MON_ALLOCATION_STRATEGY = {DA_ALLOC_AUTO,
>  > DA_ALLOC_POOL, DA_ALLOC_MANUAL} where DA_MON_POOL also requires
>  > DA_MON_POOL_SIZE to be defined (force that with an #error).
>  >
>  > Anyway, this way you probably wouldn't need to define a different init
>  > function and let everything handled more transparently.
>  >
>  > Also you don't need to call da_create_or_get() explicitly,
>  > da_handle_start_event() should do it for you.
> 
> Agreed on all counts.  We plan to implement this in v3 as follows.
> The three strategies would be a compile-time selection in da_monitor.h:
> 
>    DA_ALLOC_AUTO   (default) - lock-free kmalloc_nolock on the hot path;
>                                unbounded capacity.
> 
>    DA_ALLOC_POOL             - pre-allocated fixed-size pool;
>                                DA_MON_POOL_SIZE required, enforced with
>                                #error if missing.
> 
>    DA_ALLOC_MANUAL           - caller pre-inserts storage via
>                                da_create_empty_storage() before the first
>                                da_handle_start_event(); the framework only
>                                links the target field.
> 
> da_monitor_init_prealloc() would be removed; da_monitor_init() would
> select pool or kmalloc initialisation internally based on the strategy.
> 
> da_handle_start_event() and da_handle_start_run_event() would both call
> da_prepare_storage() at compile time:
> 
>    DA_ALLOC_AUTO   -> da_create_storage()        (kmalloc_nolock)
>    DA_ALLOC_POOL   -> da_create_or_get_pool()
>    DA_ALLOC_MANUAL -> da_fill_empty_storage()    (link target into pre-
>                       allocated slot; no allocation on the hot path)
> 
> No explicit da_create_or_get() call would be needed in any monitor.
> 
> da_create_or_get_kmalloc() would be removed: as you noted, a caller that
> uses kmalloc_nolock does so because locking is forbidden; a GFP_KERNEL
> fallback is equally forbidden if the lockless attempt fails, so the
> function has no viable use case.
> 
> tlob would define:
>    #define DA_MON_ALLOCATION_STRATEGY DA_ALLOC_POOL
>    #define DA_MON_POOL_SIZE            TLOB_MAX_MONITORED
> 
> nomiss would define:
>    #define DA_MON_ALLOCATION_STRATEGY DA_ALLOC_MANUAL
> 
> and call da_create_empty_storage() from handle_sys_enter() (the
> sched_setscheduler syscall path), which runs in safe task context;
> da_fill_empty_storage() would then link the sched_dl_entity target on
> the first da_handle_start_run_event() call in handle_sched_switch().

Yeah good point, there's no need to make it a special path even if we
have the target ready, da_handle_start_run_event() can do it just fine.

> 
> 
> -- Patch 05: generic uprobe infrastructure
> 
> Carried unchanged into v3 (as part of the 08-b split described below).
> 
> 
> -- Patch 06: rvgen __init arrow reset
> 
> Thanks, carried unchanged into v3.
> 

Well, if you don't need reset() on the __init arrow we can drop this,
right? Also it doesn't seem fully wired with the rest and requires a
separate event to do handle_monitor_start(), which can be only just
another handler for tlob, nothing general.

> 
>  > Why don't you make it a separate event (e.g. "start_tlob") [...] then
>  > you also wouldn't need to call reset() and start_timer() manually.
> 
> Good suggestion.  We plan to use a dedicated start_tlob event instead,
> with a self-loop in tlob.dot:
> 
>    "running" -> "running" [ label = "start;reset(clk_elapsed)" ]
> 
> da_handle_start_run_event(task->pid, ws, start_tlob) would put the
> monitor into running and deliver start_tlob, which resets clk_elapsed
> and arms the budget hrtimer via the generated ha_setup_invariants() —
> no manual reset() or start_timer() calls needed.
> 
> One guard would be added in tlob's ha_setup_invariants() to make the
> self-loop work correctly:
> 
>    if (next_state == curr_state && event != start_tlob)
>        return;
> 
> Without this, the start_tlob self-loop would be treated the same as any
> repeated switch_in (already running) and ha_setup_invariants() would
> return early, leaving the timer unarmed.  Does this look right to you?
> 

If you just add a separate event rvgen should take care of everything,
you should be able to take ha_verify_constraint() and friends as-is from
the generated code.

But yeah, that's what it would end up doing.

> 
> -- Patch 08: tlob monitor
> 
> --- Patch structure ---
> 
>  > Could you have everything that isn't strictly tlob-related in another
>  > patch.
> 
> Agreed.  With the ioctl interface deferred (see below), v3 would keep
> patch 08 as the tlob monitor only:
> 
>    05-b: rv: extend uprobe API with three-phase detach helpers
>          (rv_uprobe.c, rv_uprobe.h, rv_uprobe_detach refactoring)
>          — extension of patch 05, independent of tlob
> 
>    08:   rv/tlob: add the tlob monitor itself
>          (tlob.c, tlob.h, tlob_trace.h, Kconfig/Makefile, Documentation,
>           rv_trace.h include; ha_monitor.h EVENT_NONE_LBL override
>           bundled here as it is only needed by tlob)
> 
> The chardev infrastructure (rv_chardev.c, rv.h additions) and the UAPI
> header (include/uapi/linux/rv.h) would move to a follow-up series
> together with the ioctl self-instrumentation feature.
> 
> --- ioctl interface design ---
> 
>  > I'm not particularly fond of ioctls, they aren't that flexible and in
>  > this way I don't really see an added value.
>  > [...] cannot the same thing be achieved using uprobes alone, e.g. by
>  > registering a function address or the current instruction pointer?
>  > [...] wouldn't a sysfs/tracefs file achieve a similar purpose without
>  > much of the boilerplate code?
> 
> Fair point.  We plan to ship v3 with the tracefs/uprobe interface only
> and defer the ioctl (/dev/rv chardev) to a follow-up series once there
> is a concrete in-tree user that requires it.
> 
> The unique value of the ioctl is that TLOB_IOCTL_TRACE_STOP returns a
> synchronous per-call result (-EOVERFLOW or 0) to the calling thread,
> which neither uprobes nor tracefs writes can provide.  We want to keep
> that option open for later, but agree it should not block the initial
> tlob submission.
> 
> Does this approach work for you?
> 
> What is your preference?

Yeah looks good to me.
Ioctls are cumbersome to set up also for the user, perhaps another sysfs
file in the monitor directory would keep the control entirely in tlob.c
and give you roughly the same value with easier setup.

Heck we might even think of an RV reactor that does that: e.g. creates a
file where reads sleep until the first reaction (-EOVERFLOW) and returns
0 in other scenarios. I'm gonna have a thought on that, but anyway I
don't see why a sysfs file cannot do this.

Let's defer it for now.

> 
> --- Handler simplification ---
> 
>  > Perhaps keep the handler simpler by moving this reporting to a helper
>  > function and use guard(rcu)() there.
> 
> Done.  The accumulation logic is extracted into three inline helpers, each
> using scoped_guard(rcu) and returning bool (true if the task is monitored):
> 
>    tlob_acc_running(prev)   - accumulate running_ns on sched-out
>    tlob_acc_waiting(next)   - accumulate waiting_ns on sched-in
>    tlob_acc_sleeping(task)  - accumulate sleeping_ns on wakeup
> 
> handle_sched_switch() and handle_sched_wakeup() become one-liners:
> 
>    static void handle_sched_switch(...)
>    {
>        bool prev_preempted = (prev_state == 0);
> 
>        if (tlob_acc_running(prev))
>            da_handle_event(prev->pid, NULL,
>                            prev_preempted ? preempt_tlob : sleep_tlob);
>        if (tlob_acc_waiting(next))
>            da_handle_event(next->pid, NULL, switch_in_tlob);
>    }

Yeah sounds good.

> 
>  > You probably don't need these. da_handle_event should skip tasks without
>  > a monitor.
> 
> Agreed; the do_prev/do_next flags are gone.  The helpers return false
> for unmonitored tasks, and da_handle_event() skips them too — both paths
> are no-ops for tasks with no pool entry.
> 
> --- scoped_guard(rcu) ---
> 
>  > That should be a scoped_guard(rcu), definitely use guards if you have
>  > return paths, the compiler is going to clean up (unlock) for you.
> 
> Applied to all RCU-protected sections in tlob_start_task() and
> tlob_stop_task().  tlob_start_task() now uses guard(mutex) for the
> serialised duplicate-check (replacing the explicit mutex_lock/unlock),
> and tlob_stop_task() uses scoped_guard(rcu) for the atomic CAS section:
> 
>    scoped_guard(rcu) {
>        ws = da_get_target_by_id(task->pid);
>        if (!ws)
>            return -ESRCH;
>        ...
>        if (atomic_cmpxchg_release(&ws->stopping, 0, 1) != 0)
>            return -EAGAIN;
>    }

Perfect.

> 
> --- tlob_stop_all removal ---
> 
>  > All this function does should be done by da_monitor_destroy. We could
>  > add a way to pass some additional deallocation for all the other cleanup
>  > you're doing on each storage.  Something like a da_extra_cleanup() you
>  > can define as whatever you need and gets called in all per-obj
>  > destruction paths.
> 
> Agreed.  tlob_stop_all() (~50 lines) has been removed entirely.
> 
> A da_extra_cleanup() hook macro is introduced in da_monitor.h: the default
> is a no-op; a monitor may override it before including the header.  tlob
> defines:
> 
>    static inline void tlob_extra_cleanup(struct da_monitor *da_mon)
>    {
>        struct ha_monitor *ha_mon = to_ha_monitor(da_mon);
>        struct tlob_task_state *ws = da_get_target(ha_mon);
> 
>        if (!ws)
>            return;
>        if (atomic_cmpxchg_release(&ws->stopping, 0, 1) != 0)
>            return;

>        ha_cancel_timer_sync(ha_mon);
After my patch making timer callbacks RCU read-side critical section,
you won't need that, just let the usual reset asynchronously stop the
timer and put everything that needs it stopped in your RCU callback.

Of course make sure the timer was stopped before this extra cleanup, so
put the macro accordingly.

I don't think da_extra_cleanup in general should be expected to sleep
and call_rcu should do the heavy lifting (it may run from any tracepoint).

Anyway we can see it later after that's merged.

>        atomic_dec(&tlob_num_monitored);
>        put_task_struct(ws->task);
>        call_rcu(&ws->rcu, tlob_free_rcu);
>    }
>    #define da_extra_cleanup tlob_extra_cleanup
> 
> da_monitor_destroy() iterates remaining entries via da_extra_cleanup +
> hash_del_rcu + call_rcu, then waits for all callbacks via rcu_barrier().
> tlob's disable path is now simply:
> 
>    static void __tlob_destroy_monitor(void)
>    {
>        da_monitor_destroy();
>    }

Looks good, let's see the full picture.

> --- EVENT_NONE_LBL ---
> 
>  > Why don't you just override EVENT_NONE_LBL (and if you prefer call it
>  > MONITOR_TIMER_EVENT_NAME) without the need for another function?
> 
> Done.  model_get_timer_event_name() has been removed from automata.h.
> In ha_monitor.h, EVENT_NONE_LBL is now overridable:
> 
>    #ifndef EVENT_NONE_LBL
>    #define EVENT_NONE_LBL "none"
>    #endif
> 
> tlob.c defines it before including the model header:
> 
>    #define EVENT_NONE_LBL "budget_exceeded"
> 
> The two call sites in ha_monitor.h that previously called
> model_get_timer_event_name() now use EVENT_NONE_LBL directly.
> 
> --- KUnit config / tristate ---
> 
>  > Do you need to add this here? Since you have a patch adding KUnit tests
>  > to tlob, cannot you put everything kunit-related there?
>  > I couldn't build it as module.
> 
> Agreed on moving the Kconfig entry to patch 09.
> 
> The module build issue is fixed by exporting the symbols needed by the
> test via EXPORT_SYMBOL_IF_KUNIT (EXPORTED_FOR_KUNIT_TESTING namespace);
> tlob_kunit.c imports the namespace with MODULE_IMPORT_NS.  We plan to
> keep tristate rather than changing to bool.  Does that work for you?

Yeah it's good as long as it works as module too.

I might have a look at making my patch module-ready, for now it just
can't work but I wonder if we can do something nicer to allow it
(like in your case a bunch of exports, a separate file and a standalone
testcase, perhaps all wrapped in some helper).

> 
> --- detail_env_tlob tracepoint ---
> 
>  > Since you are not documenting the detail_env_tlob tracepoint, is it
>  > something really required? I would at the very least document its usage.
> 
> Fair point.  detail_env_tlob emits (running_ns, waiting_ns, sleeping_ns)
> so the user can see which phase consumed the budget: high sleeping_ns
> indicates I/O latency, high waiting_ns indicates scheduler pressure, high
> running_ns indicates a compute overrun.  Without this breakdown the user
> only knows the total elapsed time exceeded the threshold, not why.
> 

Alright, then this can go into the docs.

> 
> --- Documentation ---
> 
>  > This is standard tracepoints usage, there's nothing about tlob we should
>  > document here.
>  > Same here, standard RV [for enable/desc tracefs files].
>  > And this is duplicating what mentioned above about uprobes, isn't it?
> 
> Agreed.  The following have been removed:
> 
>    - "Violation events" section: generic trace-cmd examples and cat-trace
>      instructions (standard tracepoints usage).
>    - tracefs files: "enable (rw)" and "desc (ro)" entries (standard RV).
>    - tracefs files: "monitor (rw)" description condensed to one line with
>      a cross-reference to the uprobes section above.
> 
> In their place, a new "Violation tracepoints" subsection documents both
> tlob-specific tracepoints with fields and a worked example:
> 
>    error_env_tlob: id, state, event ("budget_exceeded"), env ("clk_elapsed")
> 
>    detail_env_tlob: id, threshold_us, running_ns, waiting_ns, sleeping_ns
>      Use sleeping_ns to diagnose I/O latency, waiting_ns for scheduler
>      pressure, running_ns for compute overruns.
> 
>    Example:
>      trace-cmd record -e error_env_tlob -e detail_env_tlob &
>      # ... run workload ...
>      trace-cmd report

Yeah sounds good, also pointing out to enable the monitor.
We might think of a general way to do this kind of thing in
tools/rv, although detail_env_tlob is non-standard.

>  > Is kernel code going to use this API? RV monitors are meant to be
>  > enabled by userspace. What's the use-case here?
> 
> Agreed.  The uprobe interface is driven from userspace; tlob_start_task()
> and tlob_stop_task() are the internal implementation functions it calls,
> not a public API for external kernel modules.  The hypothetical
> kernel-module use case would be removed from the documentation; the
> kernel-doc block is retained for code maintainers.
> 
>  > That's probably a bit too detailed for this page. If you really want
>  > this information somewhere couldn't it stay in the code?
> 
> Agreed; moved to comments in handle_sched_switch() and
> handle_sched_wakeup().  The "Limitations" subsection is retained.
> 
> -- Patch 09: KUnit tests
> 
>  > What caught my eyes are tests enrolling tracepoints handlers. If you
>  > go there you're no longer doing unit testing, what's the advantage of
>  > testing the entire monitor here over doing that in selftests?
> 
> Agreed.  The three suites that register tracepoint handlers or create
> kthreads (tlob_sched_integration, tlob_trace_output, tlob_violation_react)
> have been removed from KUnit and will be added to selftests in v3.
> 
> Two pure unit test suites remain in KUnit:
> 
>    tlob_task_api:
>      Tests tlob_start_task / tlob_stop_task return values (-ENODEV,
>      -EALREADY, -ESRCH, -EOVERFLOW, -ENOSPC, -ERANGE) via direct calls
>      (these functions are the internal implementation used by both the
>      uprobe and, in future, the ioctl interface).
>      No tracepoints, no scheduling.
> 
>    tlob_uprobe_format:
>      Tests the uprobe line parser (tlob_parse_uprobe_line,
>      tlob_parse_remove_line) against valid and invalid input strings.
>      Pure string parsing; no scheduling, no tracepoints.
> 
> This also resolves the tristate-vs-bool issue: with only pure unit tests
> there is no dependency on sched_setscheduler_nocheck, so bool is correct.
> 

Yeah looks good.

> 
> --  Patch 10: selftests
> 
> --- PREEMPT_RT RCU stall ---
> 
>  > I run it on a VM and have it hanging at step 9 [...] rcu_preempt stall.
>  > Did you see that? Am I doing something wrong?
> 
> Thanks for reporting.  The patch changed ha_monitor.h from
> HRTIMER_MODE_REL_HARD (the existing upstream value) to REL_SOFT; the
> stall appeared on PREEMPT_RT after that change.  We have not fully
> confirmed whether REL_SOFT is the root cause — REL_SOFT defers the
> callback to the ktimers kthread, which could starve rcu_preempt under
> certain PREEMPT_RT configurations, but other factors may be involved.
> 
> We plan to revert to HRTIMER_MODE_REL_HARD at both sites in ha_monitor.h
> as the conservative choice:
> 
>    ha_setup_timer():     HRTIMER_MODE_REL_SOFT -> HRTIMER_MODE_REL_HARD
>    ha_start_timer_ns():  HRTIMER_MODE_REL_SOFT -> HRTIMER_MODE_REL_HARD
> 
> Do you have more insight into the stall, or does REL_HARD resolve it on
> your setup?

Right, good point, any specific reason why you wanted REL_SOFT?

I indeed always test under PREEMPT_RT but I still see the same splat
even after reverting to REL_HARD.
Could you reproduce it on your setup?

My config is nothing special: what vng gives you adding
PREEMPT_RT/RCU_PREEMPT and lockdep (PROVE_LOCKING/PROVE_RCU).

> 
> --- Selftest structure ---
> 
>  > This should be tested together with the other monitors (enable/disable),
>  > we could at most expand those with the check_requires.
>  > Let's focus on tlob-only features in this patch.
> 
> Agreed.  In v3 we plan to drop tracefs.tc (covered by the generic
> rv_monitor_enable_disable.tc) and keep only the six uprobe-specific
> test cases under test.d/tlob/
> 
> ioctl.tc is deferred with the ioctl interface to the follow-up series.
> The KUnit integration tests (sched_switch accounting, budget-expiry
> tracepoint) would be moved to selftests as additional test cases.
> 

Thanks,
Gabriele


^ permalink raw reply

* Re: [PATCH v3 2/2] spi: qcom-geni: Add trace events for Qualcomm GENI SPI driver
From: Konrad Dybcio @ 2026-05-19 11:34 UTC (permalink / raw)
  To: Mukesh Savaliya, Praveen Talari, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Mark Brown
  Cc: linux-kernel, linux-trace-kernel, linux-arm-msm, linux-spi,
	aniket.randive, chandana.chiluveru, jyothi.seerapu
In-Reply-To: <12d8da90-f092-449f-af6b-14b9dd851f1e@oss.qualcomm.com>

On 5/19/26 12:59 PM, Mukesh Savaliya wrote:
> Hi Praveen, one question below.
> 
> On 5/18/2026 10:30 PM, Praveen Talari wrote:
>> Add tracepoints to the Qualcomm GENI (Generic Interface) SPI driver.
>> These trace events enable runtime debugging and performance analysis
>> of SPI operations.
>>
>> The trace events capture SPI clock configuration, setup parameters,
>> transfer details, interrupt status.
> Don't you need trace logs around PM operations, i.e. runtime and system PM?

The PM core provides a couple, are those enough?

Konrad

^ permalink raw reply

* Re: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19 12:31 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: LKML, Linux Trace Kernel, bpf, Mathieu Desnoyers, Mark Rutland,
	Peter Zijlstra, Namhyung Kim, Takaya Saeki, Douglas Raillard,
	Tom Zanussi, Andrew Morton, Thomas Gleixner, Ian Rogers,
	Jiri Olsa
In-Reply-To: <20260519185302.5fb527085a64567a388f24f3@kernel.org>

On Tue, 19 May 2026 18:53:02 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> > If BTF is in the kernel, then instead, the $skbaddr can be typecast to
> > sk_buff and use the normal dereference logic.
> > 
> >   # echo 'e:xmit net.net_dev_xmit (sk_buff*)$skbaddr->dev->name:string' >> dynamic_events  
> 
> Ah, eprobes supports "$PARAM" to access its parameter by name.
> That is a bit complicated. Should we allow user to access
> parameter without '$' prefix for eprobes?

I guess.

> 
> >   # echo 1 > events/eprobes/xmit/enable
> >   # cat trace
> > [..]
> >     sshd-session-1022    [000] b..2.   860.249343: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.250061: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.250142: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.263553: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.283820: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.302716: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.322905: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.342828: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.362268: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.382335: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.400856: xmit: (net.net_dev_xmit) arg1="enp7s0"
> >     sshd-session-1022    [000] b..2.   860.419893: xmit: (net.net_dev_xmit) arg1="enp7s0"  
> 
> Looks very nice!
> 
> > 
> > The syntax is simply: ([STRUCT]*)(VAR)->FIELD[->FIELD..]  
> 
> Is the STRUCT optional?? (because [] means optional.) I guess no.

Oops, no, I was tired when I wrote this, and just put '[' and ']' to make
it a variable. But I wasn't consistent. I'll fix that to be:

  The syntax is simply: (STRUCT*)(VAR)->FIELD[->FIELD..]  

> 
> I think it may be possible to skip '*' (or make it optional)
> because this is not C-like typecasting: we don't support the "struct"
> reserved word, and it does not support white-spaces in each
> fetcharg. In this case, (STRUCT)VAR->FIELD should work.

I could remove the '*' as it doesn't support the "struct" C word.

> 
> BTW, I'm also considering supporting a new cast syntax, which allows
> us to dereference a pointer with "container_of". This is typically
> used in the kernel.
> 
> We usually see this pattern:
> 
> struct {
> 	unsigned long		data;
> 	struct list_head	list;
> } foo;
> 
> void callback(struct list_head *foo_list)
> {
> 	unsigned long data = container_of(foo_list, struct foo, list)->data;
> 	...
> }
> 
> To access @data, simple casting does not work. Thus we need a
> new syntax:
> 
> 	(STRUCT)(PTR,ASSIGN)->FIELD
> 
> So the above case, we can do:
> 
> 	data=(foo)(foo_list,list)->data

Hmm, it may be better to make it one parenthesis?

       (STRUCT,PTR,ASSIGN)->FIELD

       data=(foo,foo_list,list)->data

That would make it easier to differentiate between a simple "typecast" and
a container_of() by checking if the content between the parenthesis has a
comma.

Maybe even reorder it to:

       (PTR,STRUCT,ASSIGN)->FIELD

       data=(foo_list,foo,list)->data

to match the order of container_of():

      data = container_of(foo_list, struct foo, list)->data;

?

> 
> This naturally extends the type casting to support container_of()
> equivalent casting.
> 
> > 
> > Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
> > ---
> > Changes since v3: https://patch.msgid.link/20260518095832.52659a3a@gandalf.local.home
> > 
> >  *** COMPLETE REWRITE FROM V3 ***
> > 
> > - Rewrote it to use typecasting instead of simply replacing BTF names with
> >   offsets.
> > 
> >  Documentation/trace/kprobetrace.rst |   3 +
> >  kernel/trace/trace_probe.c          | 110 ++++++++++++++++++++++++----
> >  kernel/trace/trace_probe.h          |   3 +
> >  3 files changed, 100 insertions(+), 16 deletions(-)
> > 
> > diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
> > index 3b6791c17e9b..450ac646fe4c 100644
> > --- a/Documentation/trace/kprobetrace.rst
> > +++ b/Documentation/trace/kprobetrace.rst
> > @@ -54,6 +54,9 @@ Synopsis of kprobe_events
> >    $retval	: Fetch return value.(\*2)
> >    $comm		: Fetch current task comm.
> >    +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
> > +  (STRUCT*)FETCHARG->FIELD[->FIELD] : If BTF is supported, typecast FETCHARG to
> > +                  a pointer to STRUCT and then dereference the pointer defined by
> > +                  ->FIELD.
> >    \IMM		: Store an immediate value to the argument.
> >    NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
> >    FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
> > diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> > index e0d3a0da26af..b0829eb1cb52 100644
> > --- a/kernel/trace/trace_probe.c
> > +++ b/kernel/trace/trace_probe.c
> > @@ -464,6 +464,26 @@ static const char *fetch_type_from_btf_type(struct btf *btf,
> >  	return NULL;
> >  }
> >  
> > +static int query_btf_struct(const char *sname, struct traceprobe_parse_context *ctx)
> > +{
> > +	int id;
> > +
> > +	if (!ctx->btf) {
> > +		struct btf *btf;  
> 
> This needs an empty line here.

Sure.

For conditional blocks, I don't always add a newline, but this is your code
and I'll follow your suggestions.

> 
> > +		id = bpf_find_btf_id(sname, BTF_KIND_STRUCT, &btf);
> > +		if (id < 0)
> > +			return -EINVAL;  
> 
> Why don't you return id (it has corresponding errno)?

Because I forgot to ;-)

> 
> > +		ctx->btf = btf;
> > +	} else {
> > +		id = btf_find_by_name_kind(ctx->btf, sname, BTF_KIND_STRUCT);
> > +		if (id < 0)
> > +			return -EINVAL;  
> 
> Ditto.
> 
> > +	}
> > +
> > +	ctx->last_struct = btf_type_by_id(ctx->btf, id);
> > +	return 0;
> > +}
> > +
> >  static int query_btf_context(struct traceprobe_parse_context *ctx)
> >  {
> >  	const struct btf_param *param;
> > @@ -471,12 +491,12 @@ static int query_btf_context(struct traceprobe_parse_context *ctx)
> >  	struct btf *btf;
> >  	s32 nr;
> >  
> > -	if (ctx->btf)
> > -		return 0;
> > -
> >  	if (!ctx->funcname)
> >  		return -EINVAL;
> >  
> > +	if (ctx->btf)
> > +		return 0;
> > +  
> 
> Could you tell me why this order is changed?
> I think this type casting will allow us to skip checking funcname
> because btf context is already specified.

I wanted this to fail if btf was already set but funcname wasn't, because
this should only be called for functions.

> 
> Ah, BTW, we may need to use a special struct btf* for type
> casting. If the target function is in a module and the
> casting type is defined in vmlinux, those are stored in
> the different places...

OK, I'll make a separate btf for it then. I'll have to make sure the btf
used for parsing knows which one to use. Shouldn't be too hard if we check
for the STRUCT flag in the ctx->flags.

> 
> 
> for example,
> 
>  p funcA (foo)$arg1->bar buz
> 
> In this case, buz needs to use BTF including funcA.
> Maybe we need to introduce ctx->func_btf, which resets ctx->btf
> in traceprobe_parse_probe_arg_body() where parse_probe_arg()
> is calling, e.g.
> 
> 	ctx->last_type = NULL;
> +	if (ctx->btf)
> +		btf_put(ctx->btf);
> +	ctx->btf = ctx->func_btf;
> 	ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1],
> 			      ctx);
> 
> 
> >  	type = btf_find_func_proto(ctx->funcname, &btf);
> >  	if (!type)
> >  		return -ENOENT;
> > @@ -514,6 +534,7 @@ static void clear_btf_context(struct traceprobe_parse_context *ctx)
> >  		ctx->proto = NULL;
> >  		ctx->params = NULL;
> >  		ctx->nr_params = 0;
> > +		ctx->last_struct = NULL;
> >  	}
> >  }
> >  
> > @@ -554,22 +575,28 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
> >  	struct fetch_insn *code = *pcode;
> >  	const struct btf_member *field;
> >  	u32 bitoffs, anon_offs;
> > +	bool is_struct = ctx->flags & TPARG_FL_STRUCT;
> >  	char *next;
> >  	int is_ptr;
> >  	s32 tid;
> >  
> >  	do {
> > -		/* Outer loop for solving arrow operator ('->') */
> > -		if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
> > -			trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
> > -			return -EINVAL;
> > -		}
> > -		/* Convert a struct pointer type to a struct type */
> > -		type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
> > -		if (!type) {
> > -			trace_probe_log_err(ctx->offset, BAD_BTF_TID);
> > -			return -EINVAL;
> > +		if (!is_struct) {
> > +			/* Outer loop for solving arrow operator ('->') */
> > +			if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
> > +				trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
> > +				return -EINVAL;
> > +			}
> > +
> > +			/* Convert a struct pointer type to a struct type */
> > +			type = btf_type_skip_modifiers(ctx->btf, type->type, &tid);
> > +			if (!type) {
> > +				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
> > +				return -EINVAL;
> > +			}
> >  		}
> > +		/* Only the first type can skip being a pointer */
> > +		is_struct = false;
> >  
> >  		bitoffs = 0;
> >  		do {
> > @@ -635,12 +662,12 @@ static int parse_btf_arg(char *varname,
> >  {
> >  	struct fetch_insn *code = *pcode;
> >  	const struct btf_param *params;
> > -	const struct btf_type *type;
> > +	const struct btf_type *type = NULL;
> >  	char *field = NULL;
> >  	int i, is_ptr, ret;
> >  	u32 tid;
> >  
> > -	if (WARN_ON_ONCE(!ctx->funcname))
> > +	if (WARN_ON_ONCE(!ctx->funcname && !(ctx->flags & TPARG_FL_STRUCT)))
> >  		return -EINVAL;
> >  
> >  	is_ptr = split_next_field(varname, &field, ctx);
> > @@ -704,11 +731,18 @@ static int parse_btf_arg(char *varname,
> >  			goto found;
> >  		}
> >  	}
> > +
> > +	if (ctx->flags & TPARG_FL_STRUCT) {
> > +		type = ctx->last_struct;
> > +		goto found;  
> 
> I would rather jump to a type_found: label instead of
> checking !type. (Or, save tid instead of type.)
> 

OK.

> > +	}
> > +
> >  	trace_probe_log_err(ctx->offset, NO_BTFARG);
> >  	return -ENOENT;
> >  
> >  found:
> > -	type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
> > +	if (!type)
> > +		type = btf_type_skip_modifiers(ctx->btf, tid, &tid);  
> 
> type_found:
> 
> >  	if (!type) {
> >  		trace_probe_log_err(ctx->offset, BAD_BTF_TID);
> >  		return -EINVAL;
> > @@ -952,6 +986,12 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
> >  	int ret = 0;
> >  	int len;
> >  
> > +	if (ctx->flags & TPARG_FL_STRUCT) {
> > +		ret = parse_btf_arg(orig_arg, pcode, end, ctx);
> > +		if (ret < 0)
> > +			return ret;
> > +	}
> > +
> >  	if (ctx->flags & TPARG_FL_TEVENT) {
> >  		if (code->data)
> >  			return -EFAULT;
> > @@ -1231,6 +1271,43 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
> >  				code->op = FETCH_OP_IMM;
> >  		}
> >  		break;
> > +	case '(':
> > +		tmp = strrchr(arg, ')');  
> 
> OK, in this step, we don't support nested cast etc. so this works.
> 
> > +		if (!tmp) {
> > +			trace_probe_log_err(ctx->offset + strlen(arg),
> > +					    DEREF_OPEN_BRACE);
> > +			return -EINVAL;
> > +		}
> > +
> > +		tmp--;
> > +		if (*tmp != '*') {
> > +			trace_probe_log_err(ctx->offset + (tmp - arg),
> > +					    NO_PTR_STRCT);
> > +			return -EINVAL;
> > +		}  
> 
> So I think this can be optional, not an error.

I'll just remove it.

> 
> > +		*tmp = '\0';
> > +		ret = query_btf_struct(arg + 1, ctx);
> > +		*tmp = '*';
> > +
> > +		if (ret < 0) {
> > +			trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
> > +			return -EINVAL;
> > +		}
> > +
> > +		ctx->flags |= TPARG_FL_STRUCT;
> > +		tmp += 2;
> > +
> > +		if (*tmp != '$') {
> > +			trace_probe_log_err(ctx->offset + (tmp - arg),
> > +					    BAD_VAR);
> > +			return -EINVAL;
> > +		}  
> 
> Ok, this limitation will be removed afterwards.

Yeah.

Thanks for reviewing.

-- Steve

^ permalink raw reply

* Re: [PATCH v2 03/14] tools/rv: Fix exit status when monitor execution fails
From: Gabriele Monaco @ 2026-05-19 12:32 UTC (permalink / raw)
  To: Nam Cao
  Cc: Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang,
	linux-kernel, linux-trace-kernel, Steven Rostedt
In-Reply-To: <874ik586ex.fsf@yellow.woof>

On Mon, 2026-05-18 at 10:32 +0200, Nam Cao wrote:
> Gabriele Monaco <gmonaco@redhat.com> writes:
> > +	exit(run <= 0);
> 
> Probably better to stick to the C standard:
> 
>     exit(run > 0 ? EXIT_SUCCESS : EXIT_FAILURE)
> 
> but whatever.
> 
> Reviewed-by: Nam Cao <namcao@linutronix.de>

Good point, that's far more readable. I will apply it and perhaps also use
those macros for the other 0/1 values in the file.

Thanks,
Gabriele


^ permalink raw reply

* Re: [PATCH v4] tracing/probes: Allow use of BTF names to dereference pointers
From: Steven Rostedt @ 2026-05-19 12:36 UTC (permalink / raw)
  To: sashiko-bot
  Cc: sashiko-reviews, bpf, LKML, Linux Trace Kernel, Masami Hiramatsu
In-Reply-To: <20260519040928.458C4C2BCB3@smtp.kernel.org>

On Tue, 19 May 2026 04:09:27 +0000
sashiko-bot@kernel.org wrote:

> Thank you for your contribution! Sashiko AI review found 5 potential issue(s) to consider:
> - [High] NULL pointer dereference in `parse_btf_arg()` when typecasting `$retval`
> - [High] Instruction stream corruption when typecasting BTF variables with field dereferences
> - [High] Parser state corruption breaks multi-argument probes and standard BTF parameter resolution
> - [Medium] Hardcoded syntax check strictly prohibits typecasting hardware registers and standard BTF arguments
> - [Medium] Using `strrchr` causes parsing failures for memory dereferences containing parentheses
> --
> 
> commit 8ce6119a14206ed21706a773d9694dc24e53436e
> Author: Steven Rostedt <rostedt@goodmis.org>
> 
> tracing/probes: Allow use of BTF names to dereference pointers
> 
> This commit adds syntax to FETCHARG parsing to allow typecasting a value
> to a pointer to a structure. This allows users to rely on BTF to figure out
> member offsets instead of manually finding and hardcoding numerical offsets.
> 
> > But for event probes, or generic kprobes that record a register that
> > happens to be a pointer to a structure, they cannot dereference these
> > values with BTF naming, but must use numerical offsets.  
> 
> Does the code actually support dereferencing a register here? Hardware
> registers (like %rdi) and standard BTF arguments (like skb) do not begin
> with a $ character, but the parser explicitly enforces that the cast is
> followed by a $.

Heh, it doesn't support it yet. But I plan on doing so. I'll either add the
support or fix the change log.

> 
> > diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
> > index e1c73065dae51..29b5a99074d76 100644
> > --- a/kernel/trace/trace_probe.c
> > +++ b/kernel/trace/trace_probe.c  
> [ ... ]
> > @@ -471,12 +491,12 @@ static int query_btf_context(struct traceprobe_parse_context *ctx)
> >  	struct btf *btf;
> >  	s32 nr;
> >  
> > -	if (ctx->btf)
> > -		return 0;
> > -
> >  	if (!ctx->funcname)
> >  		return -EINVAL;
> >  
> > +	if (ctx->btf)
> > +		return 0;
> > +  
> 
> If a user creates a probe with a struct cast for the return value (like
> (struct task_struct*)$retval), query_btf_struct() resolves the struct and
> sets ctx->btf but leaves ctx->proto as NULL.
> 
> When parse_btf_arg() later handles $retval, it calls query_btf_context(),
> which will return 0 here because ctx->btf is already set. Does this lead
> to parse_btf_arg() unconditionally dereferencing a NULL ctx->proto?
> 
> Additionally, for multi-argument probes where the first argument is a cast
> (like arg1=(struct foo*)$arg1 arg2=skb), setting ctx->btf for the first
> argument causes this check to return early for the second argument. Could
> this skip populating ctx->params and ctx->nr_params, causing valid BTF
> arguments to be rejected?

Yeah, this will be fixed by Masami's suggestion of using a different btf
for the structure.

> 
> [ ... ]
> > @@ -952,6 +986,12 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
> >  	int ret = 0;
> >  	int len;
> >  
> > +	if (ctx->flags & TPARG_FL_STRUCT) {
> > +		ret = parse_btf_arg(orig_arg, pcode, end, ctx);
> > +		if (ret < 0)
> > +			return ret;
> > +	}
> > +  
> 
> When parse_btf_arg() successfully parses a typecast with field dereferences,
> it emits a FETCH_OP_DEREF instruction and advances the *pcode pointer.
> 
> Since this block doesn't return upon success, execution continues and matches
> the base variable name later in parse_probe_vars(). The local code pointer
> is not updated to reflect the advanced *pcode, so it writes a new instruction
> (like FETCH_OP_ARG) into the same slot.
> 
> Will this overwrite the FETCH_OP_DEREF instruction and corrupt the trace probe
> bytecode?

Good catch. I need to move the code = *pcode; to after that if block.

> 
> [ ... ]
> > @@ -1231,6 +1271,43 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
> >  				code->op = FETCH_OP_IMM;
> >  		}
> >  		break;
> > +	case '(':
> > +		tmp = strrchr(arg, ')');  
> 
> If a user attempts to cast a standard memory dereference that itself
> contains parentheses (such as (struct foo*)+0($arg1)), strrchr() will
> find the final parenthesis of $arg1).

Ah, I should have used strchr(). Thanks.

> 
> Will this cause the pointer validation logic below to evaluate the wrong
> segment of the string and incorrectly fail with NO_PTR_STRCT?
> 
> > +		if (!tmp) {
> > +			trace_probe_log_err(ctx->offset + strlen(arg),
> > +					    DEREF_OPEN_BRACE);
> > +			return -EINVAL;
> > +		}
> > +
> > +		tmp--;
> > +		if (*tmp != '*') {
> > +			trace_probe_log_err(ctx->offset + (tmp - arg),
> > +					    NO_PTR_STRCT);
> > +			return -EINVAL;
> > +		}
> > +		*tmp = '\0';
> > +		ret = query_btf_struct(arg + 1, ctx);
> > +		*tmp = '*';
> > +
> > +		if (ret < 0) {
> > +			trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
> > +			return -EINVAL;
> > +		}
> > +
> > +		ctx->flags |= TPARG_FL_STRUCT;
> > +		tmp += 2;
> > +
> > +		if (*tmp != '$') {  
> 
> Is this strict check for '$' intentional?
> 
> As mentioned regarding the commit message, this prevents typecasting hardware
> registers and standard BTF arguments since they do not start with a '$',
> effectively restricting the feature to $retval, $stack, and $argN.

It was the only thing I tested. I'll remove it and test other options.

Thanks,

-- Steve

> 
> > +			trace_probe_log_err(ctx->offset + (tmp - arg),
> > +					    BAD_VAR);
> > +			return -EINVAL;
> > +		}
> > +
> > +		ctx->offset += tmp - arg;
> > +		ret = parse_probe_vars(tmp, type, pcode, end, ctx);
> > +		ctx->flags &= ~TPARG_FL_STRUCT;
> > +		ctx->last_struct = NULL;
> > +		break;  
> 


^ permalink raw reply

* Re: [PATCH 2/3] rv/rtapp/sleep: Update nanosleep rule
From: Gabriele Monaco @ 2026-05-19 12:58 UTC (permalink / raw)
  To: Nam Cao; +Cc: Steven Rostedt, linux-kernel, linux-trace-kernel
In-Reply-To: <a2175a6647bb88797ad8275eddb4b20b749474ec.1779176466.git.namcao@linutronix.de>

On Tue, 2026-05-19 at 09:49 +0200, Nam Cao wrote:
> CLOCK_REALTIME is the only clock that is often misused in real-time
> applications. The other clocks either are safe for real-time uses
> (CLOCK_TAI, CLOCK_MONOTONIC, CLOCK_BOOTTIME) or are unlikely to be misused
> (CLOCK_AUX, CLOCK_PROCESS_CPUTIME_ID).
> 
> The rtapp monitor's purpose is warning people about common mistakes with
> real-time design. However, warning about all clock types generates too many
> false positives.

I'm fine with the change, but are those really false positives?

From what I understand, before this change we would report any non-rt-friendly
clock type, and now we report only CLOCK_REALTIME.
What we are skipping would still be true positives, just too uncommon to
justify the extra complexity, right?

Or do you mean an RT task using CLOCK_AUX is a false positive because likely the
users weren't even trying to do real-time?

Thanks,
Gabriele

> 
> Update the monitor to only warn about CLOCK_REALTIME.
> 
> Signed-off-by: Nam Cao <namcao@linutronix.de>
> ---
>  kernel/trace/rv/monitors/sleep/sleep.c    | 10 ++---
>  kernel/trace/rv/monitors/sleep/sleep.h    | 52 +++++++++++------------
>  tools/verification/models/rtapp/sleep.ltl |  2 +-
>  3 files changed, 28 insertions(+), 36 deletions(-)
> 
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.c
> b/kernel/trace/rv/monitors/sleep/sleep.c
> index 0a36f5519e6b..e01ac56b3f4a 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.c
> +++ b/kernel/trace/rv/monitors/sleep/sleep.c
> @@ -43,9 +43,7 @@ static void ltl_atoms_init(struct task_struct *task, struct
> ltl_monitor *mon, bo
>  	ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false);
>  
>  	if (task_creation) {
> -		ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
> -		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
> -		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
> +		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, false);
>  		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
>  		ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
>  		ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
> @@ -136,8 +134,7 @@ static void handle_sys_enter(void *data, struct pt_regs
> *regs, long id)
>  	case __NR_clock_nanosleep_time64:
>  #endif
>  		syscall_get_arguments(current, regs, args);
> -		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] ==
> CLOCK_MONOTONIC);
> -		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] ==
> CLOCK_TAI);
> +		ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, args[0] ==
> CLOCK_REALTIME);
>  		ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] ==
> TIMER_ABSTIME);
>  		ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
>  		break;
> @@ -178,8 +175,7 @@ static void handle_sys_exit(void *data, struct pt_regs
> *regs, long ret)
>  
>  	ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
>  	ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
> -	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
> -	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
> +	ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_REALTIME, false);
>  	ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
>  	ltl_atom_set(mon, LTL_EPOLL_WAIT, false);
>  	ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
> diff --git a/kernel/trace/rv/monitors/sleep/sleep.h
> b/kernel/trace/rv/monitors/sleep/sleep.h
> index 95dc2727c059..ed1ac7ad008e 100644
> --- a/kernel/trace/rv/monitors/sleep/sleep.h
> +++ b/kernel/trace/rv/monitors/sleep/sleep.h
> @@ -20,8 +20,7 @@ enum ltl_atom {
>  	LTL_FUTEX_WAIT,
>  	LTL_KERNEL_THREAD,
>  	LTL_KTHREAD_SHOULD_STOP,
> -	LTL_NANOSLEEP_CLOCK_MONOTONIC,
> -	LTL_NANOSLEEP_CLOCK_TAI,
> +	LTL_NANOSLEEP_CLOCK_REALTIME,
>  	LTL_NANOSLEEP_TIMER_ABSTIME,
>  	LTL_RT,
>  	LTL_SLEEP,
> @@ -46,8 +45,7 @@ static const char *ltl_atom_str(enum ltl_atom atom)
>  		"fu_wa",
>  		"ker_th",
>  		"kth_sh_st",
> -		"na_cl_mo",
> -		"na_cl_ta",
> +		"na_cl_re",
>  		"na_ti_ab",
>  		"rt",
>  		"sl",
> @@ -87,8 +85,7 @@ static void ltl_start(struct task_struct *task, struct
> ltl_monitor *mon)
>  	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
> -	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> +	bool nanosleep_clock_realtime =
> test_bit(LTL_NANOSLEEP_CLOCK_REALTIME, mon->atoms);
>  	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
>  	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> @@ -97,17 +94,17 @@ static void ltl_start(struct task_struct *task, struct
> ltl_monitor *mon)
>  	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
>  	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
>  	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val42 = task_is_rcu || task_is_migration;
> -	bool val43 = futex_lock_pi || val42;
> -	bool val5 = block_on_rt_mutex || val43;
> -	bool val34 = abort_sleep || kthread_should_stop;
> -	bool val35 = woken_by_nmi || val34;
> -	bool val36 = woken_by_hardirq || val35;
> -	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val41 = task_is_rcu || task_is_migration;
> +	bool val42 = futex_lock_pi || val41;
> +	bool val5 = block_on_rt_mutex || val42;
> +	bool val33 = abort_sleep || kthread_should_stop;
> +	bool val34 = woken_by_nmi || val33;
> +	bool val35 = woken_by_hardirq || val34;
> +	bool val14 = woken_by_equal_or_higher_prio || val35;
>  	bool val13 = !wake;
> -	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool val27 = nanosleep_timer_abstime && val26;
> -	bool val18 = clock_nanosleep && val27;
> +	bool val25 = !nanosleep_clock_realtime;
> +	bool val26 = nanosleep_timer_abstime && val25;
> +	bool val18 = clock_nanosleep && val26;
>  	bool val20 = val18 || epoll_wait;
>  	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> @@ -138,8 +135,7 @@ ltl_possible_next_states(struct ltl_monitor *mon, unsigned
> int state, unsigned l
>  	bool sleep = test_bit(LTL_SLEEP, mon->atoms);
>  	bool rt = test_bit(LTL_RT, mon->atoms);
>  	bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME,
> mon->atoms);
> -	bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon-
> >atoms);
> -	bool nanosleep_clock_monotonic =
> test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
> +	bool nanosleep_clock_realtime =
> test_bit(LTL_NANOSLEEP_CLOCK_REALTIME, mon->atoms);
>  	bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon-
> >atoms);
>  	bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
>  	bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
> @@ -148,17 +144,17 @@ ltl_possible_next_states(struct ltl_monitor *mon,
> unsigned int state, unsigned l
>  	bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
>  	bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
>  	bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
> -	bool val42 = task_is_rcu || task_is_migration;
> -	bool val43 = futex_lock_pi || val42;
> -	bool val5 = block_on_rt_mutex || val43;
> -	bool val34 = abort_sleep || kthread_should_stop;
> -	bool val35 = woken_by_nmi || val34;
> -	bool val36 = woken_by_hardirq || val35;
> -	bool val14 = woken_by_equal_or_higher_prio || val36;
> +	bool val41 = task_is_rcu || task_is_migration;
> +	bool val42 = futex_lock_pi || val41;
> +	bool val5 = block_on_rt_mutex || val42;
> +	bool val33 = abort_sleep || kthread_should_stop;
> +	bool val34 = woken_by_nmi || val33;
> +	bool val35 = woken_by_hardirq || val34;
> +	bool val14 = woken_by_equal_or_higher_prio || val35;
>  	bool val13 = !wake;
> -	bool val26 = nanosleep_clock_monotonic || nanosleep_clock_tai;
> -	bool val27 = nanosleep_timer_abstime && val26;
> -	bool val18 = clock_nanosleep && val27;
> +	bool val25 = !nanosleep_clock_realtime;
> +	bool val26 = nanosleep_timer_abstime && val25;
> +	bool val18 = clock_nanosleep && val26;
>  	bool val20 = val18 || epoll_wait;
>  	bool val9 = futex_wait || val20;
>  	bool val11 = val9 || kernel_thread;
> diff --git a/tools/verification/models/rtapp/sleep.ltl
> b/tools/verification/models/rtapp/sleep.ltl
> index 6f26c4810f78..2637bc48a620 100644
> --- a/tools/verification/models/rtapp/sleep.ltl
> +++ b/tools/verification/models/rtapp/sleep.ltl
> @@ -9,7 +9,7 @@ RT_VALID_SLEEP_REASON = FUTEX_WAIT
>  
>  RT_FRIENDLY_NANOSLEEP = CLOCK_NANOSLEEP
>                      and NANOSLEEP_TIMER_ABSTIME
> -                    and (NANOSLEEP_CLOCK_MONOTONIC or NANOSLEEP_CLOCK_TAI)
> +                    and not NANOSLEEP_CLOCK_REALTIME
>  
>  RT_FRIENDLY_WAKE = WOKEN_BY_EQUAL_OR_HIGHER_PRIO
>                  or WOKEN_BY_HARDIRQ


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox