* [PATCH v5 7/7] tracing/probes: Add a new testcase for BTF typecasts
From: Masami Hiramatsu (Google) @ 2026-06-17 1:03 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178165816303.269421.7302603996990753309.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
With the introduction of container_of-style BTF typecasting and
per-CPU variable access support in trace probes, we need a way to
verify their functionality and prevent regressions.
Add a new ftrace kselftest and update the trace event sample module
to test and validate these features.
Specifically, update the trace-events-sample module to set up a
periodic timer whose callback accesses a per-CPU counter. Introduce
a new sample trace event, foo_timer_fn, to trace this callback
and log the current counter value.
Then, add a new test case, btf_probe_event.tc, which defines a
dynamic probe on the timer callback. The probe uses BTF typecasting
to recover the parent structure from the timer argument and
this_cpu_read() to fetch the per-CPU counter. The test verifies
the integrity of the implementation by ensuring the values
recorded by the dynamic probe match those from the static tracepoint.
Assisted-by: Antigravity:gemini-3.5-flash
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v5:
- Add more syntax test cases.
Changes in v4:
- Fix uprobe $current test.
Changes in v3:
- Add syntax test case.
- Update testcase to use this_cpu_read()
Changes in v2:
- Use timer_shutdown_sync() instead of timer_delete_sync() for teardown.
---
samples/trace_events/trace-events-sample.c | 40 +++++++++++++++-
samples/trace_events/trace-events-sample.h | 34 ++++++++++++-
.../ftrace/test.d/dynevent/btf_probe_event.tc | 51 ++++++++++++++++++++
.../ftrace/test.d/dynevent/fprobe_syntax_errors.tc | 11 ++++
.../ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 11 ++++
.../ftrace/test.d/kprobe/uprobe_syntax_errors.tc | 5 ++
6 files changed, 147 insertions(+), 5 deletions(-)
create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 0b7a6efdb247..ca5d98c360cb 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -94,6 +94,20 @@ static int simple_thread_fn(void *arg)
static DEFINE_MUTEX(thread_mutex);
static int simple_thread_cnt;
+static struct foo_timer_data *foo_timer_data;
+
+static void sample_timer_cb(struct timer_list *t)
+{
+ struct foo_timer_data *data = container_of(t, struct foo_timer_data, timer);
+
+ get_cpu();
+ trace_foo_timer_fn(data);
+ (*this_cpu_ptr(data->counter))++;
+ put_cpu();
+
+ mod_timer(t, jiffies + HZ);
+}
+
int foo_bar_reg(void)
{
mutex_lock(&thread_mutex);
@@ -132,9 +146,27 @@ void foo_bar_unreg(void)
static int __init trace_event_init(void)
{
+ foo_timer_data = kzalloc_obj(*foo_timer_data, GFP_KERNEL);
+ if (!foo_timer_data)
+ return -ENOMEM;
+
+ foo_timer_data->name = "sample_timer_counter";
+ foo_timer_data->counter = alloc_percpu(int);
+ if (!foo_timer_data->counter) {
+ kfree(foo_timer_data);
+ return -ENOMEM;
+ }
+
+ timer_setup(&foo_timer_data->timer, sample_timer_cb, 0);
+ mod_timer(&foo_timer_data->timer, jiffies + HZ);
+
simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
- if (IS_ERR(simple_tsk))
- return -1;
+ if (IS_ERR(simple_tsk)) {
+ timer_shutdown_sync(&foo_timer_data->timer);
+ free_percpu(foo_timer_data->counter);
+ kfree(foo_timer_data);
+ return PTR_ERR(simple_tsk);
+ }
return 0;
}
@@ -147,6 +179,10 @@ static void __exit trace_event_exit(void)
kthread_stop(simple_tsk_fn);
simple_tsk_fn = NULL;
mutex_unlock(&thread_mutex);
+
+ timer_shutdown_sync(&foo_timer_data->timer);
+ free_percpu(foo_timer_data->counter);
+ kfree(foo_timer_data);
}
module_init(trace_event_init);
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 1a05fc153353..816848a456a2 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -247,12 +247,14 @@
*/
/*
- * It is OK to have helper functions in the file, but they need to be protected
- * from being defined more than once. Remember, this file gets included more
- * than once.
+ * It is OK to have helper functions and data structures in the file, but they
+ * need to be protected from being defined more than once. Remember, this file
+ * gets included more than once.
*/
#ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
#define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
+#include <linux/timer.h>
+
static inline int __length_of(const int *list)
{
int i;
@@ -270,6 +272,13 @@ enum {
TRACE_SAMPLE_BAR = 4,
TRACE_SAMPLE_ZOO = 8,
};
+
+struct foo_timer_data {
+ const char *name;
+ struct timer_list timer;
+ int __percpu *counter;
+};
+
#endif
/*
@@ -595,6 +604,25 @@ TRACE_EVENT(foo_rel_loc,
__get_rel_bitmask(bitmask),
__get_rel_cpumask(cpumask))
);
+
+TRACE_EVENT(foo_timer_fn,
+
+ TP_PROTO(struct foo_timer_data *data),
+
+ TP_ARGS(data),
+
+ TP_STRUCT__entry(
+ __string( name, data->name )
+ __field( int, count )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name);
+ __entry->count = *this_cpu_ptr(data->counter);
+ ),
+
+ TP_printk("name=%s count=%d", __get_str(name), __entry->count)
+);
#endif
/***** NOTICE! The #if protection ends here. *****/
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc b/tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc
new file mode 100644
index 000000000000..96791e120b7d
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc
@@ -0,0 +1,51 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: BTF event with typecast and percpu access
+# requires: dynamic_events "this_cpu_read(<fetcharg>)":README "[(structname[,field])]<argname>[->field[->field|.field...]]":README
+
+# Check if the sample module is loaded
+if ! lsmod | grep -q trace_events_sample; then
+ modprobe trace-events-sample || exit_unsupported
+fi
+
+echo 0 > events/enable
+echo > dynamic_events
+
+# The sample_timer_cb(struct timer_list *t) is called.
+# We want to check (STRUCT,FIELD)VAR typecast and this_cpu_read() access.
+# (foo_timer_data,timer)t converts t to struct foo_timer_data * using container_of.
+# data->counter is a per-cpu pointer to int.
+# this_cpu_read(data->counter) should give the value of the counter.
+
+echo 'f:mysample/myevent sample_timer_cb name=(foo_timer_data,timer)t->name:string count=this_cpu_read((foo_timer_data,timer)t->counter)' >> dynamic_events
+
+echo 1 > events/mysample/myevent/enable
+echo 1 > events/sample-trace/foo_timer_fn/enable
+
+sleep 2
+
+echo 0 > events/mysample/myevent/enable
+echo 0 > events/sample-trace/foo_timer_fn/enable
+
+# Compare the values.
+MATCH=0
+while read line; do
+ if echo $line | grep -q "foo_timer_fn:"; then
+ NAME=`echo $line | sed 's/.*name=\([^ ]*\) .*/\1/'`
+ COUNT=`echo $line | sed 's/.*count=\([^ ]*\).*/\1/'`
+ if grep -q "myevent:.*name=\"${NAME}\" count=$COUNT" trace; then
+ MATCH=$((MATCH+1))
+ fi
+ fi
+done < trace
+
+if [ $MATCH -eq 0 ]; then
+ echo "No matching events found"
+ exit_fail
+fi
+
+# Clean up
+echo 0 > events/mysample/myevent/enable
+echo 0 > events/sample-trace/foo_timer_fn/enable
+echo > dynamic_events
+clear_trace
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
index fee479295e2f..0402d0271bfe 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
@@ -112,6 +112,17 @@ check_error 'f vfs_read%return $retval->^foo' # NO_PTR_STRCT
check_error 'f vfs_read file->^foo' # NO_BTF_FIELD
check_error 'f vfs_read file^-.foo' # BAD_HYPHEN
check_error 'f vfs_read ^file:string' # BAD_TYPE4STR
+if grep -qF "[(structname" README ; then
+check_error 'f vfs_read arg1=(task_struct)file^' # TYPECAST_REQ_FIELD
+check_error 'f vfs_read arg1=(f)((f)((f)((f)^((f)file->f)->f)->f)->f)->f' # TOO_MANY_NESTED
+check_error 'f vfs_read arg1=(task_struct,^in_execve)file->comm' # TYPECAST_NOT_ALIGNED
+check_error 'f vfs_read arg1=(task_struct,^foo_bar)file:u8' # NO_BTF_FIELD
+check_error 'f vfs_read arg1=(^task_struct1234)file:u8' # NO_PTR_STRCT
+check_error 'f vfs_read arg1=(task_struct,se^->group_node)file->comm' # TYPECAST_BAD_ARROW
+check_error 'f vfs_read arg1=(task_struct,^->pid)file->comm' # NO_BTF_FIELD
+check_error 'f vfs_read arg1=(task_struct,^.pid)file->comm' # NO_BTF_FIELD
+check_error 'f vfs_read arg1=(task_struct,^.)file->comm' # NO_BTF_FIELD
+fi
fi
else
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
index 8f1c58f0c239..742b342e4930 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
@@ -115,6 +115,17 @@ check_error 'p vfs_read+20 ^$arg*' # NOFENTRY_ARGS
check_error 'p vfs_read ^hoge' # NO_BTFARG
check_error 'p kfree ^$arg10' # NO_BTFARG (exceed the number of parameters)
check_error 'r kfree ^$retval' # NO_RETVAL
+if grep -qF "[(structname" README ; then
+check_error 'p vfs_read arg1=(task_struct)file^' # TYPECAST_REQ_FIELD
+check_error 'p vfs_read arg1=(f)((f)((f)((f)^((f)file->f)->f)->f)->f)->f' # TOO_MANY_NESTED
+check_error 'p vfs_read arg1=(task_struct,^in_execve)file->comm' # TYPECAST_NOT_ALIGNED
+check_error 'p vfs_read arg1=(task_struct,^foo_bar)file:u8' # NO_BTF_FIELD
+check_error 'p vfs_read arg1=(^task_struct1234)file:u8' # NO_PTR_STRCT
+check_error 'p vfs_read arg1=(task_struct,se^->group_node)file->comm' # TYPECAST_BAD_ARROW
+check_error 'p vfs_read arg1=(task_struct,^->pid)file->comm' # NO_BTF_FIELD
+check_error 'p vfs_read arg1=(task_struct,^.pid)file->comm' # NO_BTF_FIELD
+check_error 'p vfs_read arg1=(task_struct,^.)file->comm' # NO_BTF_FIELD
+fi
else
check_error 'p vfs_read ^$arg*' # NOSUP_BTFARG
fi
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc
index c817158b99db..e12dc967ec76 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc
@@ -28,4 +28,9 @@ if grep -q ".*symstr.*" README; then
check_error 'p /bin/sh:10 $stack0:^symstr' # BAD_TYPE
fi
+# $current is not supported by uprobe
+if grep -q "\$current.*" README; then
+check_error 'p /bin/sh:10 ^$current:u8' # BAD_VAR
+fi
+
exit 0
^ permalink raw reply related
* [PATCH v5 6/7] tracing/probes: Add this_cpu_read() and this_cpu_ptr() dereference method to fetcharg
From: Masami Hiramatsu (Google) @ 2026-06-17 1:03 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178165816303.269421.7302603996990753309.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
When tracing the kernel local variables, sometimes we need to get the
CPU local variables. To access it, current simple dereference is not
enough.
Thus, introduce a special this_cpu_read() dereference to access per-cpu
variable for the current CPU (accessing other CPU variable may race with
updates on other CPUs). Also this_cpu_ptr() is for accessing per-cpu
pointer.
Those are working as same as the kernel percpu macro.
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v5:
- Simplify this_cpu_read() into +0(this_cpu_ptr()).
Changes in v3:
- Remove NULL check for percpu var because it is just an offset, could be 0.
- Simplify process_fetch_insn_bottom() code.
- If the last operation is this_cpu_read(), read only memory of the specific
size (of type).
Changes in v2:
- Drop +CPU/+PCPU and introduce this_cpu_read() and this_cpu_ptr().
- Support these method with BTF typecast.
- Just check the base address is NOT NULL instead of is_kernel_percpu_address().
---
Documentation/trace/eprobetrace.rst | 2
Documentation/trace/fprobetrace.rst | 2
Documentation/trace/kprobetrace.rst | 2
kernel/trace/trace.c | 1
kernel/trace/trace_probe.c | 151 +++++++++++++++++++++++++----------
kernel/trace/trace_probe.h | 1
kernel/trace/trace_probe_tmpl.h | 22 ++++-
7 files changed, 132 insertions(+), 49 deletions(-)
diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index 680e0af43d5d..279396951b34 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -39,6 +39,8 @@ Synopsis of eprobe_events
@SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
$comm : Fetch current task comm.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
+ this_cpu_read(FETCHARG) : Read the value of the per-CPU variable FETCHARG on the current CPU.
+ this_cpu_ptr(FETCHARG) : Get the address of the per-CPU variable FETCHARG on the current CPU.
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 3392cab016b3..3439bc9bd351 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -52,6 +52,8 @@ Synopsis of fprobe-events
$comm : Fetch current task comm.
$current : Fetch the address of the current task_struct.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*4)(\*5)
+ this_cpu_read(FETCHARG) : Read the value of the per-CPU variable FETCHARG on the current CPU.
+ this_cpu_ptr(FETCHARG) : Get the address of the per-CPU variable FETCHARG on the current CPU.
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 81e4fe38791d..9ae330eb0a52 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -55,6 +55,8 @@ Synopsis of kprobe_events
$comm : Fetch current task comm.
$current : Fetch the address of the current task_struct.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
+ this_cpu_read(FETCHARG) : Read the value of the per-CPU variable FETCHARG on the current CPU.
+ this_cpu_ptr(FETCHARG) : Get the address of the per-CPU variable FETCHARG on the current CPU.
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7a5676524f1a..d4121acc2938 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4332,6 +4332,7 @@ static const char readme_msg[] =
"\t $stack<index>, $stack, $retval, $comm, $current\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
+ "\t this_cpu_read(<fetcharg>), this_cpu_ptr(<fetcharg>)\n"
"\t kernel return probes support: $retval, $arg<N>, $comm\n"
"\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
"\t b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 93edac7fc943..34286a2ac72e 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -349,6 +349,93 @@ static int parse_trace_event(char *arg, struct fetch_insn *code,
return -EINVAL;
}
+/* this_cpu_* parser */
+#define THIS_CPU_PTR_PREFIX "this_cpu_ptr("
+#define THIS_CPU_READ_PREFIX "this_cpu_read("
+#define THIS_CPU_PTR_LEN (sizeof(THIS_CPU_PTR_PREFIX) - 1)
+#define THIS_CPU_READ_LEN (sizeof(THIS_CPU_READ_PREFIX) - 1)
+
+static int
+parse_probe_arg(char *arg, const struct fetch_type *type,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx);
+
+/* handle dereference nested call */
+static inline int handle_dereference(char *arg, struct fetch_insn **pcode,
+ struct fetch_insn *end, struct traceprobe_parse_context *ctx,
+ int deref, long offset)
+{
+ const struct fetch_type *type = find_fetch_type(NULL, ctx->flags);
+ struct fetch_insn *code = *pcode;
+ int cur_offs = ctx->offset;
+ char *tmp;
+ int ret;
+
+ tmp = strrchr(arg, ')');
+ if (!tmp) {
+ trace_probe_log_err(ctx->offset + strlen(arg),
+ DEREF_OPEN_BRACE);
+ return -EINVAL;
+ }
+
+ *tmp = '\0';
+ ret = parse_probe_arg(arg, type, &code, end, ctx);
+ if (ret)
+ return ret;
+ ctx->offset = cur_offs;
+ if (code->op == FETCH_OP_COMM || code->op == FETCH_OP_DATA) {
+ trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
+ return -EINVAL;
+ }
+ code++;
+ if (code == end) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -EINVAL;
+ }
+ *pcode = code;
+
+ code->op = deref;
+ code->offset = offset;
+ /* Reset the last type if used */
+ ctx->last_type = NULL;
+ return 0;
+}
+
+static int parse_this_cpu(char *arg, struct fetch_insn **pcode,
+ struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx)
+{
+ struct fetch_insn *code;
+ bool is_ptr = false;
+ int ret;
+
+ if (str_has_prefix(arg, THIS_CPU_PTR_PREFIX)) {
+ arg += THIS_CPU_PTR_LEN;
+ ctx->offset += THIS_CPU_PTR_LEN;
+ is_ptr = true;
+ } else if (str_has_prefix(arg, THIS_CPU_READ_PREFIX)) {
+ arg += THIS_CPU_READ_LEN;
+ ctx->offset += THIS_CPU_READ_LEN;
+ } else
+ return -EINVAL;
+
+ ret = handle_dereference(arg, pcode, end, ctx, FETCH_OP_CPU_PTR, 0);
+ if (ret || is_ptr)
+ return ret;
+
+ /* this_cpu_read(VAR) -> +0(this_cpu_ptr(VAR)) */
+ code = *pcode;
+ code++;
+ if (code == end) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+ return -EINVAL;
+ }
+ code->op = FETCH_OP_DEREF;
+ code->offset = 0;
+ *pcode = code;
+ return 0;
+}
+
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
static u32 btf_type_int(const struct btf_type *t)
@@ -940,11 +1027,6 @@ static char *find_matched_close_paren(char *s)
return NULL;
}
-static int
-parse_probe_arg(char *arg, const struct fetch_type *type,
- struct fetch_insn **pcode, struct fetch_insn *end,
- struct traceprobe_parse_context *ctx);
-
static int handle_typecast(char *arg, struct fetch_insn **pcode,
struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
@@ -970,7 +1052,8 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
*tmp++ = '\0';
/* Handle the nested structure like (STRUCT)(VAR->FIELD)->... */
- if (*tmp == '(') {
+ if (*tmp == '(' || str_has_prefix(tmp, THIS_CPU_PTR_PREFIX) ||
+ str_has_prefix(tmp, THIS_CPU_READ_PREFIX)) {
char *close = find_matched_close_paren(tmp);
ctx->offset += tmp - arg;
@@ -990,12 +1073,18 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
trace_probe_log_err(ctx->offset, TOO_MANY_NESTED);
return -E2BIG;
}
- *close = '\0';
- ctx->offset += 1; /* for the '(' */
- /* We need to parse the nested one */
- ret = parse_probe_arg(tmp + 1, find_fetch_type(NULL, ctx->flags),
- pcode, end, ctx);
+ if (*tmp == '(') {
+ /* Extract the inner argument */
+ *close = '\0';
+ ctx->offset += 1;/* for the '(' */
+ /* Parse the nested one */
+ ret = parse_probe_arg(tmp + 1, find_fetch_type(NULL, ctx->flags),
+ pcode, end, ctx);
+ } else {
+ /* this_cpu_* will be parsed in parse_this_cpu() */
+ ret = parse_this_cpu(tmp, pcode, end, ctx);
+ }
if (ret < 0)
return ret;
ctx->nested_level--;
@@ -1465,36 +1554,9 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
}
ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
arg = tmp + 1;
- tmp = strrchr(arg, ')');
- if (!tmp) {
- trace_probe_log_err(ctx->offset + strlen(arg),
- DEREF_OPEN_BRACE);
- return -EINVAL;
- } else {
- const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags);
- int cur_offs = ctx->offset;
-
- *tmp = '\0';
- ret = parse_probe_arg(arg, t2, &code, end, ctx);
- if (ret)
- break;
- ctx->offset = cur_offs;
- if (code->op == FETCH_OP_COMM ||
- code->op == FETCH_OP_DATA) {
- trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
- return -EINVAL;
- }
- if (++code == end) {
- trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
- return -EINVAL;
- }
- *pcode = code;
-
- code->op = deref;
- code->offset = offset;
- /* Reset the last type if used */
- ctx->last_type = NULL;
- }
+ ret = handle_dereference(arg, pcode, end, ctx, deref, offset);
+ if (ret < 0)
+ return ret;
break;
case '\\': /* Immediate value */
if (arg[1] == '"') { /* Immediate string */
@@ -1515,15 +1577,18 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
ret = handle_typecast(arg, pcode, end, ctx);
break;
default:
- if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
+ if (str_has_prefix(arg, THIS_CPU_PTR_PREFIX) ||
+ str_has_prefix(arg, THIS_CPU_READ_PREFIX)) {
+ ret = parse_this_cpu(arg, pcode, end, ctx);
+ } else if (isalpha(arg[0]) || arg[0] == '_') { /* BTF variable */
if (!tparg_is_function_entry(ctx->flags) &&
!tparg_is_function_return(ctx->flags)) {
trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
return -EINVAL;
}
ret = parse_btf_arg(arg, pcode, end, ctx);
- break;
}
+ break;
}
if (!ret && code->op == FETCH_OP_NOP) {
/* Parsed, but do not find fetch method */
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 62645e847bd1..3651ce1f144f 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -100,6 +100,7 @@ enum fetch_op {
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
FETCH_OP_UDEREF, /* User-space Dereference: .offset */
+ FETCH_OP_CPU_PTR, /* Per-CPU pointer for this CPU */
// Stage 3 (store) ops
FETCH_OP_ST_RAW, /* Raw: .size */
FETCH_OP_ST_MEM, /* Mem: .offset, .size */
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index f630930288d2..9265b03cf19d 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -129,25 +129,35 @@ process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
struct fetch_insn *s3 = NULL;
int total = 0, ret = 0, i = 0;
u32 loc = 0;
- unsigned long lval = val;
+ unsigned long lval, llval = val;
stage2:
/* 2nd stage: dereference memory if needed */
do {
- if (code->op == FETCH_OP_DEREF) {
- lval = val;
+ lval = val;
+ switch (code->op) {
+ case FETCH_OP_DEREF:
ret = probe_mem_read(&val, (void *)val + code->offset,
sizeof(val));
- } else if (code->op == FETCH_OP_UDEREF) {
- lval = val;
+ break;
+ case FETCH_OP_UDEREF:
ret = probe_mem_read_user(&val,
(void *)val + code->offset, sizeof(val));
- } else
break;
+ case FETCH_OP_CPU_PTR:
+ val = (unsigned long)this_cpu_ptr((void __percpu *)val);
+ ret = 0;
+ break;
+ default:
+ lval = llval;
+ goto out;
+ }
if (ret)
return ret;
+ llval = lval;
code++;
} while (1);
+out:
s3 = code;
stage3:
^ permalink raw reply related
* [PATCH v5 5/7] tracing/probes: Add $current variable support
From: Masami Hiramatsu (Google) @ 2026-06-17 1:03 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178165816303.269421.7302603996990753309.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Since we can use the BTF to cast value to a structure pointer type,
it is useful to introduce "$current" special variable support to
fetcharg.
User can define a fetcharg to access current task_struct properties
using BTF info. e.g.
$current->cpus_ptr
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v5:
- Use s32 for bof_find_btf_id().
Changes in v4:
- Add $current in README when CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y case.
- Fix to prohibit using $current in eprobes and address based kprobes.
Changes in v3:
- Remove $current support from eprobes (because eprobes is only for event)
- Prohibit uprobes to use $current.
Changes in v2:
- Support to parse $current in parse_btf_arg().
- If no typecast on $current, it automatically casted to task_struct.
- Check error case if $current follows something except for "-".
---
Documentation/trace/fprobetrace.rst | 1 +
Documentation/trace/kprobetrace.rst | 1 +
kernel/trace/trace.c | 4 ++--
kernel/trace/trace_probe.c | 38 ++++++++++++++++++++++++++++++++++-
kernel/trace/trace_probe.h | 1 +
kernel/trace/trace_probe_tmpl.h | 3 +++
6 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 290a9e6f7491..3392cab016b3 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -50,6 +50,7 @@ Synopsis of fprobe-events
$argN : Fetch the Nth function argument. (N >= 1) (\*2)
$retval : Fetch return value.(\*3)
$comm : Fetch current task comm.
+ $current : Fetch the address of the current task_struct.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*4)(\*5)
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index a62707e6a9f2..81e4fe38791d 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -53,6 +53,7 @@ Synopsis of kprobe_events
$argN : Fetch the Nth function argument. (N >= 1) (\*1)
$retval : Fetch return value.(\*2)
$comm : Fetch current task comm.
+ $current : Fetch the address of the current task_struct.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e36af853199..7a5676524f1a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4323,13 +4323,13 @@ static const char readme_msg[] =
"\t args: <name>=fetcharg[:type]\n"
"\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
- "\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+ "\t $stack<index>, $stack, $retval, $comm, $arg<N>, $current\n"
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t [(structname[,field])]<argname>[->field[->field|.field...]],\n"
"\t [(structname[,field])](fetcharg)->field[->field|.field...],\n"
#endif
#else
- "\t $stack<index>, $stack, $retval, $comm,\n"
+ "\t $stack<index>, $stack, $retval, $comm, $current\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
"\t kernel return probes support: $retval, $arg<N>, $comm\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 0f2be8c817fc..93edac7fc943 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -730,6 +730,24 @@ static int parse_btf_arg(char *varname,
goto found;
}
+ if (strcmp(varname, "$current") == 0) {
+ code->op = FETCH_OP_CURRENT;
+ /* If no typecast is specified for $current, use task_struct by default */
+ if (!ctx->struct_btf) {
+ s32 ttid = bpf_find_btf_id("task_struct", BTF_KIND_STRUCT,
+ &ctx->struct_btf);
+
+ if (ttid < 0) {
+ trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
+ return -ENOENT;
+ }
+ /* btf_type_skip_modifier() requires u32 for type id. */
+ tid = ttid;
+ ctx->last_struct = btf_type_skip_modifiers(ctx->struct_btf, tid, &tid);
+ }
+ goto found;
+ }
+
if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
code->op = FETCH_OP_RETVAL;
/* Check whether the function return type is not void, even with typecast. */
@@ -763,8 +781,8 @@ static int parse_btf_arg(char *varname,
return -ENOENT;
}
}
- params = ctx->params;
+ params = ctx->params;
for (i = 0; i < ctx->nr_params; i++) {
const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
@@ -1254,6 +1272,24 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
return 0;
}
+ /* $current returns the address of the current task_struct. */
+ if (str_has_prefix(arg, "current")) {
+ /* $current is only supported by kernel probe and need function name. */
+ if (!(ctx->flags & TPARG_FL_KERNEL) || !ctx->funcname) {
+ err = TP_ERR_BAD_VAR;
+ goto inval;
+ }
+ arg += strlen("current");
+ if (*arg == '-' && IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS))
+ return parse_btf_arg(orig_arg, pcode, end, ctx);
+
+ if (*arg != '\0')
+ goto inval;
+
+ code->op = FETCH_OP_CURRENT;
+ return 0;
+ }
+
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
len = str_has_prefix(arg, "arg");
if (len) {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 44f113faae61..62645e847bd1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -96,6 +96,7 @@ enum fetch_op {
FETCH_OP_FOFFS, /* File offset: .immediate */
FETCH_OP_DATA, /* Allocated data: .data */
FETCH_OP_EDATA, /* Entry data: .offset */
+ FETCH_OP_CURRENT, /* Current task_struct address */
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
FETCH_OP_UDEREF, /* User-space Dereference: .offset */
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index f39b37fcdb3b..f630930288d2 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -112,6 +112,9 @@ process_common_fetch_insn(struct fetch_insn *code, unsigned long *val)
case FETCH_OP_DATA:
*val = (unsigned long)code->data;
break;
+ case FETCH_OP_CURRENT:
+ *val = (unsigned long)current;
+ break;
default:
return -EILSEQ;
}
^ permalink raw reply related
* [PATCH v5 4/7] tracing/probes: Support field specifier option for typecast
From: Masami Hiramatsu (Google) @ 2026-06-17 1:03 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178165816303.269421.7302603996990753309.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Add a field specifier option for the typecast. This works like
container_of() macro.
(STRUCT[,FIELD[.FIELD2...]])VAR
This is equivalent to :
container_of(VAR, struct STRUCT, FIELD[.FIELD2...])
For example:
echo "f tick_nohz_handler next_tick=(tick_sched,sched_timer)timer->next_tick" >> dynamic_events
This will trace tick_nohz_handler() with its tick_sched::next_tick which
is converted from @timer by contianer_of(tick, struct tick_sched, sched_timer).
So, if you enabkle both fprobes:tick_nohz_handler__entry and
timer:hrtimer_expire_entry events, we will see something like:
<idle>-0 [002] d.h1. 3778.087272: hrtimer_expire_entry: hrtimer=00000000d63db328 f
unction=tick_nohz_handler now=3777450051040
<idle>-0 [002] d.h1. 3778.087281: tick_nohz_handler__entry: (tick_nohz_handler+0x4
/0x140) next_tick=3777450000000
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v3:
- Fix error caret position.
Changes in v2:
- Use byteoffset for typecast field offset instead of bitoffset. This fixes negative modulo calculation.
- Check whether a field is specified after typecast.
- Reject if typecast field option has arrow operator.
---
Documentation/trace/eprobetrace.rst | 5 +
Documentation/trace/fprobetrace.rst | 8 +-
Documentation/trace/kprobetrace.rst | 8 +-
kernel/trace/trace.c | 4 -
kernel/trace/trace_probe.c | 179 ++++++++++++++++++++++++-----------
kernel/trace/trace_probe.h | 5 +
6 files changed, 142 insertions(+), 67 deletions(-)
diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index cd0b4aa7f896..680e0af43d5d 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -49,7 +49,10 @@ Synopsis of eprobe_events
(STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
a pointer to STRUCT and then derference the pointer defined by
->MEMBER. Note that when this is used, the FIELD name does not
- need to be prefixed with a '$'.
+ need to be prefixed with a '$'. ASGN can be specified optionally.
+ If ASGN is specified, FIELD will be cast to the same offset
+ position as the ASGN member, rather than to the beginning of
+ the STRUCT.
(STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
also be used with another FETCHARG instead of FIELD.
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 6b8bb27bb62d..290a9e6f7491 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -57,10 +57,12 @@ Synopsis of fprobe-events
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr"
and bitfield are supported.
- (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+ (STRUCT[,ASGN])FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
a pointer to STRUCT and then derference the pointer defined by
- ->MEMBER.
- (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ ->MEMBER. ASGN can be specified optionally. If ASGN is specified,
+ FIELD will be cast to the same offset position as the ASGN member,
+ rather than to the beginning of the STRUCT.
+ (STRUCT[,ASGN])(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
also be used with another FETCHARG instead of FIELD.
(\*1) This is available only when BTF is enabled.
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index c4382765d5b2..a62707e6a9f2 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -61,11 +61,13 @@ Synopsis of kprobe_events
(x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char",
"string", "ustring", "symbol", "symstr" and bitfield are
supported.
- (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+ (STRUCT[,ASGN])FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
a pointer to STRUCT and then derference the pointer defined by
->MEMBER. Note that this is available only when the probe is
- on function entry.
- (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ on function entry. ASGN can be specified optionally. If ASGN
+ is specified, FIELD will be cast to the same offset position
+ as the ASGN member, rather than to the beginning of the STRUCT.
+ (STRUCT[,ASGN])(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
also be used with another FETCHARG instead of FIELD.
(\*1) only for the probe on function entry (offs == 0). Note, this argument access
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f70318918c2..0e36af853199 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4325,8 +4325,8 @@ static const char readme_msg[] =
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
- "\t [(structname)]<argname>[->field[->field|.field...]],\n"
- "\t [(structname)](fetcharg)->field[->field|.field...],\n"
+ "\t [(structname[,field])]<argname>[->field[->field|.field...]],\n"
+ "\t [(structname[,field])](fetcharg)->field[->field|.field...],\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 5d8b85435eda..0f2be8c817fc 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -574,6 +574,65 @@ static int split_next_field(char *varname, char **next_field,
return ret;
}
+/* Inner loop for solving dot operator ('.'). Return bit-offset of the given field */
+static int get_bitoffset_of_field(char **pfieldname, const struct btf_type **ptype,
+ struct traceprobe_parse_context *ctx)
+{
+ const struct btf_type *type = *ptype;
+ const struct btf_member *field;
+ struct btf *btf = ctx_btf(ctx);
+ char *fieldname = *pfieldname;
+ int bitoffs = 0;
+ u32 anon_offs;
+ char *next;
+ int is_ptr;
+ s32 tid;
+
+ do {
+ next = NULL;
+ is_ptr = split_next_field(fieldname, &next, ctx);
+ if (is_ptr < 0)
+ return is_ptr;
+
+ anon_offs = 0;
+ field = btf_find_struct_member(btf, type, fieldname,
+ &anon_offs);
+ if (IS_ERR(field)) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return PTR_ERR(field);
+ }
+ if (!field) {
+ trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
+ return -ENOENT;
+ }
+ /* Add anonymous structure/union offset */
+ bitoffs += anon_offs;
+
+ /* Accumulate the bit-offsets of the dot-connected fields */
+ if (btf_type_kflag(type)) {
+ bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
+ ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
+ } else {
+ bitoffs += field->offset;
+ ctx->last_bitsize = 0;
+ }
+
+ type = btf_type_skip_modifiers(btf, field->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+
+ if (next)
+ ctx->offset += next - fieldname;
+ fieldname = next;
+ } while (!is_ptr && fieldname);
+
+ *pfieldname = fieldname;
+ *ptype = type;
+
+ return bitoffs;
+}
/*
* Parse the field of data structure. The @type must be a pointer type
* pointing the target data structure type.
@@ -583,16 +642,14 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code = *pcode;
- const struct btf_member *field;
- u32 bitoffs, anon_offs;
- bool is_struct = ctx->struct_btf != NULL;
struct btf *btf = ctx_btf(ctx);
- char *next;
- int is_ptr;
+ bool is_first_field = true;
+ int bitoffs;
s32 tid;
do {
- if (!is_struct) {
+ /* For the first field of typecast, @type will be the target structure type. */
+ if (!(is_first_field && ctx->struct_btf)) {
/* Outer loop for solving arrow operator ('->') */
if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
@@ -606,60 +663,25 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
return -EINVAL;
}
}
- /* Only the first type can skip being a pointer */
- is_struct = false;
-
- bitoffs = 0;
- do {
- /* Inner loop for solving dot operator ('.') */
- next = NULL;
- is_ptr = split_next_field(fieldname, &next, ctx);
- if (is_ptr < 0)
- return is_ptr;
-
- anon_offs = 0;
- field = btf_find_struct_member(btf, type, fieldname,
- &anon_offs);
- if (IS_ERR(field)) {
- trace_probe_log_err(ctx->offset, BAD_BTF_TID);
- return PTR_ERR(field);
- }
- if (!field) {
- trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
- return -ENOENT;
- }
- /* Add anonymous structure/union offset */
- bitoffs += anon_offs;
-
- /* Accumulate the bit-offsets of the dot-connected fields */
- if (btf_type_kflag(type)) {
- bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
- ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
- } else {
- bitoffs += field->offset;
- ctx->last_bitsize = 0;
- }
-
- type = btf_type_skip_modifiers(btf, field->type, &tid);
- if (!type) {
- trace_probe_log_err(ctx->offset, BAD_BTF_TID);
- return -EINVAL;
- }
-
- ctx->offset += next - fieldname;
- fieldname = next;
- } while (!is_ptr && fieldname);
+ bitoffs = get_bitoffset_of_field(&fieldname, &type, ctx);
+ if (bitoffs < 0)
+ return bitoffs;
if (++code == end) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
return -EINVAL;
}
code->op = FETCH_OP_DEREF; /* TODO: user deref support */
code->offset = bitoffs / 8;
+ if (is_first_field && ctx->struct_btf) {
+ /* The first field can be typecasted with field option. */
+ code->offset -= ctx->prefix_byteoffs;
+ }
*pcode = code;
ctx->last_bitoffs = bitoffs % 8;
ctx->last_type = type;
+ is_first_field = false;
} while (fieldname);
return 0;
@@ -690,6 +712,11 @@ static int parse_btf_arg(char *varname,
NOSUP_DAT_ARG);
return -EOPNOTSUPP;
}
+ if (!field && ctx->struct_btf) {
+ /* Typecast without field option is not supported */
+ trace_probe_log_err(ctx->offset + strlen(varname), TYPECAST_REQ_FIELD);
+ return -EOPNOTSUPP;
+ }
if (ctx->flags & TPARG_FL_TEVENT) {
ret = parse_trace_event(varname, code, ctx);
@@ -700,8 +727,7 @@ static int parse_btf_arg(char *varname,
/* TEVENT is only here via a typecast */
if (WARN_ON_ONCE(ctx->struct_btf == NULL))
return -EINVAL;
- type = ctx->last_struct;
- goto found_type;
+ goto found;
}
if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
@@ -770,7 +796,6 @@ static int parse_btf_arg(char *varname,
type = ctx->last_struct;
else
type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
-found_type:
if (!type) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
return -EINVAL;
@@ -839,6 +864,46 @@ static int query_btf_struct(const char *sname, struct traceprobe_parse_context *
return 0;
}
+static int parse_btf_casttype(char *casttype, struct traceprobe_parse_context *ctx)
+{
+ char *field;
+ int ret;
+
+ /* Field option - evaluated later. */
+ field = strchr(casttype, ',');
+ if (field)
+ *field++ = '\0';
+
+ ret = query_btf_struct(casttype, ctx);
+ if (ret < 0) {
+ trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+
+ if (field) {
+ struct btf_type *type = (struct btf_type *)ctx->last_struct;
+
+ ctx->offset += field - casttype;
+ ret = get_bitoffset_of_field(&field, &ctx->last_struct, ctx);
+ if (ret < 0)
+ return ret;
+ if (ret % 8) {
+ trace_probe_log_err(ctx->offset, TYPECAST_NOT_ALIGNED);
+ return -EINVAL;
+ }
+ if (field != NULL) {
+ /* this means @field skips an arrow operator ("->"). */
+ trace_probe_log_err(ctx->offset - 2, TYPECAST_BAD_ARROW);
+ return -EINVAL;
+ }
+ ctx->prefix_byteoffs = ret / 8;
+ /* Restore the original struct type (overwritten by get_bitoffset_of_field) */
+ ctx->last_struct = type;
+ }
+
+ return ret;
+}
+
/* Find the matching closing parenthesis for a given opening parenthesis. */
static char *find_matched_close_paren(char *s)
{
@@ -922,11 +987,10 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
nested = true;
}
- ret = query_btf_struct(arg + 1, ctx);
- if (ret < 0) {
- trace_probe_log_err(orig_offset + 1, NO_PTR_STRCT);
- return -EINVAL;
- }
+ ctx->offset = orig_offset + 1; /* for the '(' */
+ ret = parse_btf_casttype(arg + 1, ctx);
+ if (ret < 0)
+ return ret;
ctx->offset = orig_offset + tmp - arg;
/* If it is nested, tmp points to the field name. */
@@ -934,6 +998,7 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
ret = parse_btf_field(tmp, ctx->last_struct, pcode, end, ctx);
else
ret = parse_btf_arg(tmp, pcode, end, ctx);
+ ctx->prefix_byteoffs = 0;
return ret;
}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 982d32a5df8b..44f113faae61 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -436,6 +436,7 @@ struct traceprobe_parse_context {
unsigned int flags;
int offset;
int nested_level;
+ int prefix_byteoffs; /* The byte offset of the prefix field of typecast */
};
#define TRACEPROBE_MAX_NESTED_LEVEL 3
@@ -576,7 +577,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(EVENT_TOO_BIG, "Event too big (too many fields?)"), \
C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"), \
C(TYPECAST_REQ_FIELD, "Typecast requires a field access"), \
- C(TOO_MANY_NESTED, "Too many nested typecasts/dereferences"),
+ C(TOO_MANY_NESTED, "Too many nested typecasts/dereferences"), \
+ C(TYPECAST_NOT_ALIGNED, "Typecast field option is not byte-aligned"), \
+ C(TYPECAST_BAD_ARROW, "Typecast field option does not support -> operator"),
#undef C
#define C(a, b) TP_ERR_##a
^ permalink raw reply related
* [PATCH v5 3/7] tracing/probes: Support nested typecast
From: Masami Hiramatsu (Google) @ 2026-06-17 1:03 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178165816303.269421.7302603996990753309.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
When we hit an open parenthesis right after typecast closing
parenthesis, it means we have nested typecast. This allows us to
typecast a generic data member in a structure to a pointer to
another structure.
For example, to cast a DATA_MEMBER of VAR structure to STRUCT pointer
and get MEMBER value.
(STRUCT)(VAR->DATA_MEMBER)->MEMBER
Also, we can nest typecast.
(STRUCT1)((STRUCT2)$ARG->FIELD2)->FIELD1
Currently the max nest level is limited to 3.
This also allows user to use typecasting for registers or stacks on
kprobe events. e.g.
(STRUCT)(%ax)->MEMBER
(STRUCT)($stack0)->MEMBER
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v4:
- Use orig_offset for reporting NO_PTR_STRCT error.
Changes in v2:
- Fix to skip "->" after closing parenthetsis.
---
Documentation/trace/eprobetrace.rst | 2 +
Documentation/trace/fprobetrace.rst | 2 +
Documentation/trace/kprobetrace.rst | 2 +
kernel/trace/trace.c | 1
kernel/trace/trace_probe.c | 78 +++++++++++++++++++++++++++++++----
kernel/trace/trace_probe.h | 7 +++
6 files changed, 83 insertions(+), 9 deletions(-)
diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index fe3602540569..cd0b4aa7f896 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -50,6 +50,8 @@ Synopsis of eprobe_events
a pointer to STRUCT and then derference the pointer defined by
->MEMBER. Note that when this is used, the FIELD name does not
need to be prefixed with a '$'.
+ (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ also be used with another FETCHARG instead of FIELD.
Types
-----
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 7435ded2d66d..6b8bb27bb62d 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -60,6 +60,8 @@ Synopsis of fprobe-events
(STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
a pointer to STRUCT and then derference the pointer defined by
->MEMBER.
+ (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ also be used with another FETCHARG instead of FIELD.
(\*1) This is available only when BTF is enabled.
(\*2) only for the probe on function entry (offs == 0). Note, this argument access
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index f73614997d52..c4382765d5b2 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -65,6 +65,8 @@ Synopsis of kprobe_events
a pointer to STRUCT and then derference the pointer defined by
->MEMBER. Note that this is available only when the probe is
on function entry.
+ (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ also be used with another FETCHARG instead of FIELD.
(\*1) only for the probe on function entry (offs == 0). Note, this argument access
is best effort, because depending on the argument type, it may be passed on
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aa93e7b01146..4f70318918c2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4326,6 +4326,7 @@ static const char readme_msg[] =
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t [(structname)]<argname>[->field[->field|.field...]],\n"
+ "\t [(structname)](fetcharg)->field[->field|.field...],\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 064ca59387ba..5d8b85435eda 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -839,10 +839,35 @@ static int query_btf_struct(const char *sname, struct traceprobe_parse_context *
return 0;
}
+/* Find the matching closing parenthesis for a given opening parenthesis. */
+static char *find_matched_close_paren(char *s)
+{
+ char *p = s;
+ int count = 0;
+
+ while (*p) {
+ if (*p == '(')
+ count++;
+ else if (*p == ')') {
+ if (--count == 0)
+ return p;
+ }
+ p++;
+ }
+ return NULL;
+}
+
+static int
+parse_probe_arg(char *arg, const struct fetch_type *type,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx);
+
static int handle_typecast(char *arg, struct fetch_insn **pcode,
struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
{
+ int orig_offset = ctx->offset;
+ bool nested = false;
char *tmp;
int ret;
@@ -859,19 +884,56 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
DEREF_OPEN_BRACE);
return -EINVAL;
}
- *tmp = '\0';
- ret = query_btf_struct(arg + 1, ctx);
- *tmp = ')';
+ *tmp++ = '\0';
+
+ /* Handle the nested structure like (STRUCT)(VAR->FIELD)->... */
+ if (*tmp == '(') {
+ char *close = find_matched_close_paren(tmp);
+
+ ctx->offset += tmp - arg;
+ if (!close) {
+ trace_probe_log_err(ctx->offset, DEREF_OPEN_BRACE);
+ return -EINVAL;
+ }
+ /* We expect a field access for typecast */
+ if (close[1] != '-' || close[2] != '>') {
+ trace_probe_log_err(ctx->offset + close - tmp + 1,
+ TYPECAST_REQ_FIELD);
+ return -EINVAL;
+ }
+ ctx->nested_level++;
+ if (ctx->nested_level > TRACEPROBE_MAX_NESTED_LEVEL) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_NESTED);
+ return -E2BIG;
+ }
+ *close = '\0';
+
+ ctx->offset += 1; /* for the '(' */
+ /* We need to parse the nested one */
+ ret = parse_probe_arg(tmp + 1, find_fetch_type(NULL, ctx->flags),
+ pcode, end, ctx);
+ if (ret < 0)
+ return ret;
+ ctx->nested_level--;
+ clear_struct_btf(ctx);
+
+ tmp = close + 3;/* Skip "->" after closing parenthesis */
+ nested = true;
+ }
+
+ ret = query_btf_struct(arg + 1, ctx);
if (ret < 0) {
- trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
+ trace_probe_log_err(orig_offset + 1, NO_PTR_STRCT);
return -EINVAL;
}
- tmp++;
-
- ctx->offset += tmp - arg;
- ret = parse_btf_arg(tmp, pcode, end, ctx);
+ ctx->offset = orig_offset + tmp - arg;
+ /* If it is nested, tmp points to the field name. */
+ if (nested)
+ ret = parse_btf_field(tmp, ctx->last_struct, pcode, end, ctx);
+ else
+ ret = parse_btf_arg(tmp, pcode, end, ctx);
return ret;
}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 883938a74aee..982d32a5df8b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -435,8 +435,11 @@ struct traceprobe_parse_context {
struct trace_probe *tp;
unsigned int flags;
int offset;
+ int nested_level;
};
+#define TRACEPROBE_MAX_NESTED_LEVEL 3
+
extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
const char *argv,
struct traceprobe_parse_context *ctx);
@@ -571,7 +574,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(TOO_MANY_ARGS, "Too many arguments are specified"), \
C(TOO_MANY_EARGS, "Too many entry arguments specified"), \
C(EVENT_TOO_BIG, "Event too big (too many fields?)"), \
- C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"),
+ C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"), \
+ C(TYPECAST_REQ_FIELD, "Typecast requires a field access"), \
+ C(TOO_MANY_NESTED, "Too many nested typecasts/dereferences"),
#undef C
#define C(a, b) TP_ERR_##a
^ permalink raw reply related
* [PATCH v5 2/7] tracing/probes: Support typecast for various probe events
From: Masami Hiramatsu (Google) @ 2026-06-17 1:03 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178165816303.269421.7302603996990753309.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Support BTF typecast feature on other probe events, but only if it is
kernel function entry or return, and must use function parameter name
or $retval. This means you can do:
(STRUCT)PARAM->MEMBER
but you can not do (this should be enabled by nesting support)
(STRUCT)%reg->MEMBER
To support other probe events, we just need to use last_struct type
when we find a function parameter in parse_btf_arg().
This also update <tracefs>/README file to show struct typecast.
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v5:
- Add comments about $retval with typecast.
- Even if the type of retvalue is not known, if user specifies typecast,
use it for its type.
Changes in v3:
- Clarify the limitation.
Changes in v2:
- Fix to re-enable typecast on eprobe.
---
Documentation/trace/fprobetrace.rst | 3 +++
Documentation/trace/kprobetrace.rst | 4 ++++
kernel/trace/trace.c | 2 +-
kernel/trace/trace_probe.c | 23 +++++++++++++++++------
kernel/trace/trace_probe.h | 5 +++++
5 files changed, 30 insertions(+), 7 deletions(-)
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index b4c2ca3d02c1..7435ded2d66d 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -57,6 +57,9 @@ Synopsis of fprobe-events
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr"
and bitfield are supported.
+ (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+ a pointer to STRUCT and then derference the pointer defined by
+ ->MEMBER.
(\*1) This is available only when BTF is enabled.
(\*2) only for the probe on function entry (offs == 0). Note, this argument access
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 3b6791c17e9b..f73614997d52 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -61,6 +61,10 @@ Synopsis of kprobe_events
(x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char",
"string", "ustring", "symbol", "symstr" and bitfield are
supported.
+ (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+ a pointer to STRUCT and then derference the pointer defined by
+ ->MEMBER. Note that this is available only when the probe is
+ on function entry.
(\*1) only for the probe on function entry (offs == 0). Note, this argument access
is best effort, because depending on the argument type, it may be passed on
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d..aa93e7b01146 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4325,7 +4325,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
- "\t <argname>[->field[->field|.field...]],\n"
+ "\t [(structname)]<argname>[->field[->field|.field...]],\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index fd1caa1f9723..064ca59387ba 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -706,7 +706,7 @@ static int parse_btf_arg(char *varname,
if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
code->op = FETCH_OP_RETVAL;
- /* Check whether the function return type is not void */
+ /* Check whether the function return type is not void, even with typecast. */
if (query_btf_context(ctx) == 0) {
if (ctx->proto->type == 0) {
trace_probe_log_err(ctx->offset, NO_RETVAL);
@@ -715,6 +715,13 @@ static int parse_btf_arg(char *varname,
tid = ctx->proto->type;
goto found;
}
+ /*
+ * Even if we can not find appropriate BTF info, we can still access
+ * the field via typecast.
+ */
+ if (ctx->struct_btf)
+ goto found;
+
if (field) {
trace_probe_log_err(ctx->offset + field - varname,
NO_BTF_ENTRY);
@@ -759,7 +766,10 @@ static int parse_btf_arg(char *varname,
return -ENOENT;
found:
- type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+ if (ctx->struct_btf)
+ type = ctx->last_struct;
+ else
+ type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
found_type:
if (!type) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
@@ -836,10 +846,11 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
char *tmp;
int ret;
- /* Currently this only works for eprobes */
- if (!(ctx->flags & TPARG_FL_TEVENT)) {
- trace_probe_log_err(ctx->offset, TYPECAST_NOT_EVENT);
- return -EINVAL;
+ if (!(tparg_is_event_probe(ctx->flags) ||
+ tparg_is_function_entry(ctx->flags) ||
+ tparg_is_function_return(ctx->flags))) {
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
}
tmp = strchr(arg, ')');
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 15758cc11fc6..883938a74aee 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -414,6 +414,11 @@ static inline bool tparg_is_function_return(unsigned int flags)
return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_RETURN);
}
+static inline bool tparg_is_event_probe(unsigned int flags)
+{
+ return !!(flags & TPARG_FL_TEVENT);
+}
+
struct traceprobe_parse_context {
struct trace_event_call *event;
/* BTF related parameters */
^ permalink raw reply related
* [PATCH v5 1/7] tracing/events: Fix to check the simple_tsk_fn creation
From: Masami Hiramatsu (Google) @ 2026-06-17 1:02 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178165816303.269421.7302603996990753309.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Sashiko pointed that this sample code does not correctly handle the
failure of thread creation because kthread_run() can return -errno.
Check the simple_tsk_fn is correctly initialized (created) or not.
Link: https://sashiko.dev/#/patchset/178092865666.163648.10457567771536160909.stgit%40devnote2
Fixes: 9cfe06f8cd5c ("tracing/events: add trace-events-sample")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v4:
- Fix to remove decrementing counter in error path, since foo_bar_reg() always returns 0.
- Add a newline to error message.
Changes in v3:
- Recover the usage counter.
---
samples/trace_events/trace-events-sample.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index ecc7db237f2e..0b7a6efdb247 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -107,6 +107,10 @@ int foo_bar_reg(void)
* for consistency sake, we still take the thread_mutex.
*/
simple_tsk_fn = kthread_run(simple_thread_fn, NULL, "event-sample-fn");
+ if (IS_ERR_OR_NULL(simple_tsk_fn)) {
+ pr_err("Failed to create simple_thread_fn\n");
+ simple_tsk_fn = NULL;
+ }
out:
mutex_unlock(&thread_mutex);
return 0;
^ permalink raw reply related
* [PATCH v5 0/7] tracing/probes: Add more typecast features
From: Masami Hiramatsu (Google) @ 2026-06-17 1:02 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
Hi,
Here is the 5th version of series to introduce more typecast features
to probe events. The previous version is here:
https://lore.kernel.org/all/178148603548.185520.3389196102475741865.stgit@devnote2/
In this version, I fixed some issues found by Sashiko reviews,
convert this_cpu_read() into this_cpu_ptr() + dereference[6/7],
and add some syntax testcases.
Steve introduced BTF typecast feature for eprobe[1].
This series extends it and add more options:
1. Expanding BTF typecast to kprobe and fprobe.
(currently only function entry/exit)
2. Introduce container_of like typecast. This adds a "assigned
member" option to the typecast.
(STRUCT,MEMBER)VAR->ANOTHER_MEMBER
This casts VAR to STRUCT type but the VAR is as the address
of STRUCT.MEMBER. In C, it is:
container_of(VAR, STRUCT, MEMBER)->ANOTHER_MEMBER
3. Support nested typecast, e.g.
(STRUCT)((STRUCT2)VAR->MEMBER2)->MEMBER
the nest level must be smaller than 3.
4. Add $current variable to point "current" task_struct.
This is useful with typecast, e.g.
(task_struct)$current->pid
5. per-cpu dereference support.
Intrdouce this_cpu_read(VAR) and this_cpu_ptr(VAR) to
access per-cpu data on the current CPU (accessing other CPU
data is not stable, because it can be changed.)
You can access the member of per-cpu data structure using
typecast like:
(STRUCT)this_cpu_ptr(VAR)->MEMBER
And added a test script to test part of them.
Thanks,
---
Masami Hiramatsu (Google) (7):
tracing/events: Fix to check the simple_tsk_fn creation
tracing/probes: Support typecast for various probe events
tracing/probes: Support nested typecast
tracing/probes: Support field specifier option for typecast
tracing/probes: Add $current variable support
tracing/probes: Add this_cpu_read() and this_cpu_ptr() dereference method to fetcharg
tracing/probes: Add a new testcase for BTF typecasts
Documentation/trace/eprobetrace.rst | 9
Documentation/trace/fprobetrace.rst | 10
Documentation/trace/kprobetrace.rst | 11 +
kernel/trace/trace.c | 8
kernel/trace/trace_probe.c | 439 +++++++++++++++-----
kernel/trace/trace_probe.h | 17 +
kernel/trace/trace_probe_tmpl.h | 25 +
samples/trace_events/trace-events-sample.c | 44 ++
samples/trace_events/trace-events-sample.h | 34 +-
.../ftrace/test.d/dynevent/btf_probe_event.tc | 51 ++
.../ftrace/test.d/dynevent/fprobe_syntax_errors.tc | 11 +
.../ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 11 +
.../ftrace/test.d/kprobe/uprobe_syntax_errors.tc | 5
13 files changed, 559 insertions(+), 116 deletions(-)
create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* Re: [PATCH v3 01/12] x86/resctrl: Support Privilege-Level Zero Association (PLZA)
From: Reinette Chatre @ 2026-06-17 0:00 UTC (permalink / raw)
To: Moger, Babu, Babu Moger, corbet, tony.luck, Dave.Martin,
james.morse, tglx, bp, dave.hansen
Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
linux-kernel, eranian, peternewman
In-Reply-To: <a737ae9e-9cbc-46bb-b565-0b888e69f0ea@amd.com>
Hi Babu,
On 6/12/26 9:56 AM, Moger, Babu wrote:
> Hi Reinette,
>
> On 6/11/2026 6:23 PM, Reinette Chatre wrote:
>> Hi Babu,
>>
>> On 4/30/26 4:24 PM, Babu Moger wrote:
>>> Customers have identified an issue while using the QoS resource Control
>>
>> "Control" -> "control"?
>>
>
> ack
>
>>> feature. If a memory bandwidth associated with a CLOSID is aggressively
>>
>> "a memory bandwidth" -> "memory bandwidth"?
>
> ack.
>
>>
>>> throttled, and it moves into Kernel mode, the Kernel operations are also
>>
>> What does "it" refer to here? From text it seems to be the "CLOSID" but that
>> does not sound right? Should "it" instead be something like "a task with that
>> CLOSID"?
>
> sure.
>
>>
>> "Kernel" -> "kernel"?
>
> ack.
>>
>>> aggressively throttled. This can stall forward progress and eventually
>>> degrade overall system performance. AMD hardware supports a feature
>>> Privilege-Level Zero Association (PLZA) to change the association of the
>>> thread as soon as it begins executing.
>>
>> "change the association of the thread as soon as it begins executing." I am
>> not able to parse this.
>
> How about ?
>
> Customers have identified an issue while using the QoS resource Control
> feature. If memory bandwidth associated with a CLOSID is aggressively
> throttled, and a task with that CLOSID moves into kernel mode, the kernel operations are also aggressively throttled. This can stall forward progress and eventually degrade overall system performance.
> AMD hardware supports a feature Privilege-Level Zero Association (PLZA)
> to change the CPU association at the user-to-kernel transition, so the kernel execution can use a different association than user mode.
"change the CPU association at the user-to-kernel transition" -> What is this
trying to describe? CPU association of what?
"a different association"? What does this mean?
>
> Privilege-Level Zero Association (PLZA) allows the user to specify a> CLOSID and/or RMID associated with execution in Privilege-Level
> Zero. When enabled on a CPU, as the CPU enters Privilege-Level Zero,
> allocation and monitoring for that CPU will be associated with the
> PLZA CLOSID and/or RMID. Otherwise, the CPU will be associated with
> the CLOSID and RMID given by PQR_ASSOC.
Sounds like this is vague because MSR_IA32_PQR_PLZA_ASSOC has not been
introduced yet. Could it help to introduce MSR_IA32_PQR_PLZA_ASSOC as
part of this patch and then the changelog can be specific about PLZA
feature introducing this new MSR and how it complements MSR_IA32_PQR_ASSOC?
...
>>> Documentation/admin-guide/kernel-parameters.txt | 2 +-
>>> arch/x86/include/asm/cpufeatures.h | 1 +
>>> arch/x86/kernel/cpu/resctrl/core.c | 2 ++
>>> arch/x86/kernel/cpu/scattered.c | 1 +
>>
>> Please split changes to other subsystems and make these changes
>> obvious with their own subject prefix to avoid sneaking changes into
>> other subsystems via resctrl.
>>
>
> Ok. Will be two patches.
> 1. For Documentation/admin-guide/kernel-parameters.txt
> 2. arch/x86/include/asm/cpufeatures.h
> arch/x86/kernel/cpu/resctrl/core.c
> arch/x86/kernel/cpu/scattered.c
The resctrl changes found in (2) would be documented in (1)? That does not
look right. Why not just split the resctrl changes from the cpufeatures changes?
This would be similar to how you did ABMC enabling.
Reinette
^ permalink raw reply
* Re: [PATCH v3 09/12] fs/resctrl: Reset kernel-mode binding when its rdtgroup goes away
From: Reinette Chatre @ 2026-06-16 23:42 UTC (permalink / raw)
To: Babu Moger, corbet, tony.luck, Dave.Martin, james.morse, tglx, bp,
dave.hansen
Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
linux-kernel, eranian, peternewman, sos-linux-ext-patches
In-Reply-To: <280912ae2d2ee068fe5ec94aaf7e6e3f4e1c68b6.1777591497.git.babu.moger@amd.com>
Hi Babu,
On 4/30/26 4:24 PM, Babu Moger wrote:
> resctrl_kcfg.k_rdtgrp records which rdtgroup currently owns the kernel
> CLOSID/RMID, but nothing cleared that snapshot when the group was
> removed. rmdir of a control or monitor group, or unmount of the
> resctrl filesystem, left kernel mode enabled on the CPUs the group
> covered and left k_rdtgrp pointing at freed memory; the next read or write of
> info/kernel_mode would dereference a stale rdtgroup under rdtgroup_mutex.
Please do not word the enabling as bugfixes.
>
> Add rdtgroup_config_kmode_delete() as the disable counterpart of
> rdtgroup_config_kmode(). It clears the kernel-mode binding on the
> group's kmode_cpu_mask (or all online CPUs when that mask is empty),
> drops the per-group kmode/kmode_cpu_mask bookkeeping, and if
> @rdtgrp was the bound, resets resctrl_kcfg to &rdtgroup_default,
> BIT(INHERIT_CTRL_AND_MON)) so subsequent sysfs operations resolve
> to a live group.
Could you please reword these code descriptions to describe why this
patch is needed?
>
> Call it from rdtgroup_rmdir_mon(), rdtgroup_rmdir_ctrl(), and
> resctrl_fs_teardown(); each call site is gated on rdtgrp->kmode so
> groups that never participated in kernel mode pay nothing.
Does this handle the non-default resource groups removed on unmount?
(see rmdir_all_sub() called from resctrl_fs_teardown())
(please refer to earlier comments that apply to this patch also)
Reinette
^ permalink raw reply
* Re: [PATCH v3 08/12] fs/resctrl: Make info/kernel_mode writable and identify the bound group
From: Reinette Chatre @ 2026-06-16 23:42 UTC (permalink / raw)
To: Babu Moger, corbet, tony.luck, Dave.Martin, james.morse, tglx, bp,
dave.hansen
Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
linux-kernel, eranian, peternewman
In-Reply-To: <768d4b603542f3202ece4294c808dbbf1a8e3008.1777591497.git.babu.moger@amd.com>
Hi Babu,
On 4/30/26 4:24 PM, Babu Moger wrote:
> info/kernel_mode lists the kernel-mode CLOSID/RMID policies the kernel
(also here please drop the x86 specific details and consider the resctrl
fs changes to be valid from MPAM perspective also)
> supports and the one currently active, but user space has no way to
> switch policies or rebind to a different rdtgroup, and the file does
> not name the group that owns the kernel CLOSID/RMID.
This adds a new feature. No need to describe this change as a bugfix.
>
> Make info/kernel_mode writable. The format used by both read and
> write is one line per mode:
This sounds like multiple modes can be written to the file as long as they
are separated by newline? I do not think it should be needed to support
write of more than one mode at a time.
>
> inherit_ctrl_and_mon:group=none
> [global_assign_ctrl_inherit_mon_per_cpu:group=g1//]
> global_assign_ctrl_assign_mon_per_cpu:group=none
>
> The active mode is wrapped in "[...]" and ":group=<ctrl>/<mon>/" names
> the bound rdtgroup ("//" for the default control group). Inactive
> modes report ":group=none". Documented in
> Documentation/filesystems/resctrl.rst.
Above describes the output of the file. This changelog can just focus on
what needs to be supported when user space writes to the file.
>
> The write path strims input, strips the optional "[...]", validates
strims?
Wait, why support the brackets as input? This seems unnecessary.
> the mode against resctrl_kcfg.kmode, and resolves the optional
> ":group=" suffix via the new helper rdtgroup_by_kmode_path(). An
> omitted suffix or an INHERIT-mode write binds to the default group.
> On success, rdtgroup_config_kmode_clear() tears down the previous
> binding and rdtgroup_config_kmode() programs the new one before
> resctrl_kcfg.k_rdtgrp and resctrl_kcfg.kmode_cur are updated under
> rdtgroup_mutex. Allocation failures in the helpers are propagated so
> the write fails atomically.
This also reads like it just describes the code.
>
> Add struct rdtgroup fields kmode and kmode_cpu_mask to track the
> per-group binding.
Please do not just describe the code but *why* this change is needed and
what it means and how it is used.
>
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
> v3: New patch to handle the changed interface file info/kernel_mode.
> ---
> Documentation/filesystems/resctrl.rst | 51 ++++
> fs/resctrl/internal.h | 6 +
> fs/resctrl/rdtgroup.c | 375 +++++++++++++++++++++++++-
> 3 files changed, 431 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst
> index b003bed339fd..89fbf8b4fb2a 100644
> --- a/Documentation/filesystems/resctrl.rst
> +++ b/Documentation/filesystems/resctrl.rst
> @@ -522,6 +522,57 @@ conveyed in the error returns from file operations. E.g.
> # cat info/last_cmd_status
> mask f7 has non-consecutive 1-bits
>
> +"kernel_mode":
(dropping the documentation here since I believe earlier comments apply)
...
> diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
> index 1a9b29119f88..9435ce663f54 100644
> --- a/fs/resctrl/internal.h
> +++ b/fs/resctrl/internal.h
> @@ -216,6 +216,10 @@ struct mongroup {
> * @mon: mongroup related data
> * @mode: mode of resource group
> * @mba_mbps_event: input monitoring event id when mba_sc is enabled
> + * @kmode: true if this group is currently bound as the kernel-mode
> + * CLOSID/RMID owner (resctrl_kcfg.k_rdtgrp)
(drop CLOSID/RMID)
> + * @kmode_cpu_mask: CPUs scoped for this group's kernel-mode binding;
> + * when empty, all online CPUs are used
Why does "empty" signify "all online CPUs"? This complicates implementation and
creates different interface from existing CPUs interface of resource groups.
> * @plr: pseudo-locked region
> */
> struct rdtgroup {
> @@ -229,6 +233,8 @@ struct rdtgroup {
> struct mongroup mon;
> enum rdtgrp_mode mode;
> enum resctrl_event_id mba_mbps_event;
> + bool kmode;
> + struct cpumask kmode_cpu_mask;
> struct pseudo_lock_region *plr;
> };
>
> diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
> index 9cdcfa64c4a2..5383b4eb23ed 100644
> --- a/fs/resctrl/rdtgroup.c
> +++ b/fs/resctrl/rdtgroup.c
> @@ -1055,6 +1055,378 @@ static int resctrl_kernel_mode_show(struct kernfs_open_file *of,
> return 0;
> }
>
> +/**
> + * rdtgroup_config_kmode() - Push @rdtgrp's kernel CLOSID/RMID to hardware
> + * @rdtgrp: Resctrl group whose CLOSID/RMID should be programmed.
> + *
> + * Derives CLOSID/RMID from @rdtgrp->type:
> + * - RDTMON_GROUP: parent control group's CLOSID with the monitor group's RMID.
This seem unnecessary since when a monitor group is created it's closid is inherited
from it's control group?
> + * - RDTCTRL_GROUP: the control group's own CLOSID and default RMID.
> + *
> + * Calls resctrl_arch_configure_kmode() with the kernel-mode binding enabled
> + * on the online subset of @rdtgrp->kmode_cpu_mask (or all online CPUs when
> + * that mask is empty), and disabled on the complementary online CPUs so
> + * stale enable bits from a previously bound group are cleared in the same
> + * reprogram step. The caller (resctrl_kernel_mode_write()) is responsible
> + * for validating that the (kmode, group type) pair is permitted before
> + * invoking this helper.
> + *
> + * Context: Caller must hold rdtgroup_mutex.
Please use lockdep_assert_held(&rdtgroup_mutex) instead. See "Documenting locking requirements"
in Documentation/process/maintainer-tip.rst
> + *
> + * Return: 0 on success, -EINVAL for a pseudo-locked group, -ENOMEM if
> + * cpumask allocation fails.
> + */
> +static int rdtgroup_config_kmode(struct rdtgroup *rdtgrp)
> +{
> + cpumask_var_t enable_mask, disable_mask;
> + u32 closid, rmid;
> + bool need_disable;
(needs reverse fir)
> +
> + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
> + rdt_last_cmd_puts("Resource group is pseudo-locked\n");
> + return -EINVAL;
> + }
> +
> + if (!zalloc_cpumask_var(&enable_mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + need_disable = !cpumask_empty(&rdtgrp->kmode_cpu_mask);
As I understand rdtgroup_config_kmode() is called when the kernel mode is switched.
Also, earlier patches made it explicit that "Default scope is all online CPUs".
It is not clear to me how kmode_cpu_mask is initialized here ... it almost seems as though
if a resource was associated with a mode at some point and received some CPU changes then
when the mode switches between some other resource groups and then back to the original
then the old cpu_mask will be used on the mode switch. Should the resource group's cpu_mask
not be re-initialized to all online CPUs? If done then all of this cpu_mask wrangling seems
unnecessary to me, just use all online CPUs?
> + if (need_disable && !zalloc_cpumask_var(&disable_mask, GFP_KERNEL)) {
> + free_cpumask_var(enable_mask);
> + return -ENOMEM;
> + }
> +
> + if (rdtgrp->type == RDTMON_GROUP) {
> + closid = rdtgrp->mon.parent->closid;
> + rmid = rdtgrp->mon.rmid;
> + } else {
> + closid = rdtgrp->closid;
> + rmid = rdtgrp->mon.rmid;
> + }
Considering MON group inherits the CLOSID if its parent, can above be simplified
to just be?
closid = rdtgrp->closid;
rmid = rdtgrp->mon.rmid;
> +
> + /*
> + * Empty kmode_cpu_mask: enable on every online CPU. Otherwise enable
> + * only CPUs in the group mask and explicitly clear on other online CPUs
> + * so a previously bound group's enable bits don't linger.
> + */
> + if (!need_disable) {
> + cpumask_copy(enable_mask, cpu_online_mask);
> + } else {
> + cpumask_copy(enable_mask, &rdtgrp->kmode_cpu_mask);
> + cpumask_andnot(disable_mask, cpu_online_mask, &rdtgrp->kmode_cpu_mask);
> + }
> +
> + if (!cpumask_empty(enable_mask))
> + resctrl_arch_configure_kmode(enable_mask, closid, rmid, true);
> +
> + if (need_disable && !cpumask_empty(disable_mask))
> + resctrl_arch_configure_kmode(disable_mask, closid, rmid, false);
> +
> + rdtgrp->kmode = true;
> +
> + free_cpumask_var(enable_mask);
> + if (need_disable)
> + free_cpumask_var(disable_mask);
> +
> + return 0;
> +}
> +
> +/**
> + * rdtgroup_config_kmode_clear() - Tear down the kernel-mode binding on @rdtgrp
> + * @rdtgrp: Resctrl group whose kernel-mode binding is being released.
> + * May be %NULL when no group is currently bound, in which case
> + * this is a no-op.
> + * @kmode: Kernel-mode policy currently active on @rdtgrp, as a
> + * BIT(&enum resctrl_kernel_modes) value. When this is
> + * BIT(INHERIT_CTRL_AND_MON) the hardware tear-down is skipped
> + * because no MSR was previously programmed.
> + *
> + * Disables the kernel-mode binding on the CPUs @rdtgrp covers (its
> + * @kmode_cpu_mask, or all online CPUs when that mask is empty) and resets
> + * the per-group bookkeeping (@kmode and @kmode_cpu_mask). This is the
> + * disable counterpart of rdtgroup_config_kmode() and exists so that a write
> + * that transitions the active mode to BIT(INHERIT_CTRL_AND_MON) -- which
> + * skips rdtgroup_config_kmode() entirely -- still tears down the previously
> + * bound group instead of leaving stale enable bits behind.
> + *
> + * On allocation failure the function returns -ENOMEM and leaves both the
> + * hardware state and @rdtgrp's bookkeeping unchanged so the caller can fail
> + * the operation atomically and last_cmd_status reflects reality.
> + *
> + * Context: Caller must hold rdtgroup_mutex.
> + *
> + * Return: 0 on success (including the @rdtgrp == %NULL and INHERIT cases),
> + * -ENOMEM if cpumask allocation fails.
> + */
> +static int rdtgroup_config_kmode_clear(struct rdtgroup *rdtgrp, int kmode)
> +{
> + cpumask_var_t disable_mask;
> + u32 closid, rmid;
> +
> + if (!rdtgrp)
> + return 0;
> +
> + if (kmode == BIT(INHERIT_CTRL_AND_MON))
> + goto out_clear;
> +
> + if (!zalloc_cpumask_var(&disable_mask, GFP_KERNEL))
> + return -ENOMEM;
> +
> + if (rdtgrp->type == RDTMON_GROUP) {
> + closid = rdtgrp->mon.parent->closid;
> + rmid = rdtgrp->mon.rmid;
> + } else {
> + closid = rdtgrp->closid;
> + rmid = rdtgrp->mon.rmid;
> + }
Same comment as above ... but actually, why is closid/rmid needed at all? This
function is intended to *reset* the kernel mode so needing a valid/active closid and
rmid does not look right.
> +
> + if (cpumask_empty(&rdtgrp->kmode_cpu_mask))
> + cpumask_copy(disable_mask, cpu_online_mask);
> + else
> + cpumask_copy(disable_mask, &rdtgrp->kmode_cpu_mask);
Having kmode_cpu_mask accurately reflect the online CPUs will simplify this to
not need any of this wrangling and kmode_cpu_mask can just be used directly.
> +
> + resctrl_arch_configure_kmode(disable_mask, closid, rmid, false);
> + free_cpumask_var(disable_mask);
> +
> +out_clear:
> + cpumask_clear(&rdtgrp->kmode_cpu_mask);
> + rdtgrp->kmode = false;
> + return 0;
> +}
> +
> +/**
> + * rdtgroup_by_kmode_path() - Resolve a "<ctrl>/<mon>/" path to an rdtgroup
> + * @ctrl_name: Control-group name, or "" for the default control group.
> + * @mon_name: Monitor-group name, or "" to select the control group itself.
> + *
> + * Matches the path syntax emitted by resctrl_kernel_mode_show():
> + * "//" - the default control group
> + * "<ctrl>//" - control group @ctrl_name
> + * "/<mon>/" - monitor group @mon_name under the default control group
> + * "<ctrl>/<mon>/" - monitor group @mon_name under control group @ctrl_name
> + *
> + * Context: Caller must hold rdtgroup_mutex.
(lockdep)
> + *
> + * Return: Pointer to the matching rdtgroup, &rdtgroup_default when both
> + * names are empty (the show form "//"), or NULL if no such group exists.
> + */
> +static struct rdtgroup *rdtgroup_by_kmode_path(const char *ctrl_name,
> + const char *mon_name)
> +{
> + struct rdtgroup *rdtg, *parent = NULL, *crg;
> +
> + /* Show emits "//" for the default control group; round-trip it here. */
> + if (!*ctrl_name && !*mon_name)
> + return &rdtgroup_default;
> +
> + /* Control-group-only form: "<ctrl>//". */
> + if (!*mon_name) {
> + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
> + if (rdtg->type != RDTCTRL_GROUP)
> + continue;
> + if (!strcmp(rdt_kn_name(rdtg->kn), ctrl_name))
> + return rdtg;
> + }
> + return NULL;
> + }
> +
> + /* Monitor-group form: locate the parent control group first. */
> + if (!*ctrl_name) {
> + parent = &rdtgroup_default;
> + } else {
> + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
> + if (rdtg->type != RDTCTRL_GROUP)
> + continue;
> + if (!strcmp(rdt_kn_name(rdtg->kn), ctrl_name)) {
> + parent = rdtg;
> + break;
> + }
> + }
> + if (!parent)
> + return NULL;
> + }
> +
> + list_for_each_entry(crg, &parent->mon.crdtgrp_list, mon.crdtgrp_list)
> + if (!strcmp(rdt_kn_name(crg->kn), mon_name))
> + return crg;
> + return NULL;
> +}
> +
> +/**
> + * resctrl_kernel_mode_write() - Select kernel mode and bind group via info/kernel_mode
> + * @of: kernfs file handle.
> + * @buf: One line in the same format emitted by resctrl_kernel_mode_show(),
> + * i.e. "<mode>[:group=<ctrl>/<mon>/]" with an optional surrounding
> + * "[...]"; must end with a newline. The ":group=<spec>" suffix is
> + * optional -- when omitted the default control group
> + * (&rdtgroup_default) is used.
> + * @nbytes: Length of @buf.
> + * @off: File offset (unused).
> + *
> + * Parses @buf, validates that <mode> is listed in resctrl_mode_str[] and is
> + * supported by the platform (resctrl_kcfg.kmode), resolves <ctrl>/<mon>/ to
> + * an existing rdtgroup (or picks &rdtgroup_default if no group was specified
> + * or if the new mode is INHERIT), clears any previous binding via
> + * rdtgroup_config_kmode_clear(), programs hardware via
> + * rdtgroup_config_kmode() when @kmode is not BIT(INHERIT_CTRL_AND_MON), and
> + * on success updates resctrl_kcfg.k_rdtgrp and resctrl_kcfg.kmode_cur. The
> + * display-only "group=none" form produced by show for inactive modes is
> + * rejected. Errors are reported in last_cmd_status.
> + *
> + * Return: @nbytes on success, negative errno with last_cmd_status set on error.
> + */
> +static ssize_t resctrl_kernel_mode_write(struct kernfs_open_file *of,
> + char *buf, size_t nbytes, loff_t off)
> +{
> + char *mode_str, *group_str, *slash;
> + const char *ctrl_name, *mon_name;
> + struct rdtgroup *rdtgrp;
> + int ret = 0;
> + size_t len;
> + u32 kmode;
> + int i;
> +
> + if (nbytes == 0 || buf[nbytes - 1] != '\n')
> + return -EINVAL;
> + buf[nbytes - 1] = '\0';
> +
> + /* Tolerate surrounding whitespace before the bracket/mode parsing. */
> + buf = strim(buf);
> + len = strlen(buf);
> +
> + /* Strip the optional "[...]" that show uses to mark the active line. */
> + if (len >= 2 && buf[0] == '[' && buf[len - 1] == ']') {
> + buf[len - 1] = '\0';
> + buf++;
> + len -= 2;
> + }
I do not think the brackets should be valid input.
> +
> + /*
> + * Split "<mode>:group=<spec>"; the ":group=<spec>" suffix is optional
> + * and when omitted the default control group (&rdtgroup_default) is used.
> + */
> + group_str = strstr(buf, ":group=");
> + if (group_str) {
> + *group_str = '\0';
> + group_str += strlen(":group=");
> + }
> + mode_str = buf;
> +
> + mutex_lock(&rdtgroup_mutex);
> + rdt_last_cmd_clear();
> +
> + for (i = 0; i < RESCTRL_NUM_KERNEL_MODES; i++)
> + if (!strcmp(mode_str, resctrl_mode_str[i]))
> + break;
> + if (i == RESCTRL_NUM_KERNEL_MODES) {
> + rdt_last_cmd_puts("Unknown kernel mode\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + if (!(resctrl_kcfg.kmode & BIT(i))) {
> + rdt_last_cmd_puts("Kernel mode not available\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + kmode = BIT(i);
Can kmode be of enum type to be assigned the actual enum value to avoid all these BIT(enum value) usages?
> +
> + if (!group_str) {
> + /* No ":group=" suffix: fall back to the default control group. */
> + rdtgrp = &rdtgroup_default;
> + } else if (!strcmp(group_str, "none")) {
> + /* Display-only placeholder emitted by show; not selectable. */
> + rdt_last_cmd_puts("Cannot bind to 'none' group\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + } else {
> + /* Require exactly "<ctrl>/<mon>/" - two '/' with the second terminating. */
User should not be expected to provide monitor group when the monitoring is inherited.
> + slash = strchr(group_str, '/');
> + if (!slash) {
> + rdt_last_cmd_puts("Group must be <ctrl>/<mon>/\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> + *slash = '\0';
> + ctrl_name = group_str;
> + mon_name = slash + 1;
> + slash = strchr(mon_name, '/');
> + if (!slash || slash[1] != '\0') {
> + rdt_last_cmd_puts("Group must be <ctrl>/<mon>/\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> + *slash = '\0';
> +
> + rdtgrp = rdtgroup_by_kmode_path(ctrl_name, mon_name);
> + if (!rdtgrp) {
> + rdt_last_cmd_puts("Group not found\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> + }
> +
> + /*
> + * INHERIT mode binds nothing; force the bound group to the default so
> + * round-trips with show (which prints "group=//") are stable and any
> + * user-supplied :group= suffix is silently normalised.
> + */
> + if (kmode == BIT(INHERIT_CTRL_AND_MON))
> + rdtgrp = &rdtgroup_default;
rdtgrp = NULL ?
> +
> + /* No-op if the same mode is already active on the same group. */
> + if (resctrl_kcfg.kmode_cur == kmode && resctrl_kcfg.k_rdtgrp == rdtgrp)
> + goto out_unlock;
> +
> + /*
> + * global_assign_ctrl_assign_mon_per_cpu binds one CLOSID and RMID for
> + * all kernel work (Documentation/filesystems/resctrl.rst uses
> + * "<ctrl>/<mon>/", i.e. an RDTMON_GROUP).
> + *
> + * global_assign_ctrl_inherit_mon_per_cpu assigns one CLOSID globally
> + * while leaving RMID inheritance to user contexts; that uses the
> + * control group's CLOSID slot only, i.e. an RDTCTRL_GROUP.
> + */
> + if (kmode == BIT(GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU) &&
> + rdtgrp->type != RDTMON_GROUP) {
> + rdt_last_cmd_puts("global_assign_ctrl_assign_mon_per_cpu requires a monitor group\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> + if (kmode == BIT(GLOBAL_ASSIGN_CTRL_INHERIT_MON_PER_CPU) &&
> + rdtgrp->type != RDTCTRL_GROUP) {
> + rdt_last_cmd_puts("global_assign_ctrl_inherit_mon_per_cpu requires a control group\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + /* Switching to a different group: release the old binding first. */
> + if (resctrl_kcfg.k_rdtgrp != rdtgrp) {
> + ret = rdtgroup_config_kmode_clear(resctrl_kcfg.k_rdtgrp,
> + resctrl_kcfg.kmode_cur);
> + if (ret) {
> + rdt_last_cmd_puts("Failed to release previous kernel-mode binding\n");
> + goto out_unlock;
> + }
> + }
> +
> + if (kmode != BIT(INHERIT_CTRL_AND_MON)) {
> + ret = rdtgroup_config_kmode(rdtgrp);
> + if (ret) {
> + rdt_last_cmd_puts("Kernel mode change failed\n");
If it fails here then previous binding was released successfully but new binding failed. What is
state of system?
> + goto out_unlock;
> + }
> + }
> +
> + resctrl_kcfg.k_rdtgrp = rdtgrp;
> + resctrl_kcfg.kmode_cur = kmode;
> +
> +out_unlock:
> + mutex_unlock(&rdtgroup_mutex);
> + return ret ?: nbytes;
> +}
> +
> void *rdt_kn_parent_priv(struct kernfs_node *kn)
> {
> /*
> @@ -1960,9 +2332,10 @@ static struct rftype res_common_files[] = {
> },
> {
> .name = "kernel_mode",
> - .mode = 0444,
> + .mode = 0644,
> .kf_ops = &rdtgroup_kf_single_ops,
> .seq_show = resctrl_kernel_mode_show,
> + .write = resctrl_kernel_mode_write,
> .fflags = RFTYPE_TOP_INFO,
> },
> {
Reinette
^ permalink raw reply
* Re: [PATCH v3 07/12] fs/resctrl: Add info/kernel_mode for kernel-mode policy introspection
From: Reinette Chatre @ 2026-06-16 23:38 UTC (permalink / raw)
To: Babu Moger, corbet, tony.luck, Dave.Martin, james.morse, tglx, bp,
dave.hansen
Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
linux-kernel, eranian, peternewman
In-Reply-To: <549bf0fadf1cedb5938599be58e53b7464c939b5.1777591497.git.babu.moger@amd.com>
Hi Babu,
How should "introspection" as used in subject be interpreted? This just
displays the supported and active kernel modes to user space, no?
On 4/30/26 4:24 PM, Babu Moger wrote:
> There is no user-visible way today to see which kernel-mode CLOSID/RMID
> policies the running kernel supports, which one is active, or which
> resctrl group currently owns the kernel CLOSID/RMID.
Why should there be? This is a new feature being added in this series.
No need to write this as a bugfix.
>
> Add a read-only top-level sysfs file, info/kernel_mode. It emits one
> line per mode advertised in resctrl_kcfg.kmode, in stable lowercase
> spelling derived from enum resctrl_kernel_modes, e.g.:
All these changelogs feel so strange ... as though they are written by
somebody who simultaneously has no and full knowledge of resctrl.
These verbatim descriptions of what the code does is not necessary. Please
start with why the patch is needed.
>
> [inherit_ctrl_and_mon:group=//]
This is unexpected. There should be no group associated with this default mode.
This is how I interpreted our previous discussion ending:
https://lore.kernel.org/lkml/6709398b-269d-47b5-9b41-084f410bb1a6@amd.com/
> global_assign_ctrl_inherit_mon_per_cpu:group=none
> global_assign_ctrl_assign_mon_per_cpu:group=none
>
> The effective policy (resctrl_kcfg.kmode_cur) is wrapped in square
(needs imperative - please check all changelogs)
> brackets and its :group= suffix names the resctrl group currently
> bound to the kernel CLOSID/RMID (resctrl_kcfg.k_rdtgrp), formatted as
> <ctrl>/<mon>/ with empty components left blank. Inactive modes are
> reported as :group=none.
>
> rdtgroup_mutex is held while printing, matching other info/ show paths.
No need to describe details that can be seen from patch.
>
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
> v3: New patch to handle the changed interface file info/kernel_mode.
> Changed the group name to "none" if kmode binding is not done.
> Reinette suggested "uninitialized". "none" seemed more relevent.
> ---
> fs/resctrl/rdtgroup.c | 74 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 74 insertions(+)
>
> diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
> index a7bfc74897cc..9cdcfa64c4a2 100644
> --- a/fs/resctrl/rdtgroup.c
> +++ b/fs/resctrl/rdtgroup.c
> @@ -988,6 +988,73 @@ static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
> return 0;
> }
>
> +/* Sysfs lines for info/kernel_mode; indexed by &enum resctrl_kernel_modes */
> +static const char * const resctrl_mode_str[] = {
> + [INHERIT_CTRL_AND_MON] = "inherit_ctrl_and_mon",
> + [GLOBAL_ASSIGN_CTRL_INHERIT_MON_PER_CPU] = "global_assign_ctrl_inherit_mon_per_cpu",
> + [GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU] = "global_assign_ctrl_assign_mon_per_cpu",
Please make alignment consistent.
> +};
> +
> +static_assert(ARRAY_SIZE(resctrl_mode_str) == RESCTRL_NUM_KERNEL_MODES);
> +
> +/**
> + * resctrl_kernel_mode_show() - Enumerate supported and effective kernel-mode policies
"Enumerate" -> "Display"?
> + * @of: kernfs open file
> + * @seq: output seq_file
> + * @v: unused
> + *
> + * Emits one line per mode advertised in resctrl_kcfg.kmode (each mode is one
> + * BIT(index) per &enum resctrl_kernel_modes). Every line carries a
Above is clear from the code. Please instead describe what this means.
> + * ":group=<name>" suffix:
> + *
> + * - The effective policy (whose BIT matches resctrl_kcfg.kmode_cur) is
> + * wrapped in square brackets and <name> is the resctrl group that
> + * currently owns the kernel CLOSID/RMID (resctrl_kcfg.k_rdtgrp),
> + * formatted as "<ctrl>/<mon>/". A component is left empty when it
> + * does not apply: an RDTCTRL_GROUP emits "<ctrl>//", an RDTMON_GROUP
> + * under the default control group emits "/<mon>/", and an RDTMON_GROUP
> + * under a named control group emits "<ctrl>/<mon>/".
> + *
> + * - Other supported but inactive modes are emitted without brackets and
> + * <name> is reported as "none".
> + *
> + * Context: Called under rdtgroup_mutex like other resctrl sysfs show paths.
This does not look accurate since it is not called with mutex held but instead
takes the mutex itself. Also no need to refer to what other code does.
> + */
> +static int resctrl_kernel_mode_show(struct kernfs_open_file *of,
> + struct seq_file *seq, void *v)
> +{
> + struct rdtgroup *rdtgrp;
> + const char *ctrl, *mon;
> + int i;
> +
> + mutex_lock(&rdtgroup_mutex);
> + for (i = 0; i < RESCTRL_NUM_KERNEL_MODES; i++) {
> + if (!(resctrl_kcfg.kmode & BIT(i)))
> + continue;
> +
> + if (resctrl_kcfg.kmode_cur != BIT(i)) {
> + seq_printf(seq, "%s:group=none\n",
> + resctrl_mode_str[i]);
> + continue;
> + }
> +
> + rdtgrp = resctrl_kcfg.k_rdtgrp;
> + ctrl = "";
> + mon = "";
> + if (rdtgrp->type == RDTMON_GROUP) {
> + if (rdtgrp->mon.parent != &rdtgroup_default)
> + ctrl = rdtgrp->mon.parent->kn->name;
Isn't default group's kn->name is initialized correctly via
rdtgroup_setup_root()->kernfs_create_root()->__kernfs_new_node(root, NULL, "", ...) ?
> + mon = rdtgrp->kn->name;
> + } else {
> + ctrl = rdtgrp->kn->name;
> + }
Can the names not just be initialized directly from kn->name?
> + seq_printf(seq, "[%s:group=%s/%s/]\n",
> + resctrl_mode_str[i], ctrl, mon);
This is not where I understood our discussion landed. I expected that the display will
reflect what can/should be assigned in a mode. For example, mode "inherit_ctrl_and_mon"
does not have an associated resource group and should thus not display one,
"global_assign_ctrl_inherit_mon_per_cpu" can only be assigned a control group and
should thus not display a monitor group also.
> + }
> + mutex_unlock(&rdtgroup_mutex);
> + return 0;
> +}
> +
> void *rdt_kn_parent_priv(struct kernfs_node *kn)
> {
> /*
> @@ -1891,6 +1958,13 @@ static struct rftype res_common_files[] = {
> .seq_show = rdt_last_cmd_status_show,
> .fflags = RFTYPE_TOP_INFO,
> },
> + {
> + .name = "kernel_mode",
> + .mode = 0444,
> + .kf_ops = &rdtgroup_kf_single_ops,
> + .seq_show = resctrl_kernel_mode_show,
> + .fflags = RFTYPE_TOP_INFO,
> + },
> {
> .name = "mbm_assign_on_mkdir",
> .mode = 0644,
Reinette
^ permalink raw reply
* Re: [PATCH v3 06/12] fs/resctrl: Initialize the global kernel-mode policy at subsystem init
From: Reinette Chatre @ 2026-06-16 23:36 UTC (permalink / raw)
To: Babu Moger, corbet, tony.luck, Dave.Martin, james.morse, tglx, bp,
dave.hansen
Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
linux-kernel, eranian, peternewman, sos-linux-ext-patches
In-Reply-To: <38f794ae4076a3c118e8eda08ae2bc1e69eba979.1777591497.git.babu.moger@amd.com>
Hi Babu,
On 4/30/26 4:24 PM, Babu Moger wrote:
> kernel_mode feature needs to add the interface that lets user space
> choose between INHERIT_CTRL_AND_MON, GLOBAL_ASSIGN_CTRL_INHERIT_MON_PER_CPU
> and GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU. Both the generic resctrl
> code and the architecture layer need a single shared snapshot of the
> supported and effective policy plus the resource group that backs the
> global-assign modes; that snapshot is struct resctrl_kmode_cfg.
This does not seem to match implementation since this implementation does
not actually share struct resctrl_kmode_cfg as described above. Only
resctrl_arch_get_kmode_support() exchanges this struct between fs and
arch and as already mentioned that usage looks unnecessary. The other
arch/fs touch points use either individual members or their properties
(like closid/rmid).
As described in response to previous patch I think this can be simplified
while also making it more robust.
>
> Add the file-local resctrl_kcfg and a helper resctrl_kmode_init() that:
>
> - Adds kmode and kmode_cur with BIT(INHERIT_CTRL_AND_MON), the
> universally supported mode and today's behaviour;
> - points k_rdtgrp at rdtgroup_default so global-assign modes have a
> valid backing group from boot;
If the default mode is INHERIT_CTRL_AND_MON then should the default group
not be NULL?
> - calls resctrl_arch_get_kmode_support() so each architecture ORs
> BIT(<mode>) into kmode for the policies its hardware supports
> (on x86, AMD PLZA contributes the two global-assign modes).
>
> resctrl_kmode_init() runs from resctrl_init() once the default group
resctrl_kmode_init() can be dropped after changes described in response
to previous patch. Apart from no longer being necessary I also find that
having the kernel mode fully initialized *before* the hotplug handlers run
to be simpler.
> has been set up. No user-visible behaviour changes yet; later patches
(drop "later patches ...")
> expose kmode_cur via sysfs and act on changes.
>
> Signed-off-by: Babu Moger <babu.moger@amd.com>
Reinette
^ permalink raw reply
* Re: [PATCH v3 05/12] x86/resctrl: Initialize supported kernel modes for PLZA
From: Reinette Chatre @ 2026-06-16 23:35 UTC (permalink / raw)
To: Babu Moger, corbet, tony.luck, Dave.Martin, james.morse, tglx, bp,
dave.hansen
Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
linux-kernel, eranian, peternewman
In-Reply-To: <95188117225c9235be89753edcace115cf5c2e5f.1777591497.git.babu.moger@amd.com>
Hi Babu,
On 4/30/26 4:24 PM, Babu Moger wrote:
> Resctrl subsystem tracks which kernel-mode CLOSID/RMID policies the
> platform can offer via struct resctrl_kmode_cfg and
> resctrl_arch_get_kmode_support(). AMD PLZA (Privilege Level Zero
> Association) is the x86 feature that allows kernel traffic to use an
> assigned CLOSID alone or CLOSID and RMID together.
>
> Report the available kernel-modes when x86 PLZA is enabled.
>
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
> v3: New patch to report all the supported kernel mode by arch.
> ---
> arch/x86/kernel/cpu/resctrl/core.c | 15 +++++++++++++++
> 1 file changed, 15 insertions(+)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
> index 4a8717157e3e..699d8bb82875 100644
> --- a/arch/x86/kernel/cpu/resctrl/core.c
> +++ b/arch/x86/kernel/cpu/resctrl/core.c
> @@ -894,6 +894,21 @@ bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
> }
> }
>
> +/**
> + * resctrl_arch_get_kmode_support() - x86: record which kernel-mode policies hardware supports
> + * @kcfg: Cumulative snapshot; OR bits into @kcfg->kmode (see &struct resctrl_kmode_cfg).
If this is intended to be a cumulative snapshot this is a very subtle requirement
for architectures to "do the right thing" here. To make this more robust I think it will be
simpler if resctrl fs boots with resctrl_kcfg initialized to expected defaults.
Instead of this callback resctrl can add resctrl_set_kmode_support(u32 kmodes)
that the architecture *may* use to further initialize the kmodes supported by it. This
function is implemented by resctrl fs, instead of architecture, and it can fail if
architecture does not support INHERIT_CTRL_AND_MON. This will help to keep
struct resctrl_kmode_cfg private to resctrl fs while enforcing any assumptions about
which modes are required to be supported.
> + *
> + * When PLZA is present (CPUID X86_FEATURE_PLZA), the kernel may assign a CLOSID
> + * for kernel work alone or assign CLOSID and RMID together. Advertise both
> + * assign-style modes in @kcfg->kmode using &enum resctrl_kernel_modes indices.
> + */
> +void resctrl_arch_get_kmode_support(struct resctrl_kmode_cfg *kcfg)
> +{
> + if (rdt_cpu_has(X86_FEATURE_PLZA))
> + kcfg->kmode |= BIT(GLOBAL_ASSIGN_CTRL_INHERIT_MON_PER_CPU) |
> + BIT(GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU);
> +}
> +
> static __init bool get_mem_config(void)
> {
> struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA];
Reinette
^ permalink raw reply
* Re: [PATCH v3 04/12] x86,fs/resctrl: Program PLZA through kmode arch hooks
From: Reinette Chatre @ 2026-06-16 23:33 UTC (permalink / raw)
To: Babu Moger, corbet, tony.luck, Dave.Martin, james.morse, tglx, bp,
dave.hansen
Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
linux-kernel, eranian, peternewman
In-Reply-To: <0cfd813e10072eefc8f4d84328e83bd9a6220ad4.1777591497.git.babu.moger@amd.com>
Hi Babu,
On 4/30/26 4:24 PM, Babu Moger wrote:
> AMD Privilege Level Zero Association (PLZA) exposes kernel CLOSID/RMID
> association through MSR_IA32_PQR_PLZA_ASSOC. Generic resctrl already
> tracks supported and effective kernel-mode policy in struct
> resctrl_kmode_cfg, but the architecture layer needs a callable entry point
> that can push those values into per-CPU hardware on a chosen CPU mask.
>
> Declare resctrl_arch_configure_kmode() in linux/resctrl.h with kernel-doc.
> Implement it on x86: add an SMP callback that writes
> MSR_IA32_PQR_PLZA_ASSOC on each targeted CPU, and use on_each_cpu_mask()
> for the broadcast.
Above is clear from the patch. Please start with focus on why this patch is
needed.
>
> The hook is unused in this patch; later patches in the series wire it into
Similar to previous work: write changelog in imperative tone and do not
refer to patches in series but instead let each patch stand on its own.
> generic resctrl when an effective kernel-mode policy is selected or a CPU
> mask changes.
>
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
> ---
> arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 35 +++++++++++++++++++++++
> include/linux/resctrl.h | 10 +++++++
> 2 files changed, 45 insertions(+)
>
> diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> index b20e705606b8..68f1cf503904 100644
> --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
> @@ -131,3 +131,38 @@ int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
>
> return 0;
> }
> +
> +/*
> + * SMP call-function callback: each CPU writes its own MSR_IA32_PQR_PLZA_ASSOC
> + * (AMD PLZA). Invoked via on_each_cpu_mask() with wait=1 so the on-stack
> + * union pointed at by @arg is safe.
> + */
> +static void resctrl_kmode_set_one_amd(void *arg)
> +{
> + union msr_pqr_plza_assoc *plza = arg;
> +
> + wrmsrl(MSR_IA32_PQR_PLZA_ASSOC, plza->full);
fyi ...
commit 2232959db26d ("x86/msr: Switch wrmsrl() users to wrmsrq()")
commit b5884070f9da ("x86/msr: Remove wrmsrl()")
> +}
> +
> +/**
> + * resctrl_arch_configure_kmode() - x86/AMD: program PLZA MSR on a CPU subset
> + * @cpu_mask: CPUs to receive the update (see on_each_cpu_mask() for online subset).
Why is the caveat added? Will resctrl ever provide offline CPUs in the mask?
> + * @closid: CLOSID field written into the MSR with CLOSID_EN set.
> + * @rmid: RMID field written into the MSR with RMID_EN set.
> + * @enable: Value for the PLZA_EN split field.
Please describe the meaning of the fields instead the mechanics of the code
that are obvious.
> + *
> + * Context: Do not call with IRQs off or from IRQ context except as allowed for
> + * on_each_cpu_mask(); see kernel/smp.c.
Why is this context caveat needed?
> + */
> +void resctrl_arch_configure_kmode(cpumask_var_t cpu_mask, u32 closid, u32 rmid, bool enable)
Please replace "cpumask_var_t cpu_mask" with "const struct cpumask *cpu_mask".
> +{
> + union msr_pqr_plza_assoc plza = { 0 };
> +
> + plza.split.rmid = rmid;
> + plza.split.rmid_en = 1;
> + plza.split.closid = closid;
> + plza.split.closid_en = 1;
> + plza.split.plza_en = enable;
> +
> + on_each_cpu_mask(cpu_mask, resctrl_kmode_set_one_amd, &plza, 1);
> +}
function self has been discussed already
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index ce28418df00f..570918e57e24 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -712,6 +712,16 @@ bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r);
> */
> void resctrl_arch_get_kmode_support(struct resctrl_kmode_cfg *kcfg);
>
> +/**
> + * resctrl_arch_configure_kmode() - Program MSR_IA32_PQR_PLZA_ASSOC on CPUs in @cpu_mask
> + * @cpu_mask: Target CPUs; on_each_cpu_mask() applies the callback on the online subset.
> + * @closid: CLOSID written to the MSR with CLOSID_EN set.
> + * @rmid: RMID written to the MSR with RMID_EN set.
> + * @enable: PLZA_EN field value for this update.
This is a resctrl fs API - please replace all the AMD architecture specific implementation details
with what the parameters actually mean/represent.
> + */
> +void resctrl_arch_configure_kmode(cpumask_var_t cpu_mask, u32 closid, u32 rmid,
> + bool enable);
> +
> extern unsigned int resctrl_rmid_realloc_threshold;
> extern unsigned int resctrl_rmid_realloc_limit;
>
Reinette
^ permalink raw reply
* Re: [PATCH v3 03/12] fs/resctrl: Add kernel mode (kmode) data structures and arch hook
From: Reinette Chatre @ 2026-06-16 23:30 UTC (permalink / raw)
To: Babu Moger, corbet, tony.luck, Dave.Martin, james.morse, tglx, bp,
dave.hansen
Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
linux-kernel, eranian, peternewman
In-Reply-To: <3996883c2d1d47e094f97bab2a2e74df3f8c55e7.1777591497.git.babu.moger@amd.com>
Hi Babu,
On 4/30/26 4:24 PM, Babu Moger wrote:
> Privilege-Level Zero Association (PLZA) allows the user to specify a CLOSID
Subject prefix makes it clear this is a resctrl fs patch so please take care to
not mix architecture specific terms with resctrl fs generalized support.
Something that may help here is to consider all resctrl fs changes to be
relevant from MPAM perspective. Please do so with all resctrl fs changes in
this series.
> and/or RMID associated with execution in Privilege-Level Zero. Introduce a
> generic enumeration so that architecture and generic code can agree on the
> available policies.
>
> Introduce enum resctrl_kernel_modes with the following values:
Please make the enum name singular, "resctrl_kernel_modes" -> "resctrl_kernel_mode"
Doing so will make its use in code easier to parse.
>
> - INHERIT_CTRL_AND_MON: kernel and user tasks share the same CLOSID and
> RMID. This is the default and matches today's resctrl behaviour.
CLOSID and RMID are x86 terms where the meaning is not 1:1 with other architectures.
Since this is a new resctrl fs interface it is expected to be usable by all
architectures. Making this architecture specific is not appropriate.
These are the modes that are exposed to user space and user space has no insight
into CLOSID and RMID (ignoring scenario of debugging). I see no reason for
resctrl do dictate CLOSID/RMID assignment as part of these modes but instead
what the modes mean should be explained. If it is helpful then any x86 specific
details can be added by highlighting it is x86 specific. For example,
"Kernel work inherits the allocation and monitoring from the user space task.
On x86 this means that kernel work shares the same CLOSID and RMID as
the user space task."
>
> - GLOBAL_ASSIGN_CTRL_INHERIT_MON_PER_CPU: a CLOSID is assigned for kernel
> work while the RMID used for monitoring is inherited from the running
> user task. The default scope is all online CPUs and may be narrowed to
> a subset via the resctrl group interface. A CTRL_MON group can be
> bound to this mode.
Is binding a CTRL_MON group optional? Consider, for example:
"A CTRL_MON group is bound to this mode."
>
> - GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU: both CLOSID and RMID are
> assigned to kernel work. The default scope is all online CPUs and may
> be narrowed per CPU via the resctrl group interface. A CTRL_MON group
> can be bound to this mode.
It should be possible to bind a MON group also, no?
>
> - RESCTRL_KMODE_LAST: highest enumerator naming a policy mode.
>
> - RESCTRL_NUM_KERNEL_MODES: number of policy modes; use this to size
> static tables indexed by mode.
The last two can be dropped, this is clear from the patch.
>
> Also add struct resctrl_kmode_cfg (the snapshot architecture code returns)
> in include/linux/resctrl_types.h, and declare
> resctrl_arch_get_kmode_support() in include/linux/resctrl.h so architecture
> code can advertise the supported modes.
Above mostly just describes what is clear from the patch. Instead this can summarize
what the addition does: "Provide callback with which architecture can set the
kernel modes supported by it". (not exactly what this patch does though, but more below ...)
>
> Signed-off-by: Babu Moger <babu.moger@amd.com>
> ---
> v3: Removed resctrl_kmode definition.
> Changed the kernel mode definitions to enum resctrl_kernel_modes.
> Used BIT() to set/test the features.
> Added details to changelog.
>
> v2: New patch to handle PLZA interfaces with /sys/fs/resctrl/info/ directory.
> https://lore.kernel.org/lkml/2ab556af-095b-422b-9396-f845c6fd0342@intel.com/
> ---
> include/linux/resctrl.h | 13 ++++++++++
> include/linux/resctrl_types.h | 46 +++++++++++++++++++++++++++++++++++
> 2 files changed, 59 insertions(+)
>
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 006e57fd7ca5..ce28418df00f 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -699,6 +699,19 @@ int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable);
> */
> bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r);
>
> +/**
> + * resctrl_arch_get_kmode_support() - Advertise kernel-mode capabilities
"Advertise" implies a "broadcast" while the function name is "get" that implies
retrieval.
Why does resctrl query the support from the architecture? The typical resctrl initialization
involves the architecture setting certain capabilities. This simplifies enabling since
it does not require the addition of this feature to be accompanied with an implementation of
this call by every architecture.
Instead, resctrl can just initialize the defaults and an architecture can
make any adjustments using the optional callback. So, instead of
resctrl_arch_get_kmode_support(), why not resctrl_set_kmode_support() that is
implemented in resctrl fs and called by architecture?
When considering the x86 implementation of this it seems as though this implementation
assumes that all architectures will support inherit_ctrl_and_mon but this is not
enforced anywhere. Having any assumptions enforced/verified will help to make this
more robust. The fs/arch separation depending on so many architectures
"doing the right thing" seems risky.
> + * @kcfg: Architecture ORs BIT() flags into @kcfg->kmode for each supported
> + * &enum resctrl_kernel_modes value (see &struct resctrl_kmode_cfg).
> + *
> + * Used for optional features (for example PLZA on x86) that can assign CLOSID
> + * and/or RMID to kernel work separately from user tasks. Generic code compares
> + * @kcfg->kmode with the effective @kcfg->kmode_cur; when a global-assign mode is
> + * active, @kcfg->k_rdtgrp identifies the active &struct rdtgroup. The default mode
Does the architecture need to know these implementation details?
> + * is INHERIT_CTRL_AND_MON and group is default group.
> + */
> +void resctrl_arch_get_kmode_support(struct resctrl_kmode_cfg *kcfg);
Why does architecture need to know the layout of struct resctrl_kmode_cfg? It only needs
to share the modes it supports and need not be concerned with any of the internals - from
what I can tell the hook to program the kernel mode does not use this structure either and
this is the only "outside of resctrl fs" usage and it does not seem necessary.
> +
> extern unsigned int resctrl_rmid_realloc_threshold;
> extern unsigned int resctrl_rmid_realloc_limit;
>
> diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h
> index a5f56faa18d2..3aba07764b99 100644
> --- a/include/linux/resctrl_types.h
> +++ b/include/linux/resctrl_types.h
Please keep in mind that resctrl_types.h is reserved for those types that an architecture
needs to use in its asm/resctrl.h ... it does not look like any of the types added here qualify.
> @@ -68,4 +68,50 @@ enum resctrl_event_id {
> #define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1)
> #define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID)
>
> +/**
> + * enum resctrl_kernel_modes - Kernel versus user CLOSID/RMID policy
What does "versus user" mean? Can this be dropped?
> + *
> + * Enumeration values are contiguous indices from 0 through
> + * @RESCTRL_KMODE_LAST inclusive.
Above sentence is not necessary.
> Global-assign modes treat all online CPUs as
> + * in scope by default; a subset of CPUs may be selected by using resctrl
> + * group's interface.
> + *
> + * @INHERIT_CTRL_AND_MON:
> + * User and kernel tasks use the same CLOSID and RMID.
Similar comment as earlier. Since this is generic resctrl fs interface it needs to
be applicable to all architectures. For example (same suggestion as earlier),
"Kernel work inherits the allocation and monitoring of the user space task.
> + * @GLOBAL_ASSIGN_CTRL_INHERIT_MON_PER_CPU:
> + * A CLOSID may be assigned for kernel work while RMID selection for
"may be assigned" - this is not optional, right? How about "A control group is assigned ..."
> + * monitoring follows the same inheritance rules as for user contexts.
> + * Default scope is all online CPUs: subset of CPUs may be selected by
> + * using resctrl group's interface.
> + * @GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU:
> + * A single resource group (CLOSID and RMID together) may be assigned to
"may be" -> "is" ?
> + * kernel work. Default scope is all online CPUs: subset of CPUs may be
> + * selected by using resctrl group's interface.
> + * @RESCTRL_KMODE_LAST:
Documenting @RESCTRL_KMODE_LAST is not necessary.
> + * Highest enumerator that names a policy mode. Use RESCTRL_NUM_KERNEL_MODES
> + * to size static tables indexed by mode.
No need to document this.
> + */
> +enum resctrl_kernel_modes {
> + INHERIT_CTRL_AND_MON,
> + GLOBAL_ASSIGN_CTRL_INHERIT_MON_PER_CPU,
> + GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU,
> + RESCTRL_KMODE_LAST = GLOBAL_ASSIGN_CTRL_ASSIGN_MON_PER_CPU,
> +};
> +
> +#define RESCTRL_NUM_KERNEL_MODES (RESCTRL_KMODE_LAST + 1)
> +
> +/**
> + * struct resctrl_kmode_cfg - Kernel-mode policy snapshot from architecture
Only @kmode is initialized from the architecture. The rest is managed by resctrl fs.
I do not see why architecture needs to know the structure details.
> + * @kmode: Hardware- or policy-supported modes: each enumerator from
> + * &enum resctrl_kernel_modes is represented by BIT(mode index).
> + * @kmode_cur: Effective mode(s) in the same BIT(index) form as @kmode.
"mode(s)" ... this is plural implying more than one mode can be active at a time?
Should this not be just one mode and can thus have type "enum resctrl_kernel_mode" to make
this obvious?
> + * @k_rdtgrp: Resource group backing global-assign modes when applicable;
> + * initialized to the default group at boot.
Why is this initialized to default group at boot? I believe inherit_ctrl_and_mon is
the default mode and it does not have a group so should this not be NULL by default?
> + */
> +struct resctrl_kmode_cfg {
> + u32 kmode;
> + u32 kmode_cur;
> + struct rdtgroup *k_rdtgrp;
Please align struct members in tabular fashion.
Not specific to this patch: After so many contributions to resctrl I am very surprised how
this series does not respect Documentation/process/maintainer-tip.rst in many ways. For example,
later patches at some point just stops writing changelogs in imperative tone and just
documents what the code does, patches document locking requirements instead of using code
like lockdep_assert_held(), variables are not declared in reverse fir, changelogs refer to
other patches in series. Following Documentation/process/maintainer-tip.rst should be
very familiar by now.
Reinette
^ permalink raw reply
* Re: [PATCH v2] hwmon: coretemp: Fix documentation wording
From: Guenter Roeck @ 2026-06-16 23:03 UTC (permalink / raw)
To: Ximing Zhang
Cc: Jonathan Corbet, Shuah Khan, linux-hwmon, linux-doc, linux-kernel
In-Reply-To: <20260616121549.29484-1-xzhangjr@gmail.com>
On Tue, Jun 16, 2026 at 04:15:49PM +0400, Ximing Zhang wrote:
> Fix two minor wording issues in the coretemp documentation.
>
> Signed-off-by: Ximing Zhang <xzhangjr@gmail.com>
> ---
Applied to hwmon-next.
Thanks,
Guenter
^ permalink raw reply
* Re: [PATCH v3 0/3] f2fs: support encrypted inline data
From: Eric Biggers @ 2026-06-16 23:02 UTC (permalink / raw)
To: LiaoYuanhong-vivo
Cc: chao, corbet, jaegeuk, linux-doc, linux-ext4, linux-f2fs-devel,
linux-fscrypt, linux-kernel, skhan, tytso
In-Reply-To: <20260616094612.45505-1-liaoyuanhong@vivo.com>
On Tue, Jun 16, 2026 at 05:46:12PM +0800, LiaoYuanhong-vivo wrote:
> Could you share more about the direction you have in mind for simplifying
> f2fs/ext4 contents encryption around blk-crypto?
Currently ext4 and f2fs each have two implementations of file contents
encryption and decryption:
- One where the en/decryption is done in the filesystem layer
- One where the filesystem attaches a bio_crypt_ctx to the bios and the
en/decryption is done either in the block layer by blk-crypto-fallback
or by inline encryption hardware
I'd like to drop the first one, for simplicity and to reduce the burden
on ongoing developments like large folio support.
> For f2fs inline_data, there is still a real space-saving benefit on phones,
> since many encrypted files are smaller than 4K. Is there any acceptable
> future direction to support this kind of inode-resident data with
> blk-crypto or hardware-wrapped keys?
It is incompatible with inline encryption hardware. A CPU-based
solution like Intel Key Locker or RISC-V High Assurance Cryptography
could provide similar security properties. But there's nothing for
arm64 yet. And I should mention that no one has wanted to use Key
Locker anyway because it's really slow.
- Eric
^ permalink raw reply
* Re: [PATCH v6 03/12] PCI: liveupdate: Track incoming preserved PCI devices
From: Samiullah Khawaja @ 2026-06-16 22:38 UTC (permalink / raw)
To: David Matlack
Cc: kexec, linux-doc, linux-kernel, linux-mm, linux-pci,
Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed, Shuah Khan,
Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <CALzav=eo=UwoTNTYM8Z7uKoihxfB7NtVP701qidVgoqyBKhUig@mail.gmail.com>
On Tue, Jun 16, 2026 at 03:20:33PM -0700, David Matlack wrote:
>On Tue, Jun 16, 2026 at 1:09 PM Samiullah Khawaja <skhawaja@google.com> wrote:
>>
[snip]
>>
>> Hmm.. This is interesting, so the KHO state is freed and it cannot be
>> reused. I see you already pointed out that we are putting an LUO policy
>> to say that the retry is not allowed.
>>
>> But what should be the behaviour of liveupdate in this regard? Let the
>> system boot in a normal way? This might break other subsystems as they
>> might depend on PCIe restoring state properly. Also I think some of the
>> PCIe state, like device-id, BAR addresses, ACLs etc, might be used as
>> source of truth by other components.
>>
>> For example, lets say FLB retrieve() of PCIe fails, but succeeds for
>> VFIO/IOMMU, now VFIO/IOMMU are restoring state of a device that is not
>> restored/preserved?
>>
>> Should this be considered fatal?
>
>If PCI FLB retrieve fails then there are certain things that cannot be
>guaranteed, such as BDF (B specifically) remaining constant. This
>could lead to memory corruption as the IOMMU may have live
>translations in place for those specific RequesterIDs. And, in the
>future, preserved devices may be doing P2P which depends on BARs not
>moving. If the PCI core cannot retrieve the FLB saved by the previous
>kernel, it cannot make these guarantees.
Yes, this is what I was worried about.
>
>So yeah I think you're right that PCI core should treat FLB retrieve
>as fatal and just panic.
This sounds great.
>
>> > }
>> >
>> > static void pci_flb_finish(struct liveupdate_flb_op_args *args)
>> > {
>> >- kho_restore_free(args->obj);
>> >+ struct pci_flb_incoming *incoming = args->obj;
>> >+
>> >+ xa_destroy(&incoming->xa);
>> >+ kho_restore_free(incoming->ser);
>> >+ kfree(incoming);
>> > }
>> >
>> > static struct liveupdate_flb_ops pci_liveupdate_flb_ops = {
>> >@@ -270,6 +335,91 @@ void pci_liveupdate_unpreserve(struct pci_dev *dev)
>> > }
>> > EXPORT_SYMBOL_GPL(pci_liveupdate_unpreserve);
>> >
>> >+static struct pci_flb_incoming *pci_liveupdate_flb_get_incoming(void)
>> >+{
>> >+ struct pci_flb_incoming *incoming = NULL;
>> >+ int ret;
>> >+
>> >+ ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&incoming);
>> >+
>> >+ /* Live Update is not enabled. */
>> >+ if (ret == -EOPNOTSUPP)
>> >+ return NULL;
>> >+
>> >+ /* Live Update is enabled, but there is no incoming FLB data. */
>> >+ if (ret == -ENODATA)
>> >+ return NULL;
>> >+
>> >+ /*
>> >+ * Live Update is enabled and there is incoming FLB data, but none of it
>> >+ * matches pci_liveupdate_flb.compatible.
>> >+ *
>> >+ * This could mean that no PCI FLB data was passed by the previous
>> >+ * kernel, but it could also mean the previous kernel used a different
>> >+ * compatibility string (i.e. a different ABI).
>> >+ */
>> >+ if (ret == -ENOENT) {
>> >+ pr_info_once("No incoming FLB matched %s\n", pci_liveupdate_flb.compatible);
>> >+ return NULL;
>> >+ }
>> >+
>> >+ /*
>> >+ * There is incoming FLB data that matches pci_liveupdate_flb.compatible
>> >+ * but it cannot be retrieved.
>> >+ */
>> >+ if (ret) {
>> >+ WARN_ONCE(ret, "Failed to retrieve incoming FLB data\n");
>>
>> I think this should probably be considered fatal as mentioned above or
>> the caller of this function should get an error so it can fail. I think
>> retrievel of preserved state should generally not fail unless there is
>> memory corruption or ABI is incompatible.
>
>Yeah. I think I will just call panic() here to cover all cases.
We have an luo specific panic macro/function that you can use.
luo_restore_fail()
>
>> >+ return NULL;
>> >+ }
>> >+
>> >+ return incoming;
>> >+}
>> >+
>>
>> [snip]
>> >+
>> >+static inline bool pci_liveupdate_is_incoming(struct pci_dev *dev)
>> >+{
>> >+ return false;
>> >+}
>> > #endif
>> >
>> > #endif /* LINUX_PCI_LIVEUPDATE_H */
>> >--
>> >2.54.0.746.g67dd491aae-goog
>> >
>>
>> Sami
^ permalink raw reply
* Re: [PATCH v6 03/12] PCI: liveupdate: Track incoming preserved PCI devices
From: David Matlack @ 2026-06-16 22:20 UTC (permalink / raw)
To: Samiullah Khawaja
Cc: kexec, linux-doc, linux-kernel, linux-mm, linux-pci,
Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed, Shuah Khan,
Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <ajGpxMXk9iyXLzC4@google.com>
On Tue, Jun 16, 2026 at 1:09 PM Samiullah Khawaja <skhawaja@google.com> wrote:
>
> On Fri, May 22, 2026 at 08:24:01PM +0000, David Matlack wrote:
> >During PCI enumeration, the previous kernel might have passed state about
> >devices that were preserved across kexec. The PCI core needs to fetch
> >this state to identify which devices are "incoming" and require special
> >handling.
> >
> >Add pci_liveupdate_setup_device() which is called during device setup
> >to fetch the serialized state (struct pci_ser) from the Live Update
> >Orchestrator. The first time this happens, pci_flb_retrieve() will run
> >and convert the array of pci_dev_ser structs into an xarray so that it
> >can be looked up efficiently.
> >
> >If a device is found in the xarray, the PCI core stores a pointer to its
> >state in dev->liveupdate_incoming and holds a reference to the incoming
> >FLB until pci_liveupdate_finish() is called by the driver.
> >
> >This ensures proper lifecycle management for incoming preserved devices
> >and allows the PCI core and drivers to apply specific Live Update
> >logic to them in subsequent commits.
> >
> >Drivers can check if a device is an incoming preserved device (e.g.
> >during probe) by calling pci_liveupdate_is_incoming().
> >
> >CONFIG_64BIT is now required to enable CONFIG_PCI_LIVEUPDATE so that the
> >domain and bdf can be guaranteed to fit in an unsigned long and be used
> >as the xarray key.
> >
> >Signed-off-by: David Matlack <dmatlack@google.com>
> >---
> > MAINTAINERS | 1 +
> > drivers/pci/Kconfig | 2 +-
> > drivers/pci/liveupdate.c | 230 ++++++++++++++++++++++++++++++++-
> > drivers/pci/liveupdate.h | 5 +
> > drivers/pci/probe.c | 3 +
> > include/linux/pci_liveupdate.h | 13 ++
> > 6 files changed, 251 insertions(+), 3 deletions(-)
> >
>
> [snip]
> >
> > static int pci_flb_retrieve(struct liveupdate_flb_op_args *args)
> > {
> >- args->obj = phys_to_virt(args->data);
> >+ struct pci_ser *ser = phys_to_virt(args->data);
> >+ struct pci_flb_incoming *incoming;
> >+ int ret = -ENOMEM;
> >+ u32 i;
> >+
> >+ incoming = kmalloc_obj(*incoming);
> >+ if (!incoming)
> >+ goto err_restore_free;
> >+
> >+ incoming->ser = ser;
> >+ xa_init(&incoming->xa);
> >+
> >+ for (i = 0; i < incoming->ser->max_nr_devices; i++) {
> >+ struct pci_dev_ser *dev_ser = &incoming->ser->devices[i];
> >+ unsigned long key;
> >+
> >+ if (!dev_ser->refcount)
> >+ continue;
> >+
> >+ key = pci_ser_xa_key(dev_ser->domain, dev_ser->bdf);
> >+ ret = xa_insert(&incoming->xa, key, dev_ser, GFP_KERNEL);
> >+ if (ret)
> >+ goto err_xa_destroy;
> >+ }
> >+
> >+ args->obj = incoming;
> > return 0;
> >+
> >+err_xa_destroy:
> >+ xa_destroy(&incoming->xa);
> >+ kfree(incoming);
> >+err_restore_free:
> >+ kho_restore_free(ser);
> >+ return ret;
>
> Hmm.. This is interesting, so the KHO state is freed and it cannot be
> reused. I see you already pointed out that we are putting an LUO policy
> to say that the retry is not allowed.
>
> But what should be the behaviour of liveupdate in this regard? Let the
> system boot in a normal way? This might break other subsystems as they
> might depend on PCIe restoring state properly. Also I think some of the
> PCIe state, like device-id, BAR addresses, ACLs etc, might be used as
> source of truth by other components.
>
> For example, lets say FLB retrieve() of PCIe fails, but succeeds for
> VFIO/IOMMU, now VFIO/IOMMU are restoring state of a device that is not
> restored/preserved?
>
> Should this be considered fatal?
If PCI FLB retrieve fails then there are certain things that cannot be
guaranteed, such as BDF (B specifically) remaining constant. This
could lead to memory corruption as the IOMMU may have live
translations in place for those specific RequesterIDs. And, in the
future, preserved devices may be doing P2P which depends on BARs not
moving. If the PCI core cannot retrieve the FLB saved by the previous
kernel, it cannot make these guarantees.
So yeah I think you're right that PCI core should treat FLB retrieve
as fatal and just panic.
> > }
> >
> > static void pci_flb_finish(struct liveupdate_flb_op_args *args)
> > {
> >- kho_restore_free(args->obj);
> >+ struct pci_flb_incoming *incoming = args->obj;
> >+
> >+ xa_destroy(&incoming->xa);
> >+ kho_restore_free(incoming->ser);
> >+ kfree(incoming);
> > }
> >
> > static struct liveupdate_flb_ops pci_liveupdate_flb_ops = {
> >@@ -270,6 +335,91 @@ void pci_liveupdate_unpreserve(struct pci_dev *dev)
> > }
> > EXPORT_SYMBOL_GPL(pci_liveupdate_unpreserve);
> >
> >+static struct pci_flb_incoming *pci_liveupdate_flb_get_incoming(void)
> >+{
> >+ struct pci_flb_incoming *incoming = NULL;
> >+ int ret;
> >+
> >+ ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&incoming);
> >+
> >+ /* Live Update is not enabled. */
> >+ if (ret == -EOPNOTSUPP)
> >+ return NULL;
> >+
> >+ /* Live Update is enabled, but there is no incoming FLB data. */
> >+ if (ret == -ENODATA)
> >+ return NULL;
> >+
> >+ /*
> >+ * Live Update is enabled and there is incoming FLB data, but none of it
> >+ * matches pci_liveupdate_flb.compatible.
> >+ *
> >+ * This could mean that no PCI FLB data was passed by the previous
> >+ * kernel, but it could also mean the previous kernel used a different
> >+ * compatibility string (i.e. a different ABI).
> >+ */
> >+ if (ret == -ENOENT) {
> >+ pr_info_once("No incoming FLB matched %s\n", pci_liveupdate_flb.compatible);
> >+ return NULL;
> >+ }
> >+
> >+ /*
> >+ * There is incoming FLB data that matches pci_liveupdate_flb.compatible
> >+ * but it cannot be retrieved.
> >+ */
> >+ if (ret) {
> >+ WARN_ONCE(ret, "Failed to retrieve incoming FLB data\n");
>
> I think this should probably be considered fatal as mentioned above or
> the caller of this function should get an error so it can fail. I think
> retrievel of preserved state should generally not fail unless there is
> memory corruption or ABI is incompatible.
Yeah. I think I will just call panic() here to cover all cases.
> >+ return NULL;
> >+ }
> >+
> >+ return incoming;
> >+}
> >+
>
> [snip]
> >+
> >+static inline bool pci_liveupdate_is_incoming(struct pci_dev *dev)
> >+{
> >+ return false;
> >+}
> > #endif
> >
> > #endif /* LINUX_PCI_LIVEUPDATE_H */
> >--
> >2.54.0.746.g67dd491aae-goog
> >
>
> Sami
^ permalink raw reply
* Re: [PATCH net-next V3 2/7] netdevsim: Register devlink after device init
From: Jakub Kicinski @ 2026-06-16 21:05 UTC (permalink / raw)
To: Mark Bloch
Cc: Jiri Pirko, Eric Dumazet, Paolo Abeni, Andrew Lunn,
David S. Miller, Jonathan Corbet, Shuah Khan, Simon Horman,
Sunil Goutham, Linu Cherian, Geetha sowjanya, hariprasad,
Subbaraya Sundeep, Bharat Bhushan, Saeed Mahameed,
Leon Romanovsky, Tariq Toukan, Ethan Nelson-Moore, linux-doc,
netdev, linux-rdma
In-Reply-To: <7635d50c-1c82-4090-8907-53a72444fc04@nvidia.com>
On Tue, 16 Jun 2026 20:29:25 +0300 Mark Bloch wrote:
> I think the explicit helper is the cleanest option here, without any
> workqueue fallback inside devlink. It avoids depending on devl_register()
> ordering, and makes the support explicit per driver.
>
> Does that sound like an acceptable direction?
I'd much rather have the workqueue with the purely theoretical race
with user space than a bunch of drivers that don't act on the cmdline
params.
^ permalink raw reply
* Re: [swap tier discussion] Re: [PATCH v3 2/4] mm/zswap: Implement proactive writeback
From: Yosry Ahmed @ 2026-06-16 20:26 UTC (permalink / raw)
To: Nhat Pham
Cc: YoungJun Park, Shakeel Butt, Hao Jia, Johannes Weiner, mhocko, tj,
mkoutny, roman.gushchin, akpm, chengming.zhou, muchun.song,
cgroups, linux-mm, linux-kernel, linux-doc, Hao Jia, chrisl,
kasong, baoquan.he, joshua.hahnjy
In-Reply-To: <CAKEwX=NyfxfXhHESTLyirAgdVA6QaYAcam792-vSZdmo0Pz+bA@mail.gmail.com>
On Tue, Jun 16, 2026 at 1:24 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Tue, Jun 16, 2026 at 4:10 PM Yosry Ahmed <yosry@kernel.org> wrote:
> >
> > On Tue, Jun 16, 2026 at 1:09 PM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > On Tue, Jun 16, 2026 at 3:54 PM Yosry Ahmed <yosry@kernel.org> wrote:
> > > >
> > > > On Tue, Jun 16, 2026 at 11:33 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > > > >
> > > > > TBH, without vswap, we should not allow setting zswap as its own tier.
> > > > > It's meaningless. Maybe makes it a no-op, and warn users what they're
> > > > > setting is gibberish?
> > > >
> > > > Why? vswap is transparent to the user. Why can't zswap be its own tier?
> > >
> > > Without vswap, if you set zswap as its own tier, which phys swap
> > > device should we allocate from for the backing slot? :)
> >
> > Today we just allocate a swap slot in a swapfile during reclaim,
> > before swapout, and zswap will just writeback to that one. I assume
> > the same will work with swap tiering, except that maybe the way that
> > swap slot will respect the allowed swap tiers?
>
> Yep! So if we set zswap as the only tier, then it wouldn't be able to
> allocate a swap slot in swapfile right?
Ohh I thought you meant we shouldn't allow zswap to be a tier at all,
not the *only* tier.
> Or are you suggesting that if we set zswap as the only tier then we
> can allocate from any swapfile (since we're not doing any IO anyway)?
Hmm, technically having zswap as the only tier should be equivalent to
disabling writeback, but you're right that if zswap is the only tier
than the memcg is not allowed to use swap slots from any swapfile, so
zswap cannot be used. Very good point :)
In this case I think yes, we need vswap to be enabled to allow making
zswap the only tier. That's one gap between zswap being the only tier
and disabling zswap writeback, the former requires vswap while the
latter doesn't.
^ permalink raw reply
* Re: [swap tier discussion] Re: [PATCH v3 2/4] mm/zswap: Implement proactive writeback
From: Nhat Pham @ 2026-06-16 20:24 UTC (permalink / raw)
To: Yosry Ahmed
Cc: YoungJun Park, Shakeel Butt, Hao Jia, Johannes Weiner, mhocko, tj,
mkoutny, roman.gushchin, akpm, chengming.zhou, muchun.song,
cgroups, linux-mm, linux-kernel, linux-doc, Hao Jia, chrisl,
kasong, baoquan.he, joshua.hahnjy
In-Reply-To: <CAO9r8zMHGFG_jcVeDPgowaQ2RNntp3KankwzQdgrJb9PrWu8_w@mail.gmail.com>
On Tue, Jun 16, 2026 at 4:10 PM Yosry Ahmed <yosry@kernel.org> wrote:
>
> On Tue, Jun 16, 2026 at 1:09 PM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > On Tue, Jun 16, 2026 at 3:54 PM Yosry Ahmed <yosry@kernel.org> wrote:
> > >
> > > On Tue, Jun 16, 2026 at 11:33 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > > >
> > > > TBH, without vswap, we should not allow setting zswap as its own tier.
> > > > It's meaningless. Maybe makes it a no-op, and warn users what they're
> > > > setting is gibberish?
> > >
> > > Why? vswap is transparent to the user. Why can't zswap be its own tier?
> >
> > Without vswap, if you set zswap as its own tier, which phys swap
> > device should we allocate from for the backing slot? :)
>
> Today we just allocate a swap slot in a swapfile during reclaim,
> before swapout, and zswap will just writeback to that one. I assume
> the same will work with swap tiering, except that maybe the way that
> swap slot will respect the allowed swap tiers?
Yep! So if we set zswap as the only tier, then it wouldn't be able to
allocate a swap slot in swapfile right?
Or are you suggesting that if we set zswap as the only tier then we
can allocate from any swapfile (since we're not doing any IO anyway)?
^ permalink raw reply
* Re: [swap tier discussion] Re: [PATCH v3 2/4] mm/zswap: Implement proactive writeback
From: Yosry Ahmed @ 2026-06-16 20:10 UTC (permalink / raw)
To: Nhat Pham
Cc: YoungJun Park, Shakeel Butt, Hao Jia, Johannes Weiner, mhocko, tj,
mkoutny, roman.gushchin, akpm, chengming.zhou, muchun.song,
cgroups, linux-mm, linux-kernel, linux-doc, Hao Jia, chrisl,
kasong, baoquan.he, joshua.hahnjy
In-Reply-To: <CAKEwX=N=Umi94wdKcLxEWOqUwhz6=Lj909pc1Pr_5ivVnZmdPQ@mail.gmail.com>
On Tue, Jun 16, 2026 at 1:09 PM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Tue, Jun 16, 2026 at 3:54 PM Yosry Ahmed <yosry@kernel.org> wrote:
> >
> > On Tue, Jun 16, 2026 at 11:33 AM Nhat Pham <nphamcs@gmail.com> wrote:
> > >
> > > TBH, without vswap, we should not allow setting zswap as its own tier.
> > > It's meaningless. Maybe makes it a no-op, and warn users what they're
> > > setting is gibberish?
> >
> > Why? vswap is transparent to the user. Why can't zswap be its own tier?
>
> Without vswap, if you set zswap as its own tier, which phys swap
> device should we allocate from for the backing slot? :)
Today we just allocate a swap slot in a swapfile during reclaim,
before swapout, and zswap will just writeback to that one. I assume
the same will work with swap tiering, except that maybe the way that
swap slot will respect the allowed swap tiers?
>
> With vswap then it makes sense (and would probably be the "default"
> for zswap setup until we enable zswap writeback).
^ permalink raw reply
* Re: [PATCH v6 03/12] PCI: liveupdate: Track incoming preserved PCI devices
From: Samiullah Khawaja @ 2026-06-16 20:09 UTC (permalink / raw)
To: David Matlack
Cc: kexec, linux-doc, linux-kernel, linux-mm, linux-pci,
Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
Jason Gunthorpe, Jonathan Corbet, Josh Hilke, Leon Romanovsky,
Lukas Wunner, Mike Rapoport, Parav Pandit, Pasha Tatashin,
Pranjal Shrivastava, Pratyush Yadav, Saeed Mahameed, Shuah Khan,
Vipin Sharma, William Tu, Yi Liu
In-Reply-To: <20260522202410.3104264-4-dmatlack@google.com>
On Fri, May 22, 2026 at 08:24:01PM +0000, David Matlack wrote:
>During PCI enumeration, the previous kernel might have passed state about
>devices that were preserved across kexec. The PCI core needs to fetch
>this state to identify which devices are "incoming" and require special
>handling.
>
>Add pci_liveupdate_setup_device() which is called during device setup
>to fetch the serialized state (struct pci_ser) from the Live Update
>Orchestrator. The first time this happens, pci_flb_retrieve() will run
>and convert the array of pci_dev_ser structs into an xarray so that it
>can be looked up efficiently.
>
>If a device is found in the xarray, the PCI core stores a pointer to its
>state in dev->liveupdate_incoming and holds a reference to the incoming
>FLB until pci_liveupdate_finish() is called by the driver.
>
>This ensures proper lifecycle management for incoming preserved devices
>and allows the PCI core and drivers to apply specific Live Update
>logic to them in subsequent commits.
>
>Drivers can check if a device is an incoming preserved device (e.g.
>during probe) by calling pci_liveupdate_is_incoming().
>
>CONFIG_64BIT is now required to enable CONFIG_PCI_LIVEUPDATE so that the
>domain and bdf can be guaranteed to fit in an unsigned long and be used
>as the xarray key.
>
>Signed-off-by: David Matlack <dmatlack@google.com>
>---
> MAINTAINERS | 1 +
> drivers/pci/Kconfig | 2 +-
> drivers/pci/liveupdate.c | 230 ++++++++++++++++++++++++++++++++-
> drivers/pci/liveupdate.h | 5 +
> drivers/pci/probe.c | 3 +
> include/linux/pci_liveupdate.h | 13 ++
> 6 files changed, 251 insertions(+), 3 deletions(-)
>
[snip]
>
> static int pci_flb_retrieve(struct liveupdate_flb_op_args *args)
> {
>- args->obj = phys_to_virt(args->data);
>+ struct pci_ser *ser = phys_to_virt(args->data);
>+ struct pci_flb_incoming *incoming;
>+ int ret = -ENOMEM;
>+ u32 i;
>+
>+ incoming = kmalloc_obj(*incoming);
>+ if (!incoming)
>+ goto err_restore_free;
>+
>+ incoming->ser = ser;
>+ xa_init(&incoming->xa);
>+
>+ for (i = 0; i < incoming->ser->max_nr_devices; i++) {
>+ struct pci_dev_ser *dev_ser = &incoming->ser->devices[i];
>+ unsigned long key;
>+
>+ if (!dev_ser->refcount)
>+ continue;
>+
>+ key = pci_ser_xa_key(dev_ser->domain, dev_ser->bdf);
>+ ret = xa_insert(&incoming->xa, key, dev_ser, GFP_KERNEL);
>+ if (ret)
>+ goto err_xa_destroy;
>+ }
>+
>+ args->obj = incoming;
> return 0;
>+
>+err_xa_destroy:
>+ xa_destroy(&incoming->xa);
>+ kfree(incoming);
>+err_restore_free:
>+ kho_restore_free(ser);
>+ return ret;
Hmm.. This is interesting, so the KHO state is freed and it cannot be
reused. I see you already pointed out that we are putting an LUO policy
to say that the retry is not allowed.
But what should be the behaviour of liveupdate in this regard? Let the
system boot in a normal way? This might break other subsystems as they
might depend on PCIe restoring state properly. Also I think some of the
PCIe state, like device-id, BAR addresses, ACLs etc, might be used as
source of truth by other components.
For example, lets say FLB retrieve() of PCIe fails, but succeeds for
VFIO/IOMMU, now VFIO/IOMMU are restoring state of a device that is not
restored/preserved?
Should this be considered fatal?
> }
>
> static void pci_flb_finish(struct liveupdate_flb_op_args *args)
> {
>- kho_restore_free(args->obj);
>+ struct pci_flb_incoming *incoming = args->obj;
>+
>+ xa_destroy(&incoming->xa);
>+ kho_restore_free(incoming->ser);
>+ kfree(incoming);
> }
>
> static struct liveupdate_flb_ops pci_liveupdate_flb_ops = {
>@@ -270,6 +335,91 @@ void pci_liveupdate_unpreserve(struct pci_dev *dev)
> }
> EXPORT_SYMBOL_GPL(pci_liveupdate_unpreserve);
>
>+static struct pci_flb_incoming *pci_liveupdate_flb_get_incoming(void)
>+{
>+ struct pci_flb_incoming *incoming = NULL;
>+ int ret;
>+
>+ ret = liveupdate_flb_get_incoming(&pci_liveupdate_flb, (void **)&incoming);
>+
>+ /* Live Update is not enabled. */
>+ if (ret == -EOPNOTSUPP)
>+ return NULL;
>+
>+ /* Live Update is enabled, but there is no incoming FLB data. */
>+ if (ret == -ENODATA)
>+ return NULL;
>+
>+ /*
>+ * Live Update is enabled and there is incoming FLB data, but none of it
>+ * matches pci_liveupdate_flb.compatible.
>+ *
>+ * This could mean that no PCI FLB data was passed by the previous
>+ * kernel, but it could also mean the previous kernel used a different
>+ * compatibility string (i.e. a different ABI).
>+ */
>+ if (ret == -ENOENT) {
>+ pr_info_once("No incoming FLB matched %s\n", pci_liveupdate_flb.compatible);
>+ return NULL;
>+ }
>+
>+ /*
>+ * There is incoming FLB data that matches pci_liveupdate_flb.compatible
>+ * but it cannot be retrieved.
>+ */
>+ if (ret) {
>+ WARN_ONCE(ret, "Failed to retrieve incoming FLB data\n");
I think this should probably be considered fatal as mentioned above or
the caller of this function should get an error so it can fail. I think
retrievel of preserved state should generally not fail unless there is
memory corruption or ABI is incompatible.
>+ return NULL;
>+ }
>+
>+ return incoming;
>+}
>+
[snip]
>+
>+static inline bool pci_liveupdate_is_incoming(struct pci_dev *dev)
>+{
>+ return false;
>+}
> #endif
>
> #endif /* LINUX_PCI_LIVEUPDATE_H */
>--
>2.54.0.746.g67dd491aae-goog
>
Sami
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox