* [RFC PATCH v2 5/7] tracing/probes: Add $current variable support
From: Masami Hiramatsu (Google) @ 2026-06-10 0:52 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178105268094.21760.13668249930524377840.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Since we can use the BTF to cast value to a structure pointer type,
it is useful to introduce "$current" special variable support to
fetcharg.
Users can define a fetcharg to access the current task_struct's
properties using BTF info, e.g.:
$current->cpus_ptr
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v2:
- Support to parse $current in parse_btf_arg().
- If no typecast is given on $current, it is automatically cast to task_struct.
- Check the error case where $current is followed by anything other than "-".
---
Documentation/trace/eprobetrace.rst | 1 +
Documentation/trace/fprobetrace.rst | 1 +
Documentation/trace/kprobetrace.rst | 1 +
kernel/trace/trace.c | 2 +-
kernel/trace/trace_probe.c | 29 ++++++++++++++++++++++++++++-
kernel/trace/trace_probe.h | 1 +
kernel/trace/trace_probe_tmpl.h | 3 +++
7 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index 680e0af43d5d..dcf92d5b4175 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -38,6 +38,7 @@ Synopsis of eprobe_events
@ADDR : Fetch memory at ADDR (ADDR should be in kernel)
@SYM[+|-offs] : Fetch memory at SYM +|- offs (SYM should be a data symbol)
$comm : Fetch current task comm.
+ $current : Fetch the address of the current task_struct.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 290a9e6f7491..3392cab016b3 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -50,6 +50,7 @@ Synopsis of fprobe-events
$argN : Fetch the Nth function argument. (N >= 1) (\*2)
$retval : Fetch return value.(\*3)
$comm : Fetch current task comm.
+ $current : Fetch the address of the current task_struct.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*4)(\*5)
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index a62707e6a9f2..81e4fe38791d 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -53,6 +53,7 @@ Synopsis of kprobe_events
$argN : Fetch the Nth function argument. (N >= 1) (\*1)
$retval : Fetch return value.(\*2)
$comm : Fetch current task comm.
+ $current : Fetch the address of the current task_struct.
+|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
\IMM : Store an immediate value to the argument.
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e36af853199..e185a006cb08 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4329,7 +4329,7 @@ static const char readme_msg[] =
"\t [(structname[,field])](fetcharg)->field[->field|.field...],\n"
#endif
#else
- "\t $stack<index>, $stack, $retval, $comm,\n"
+ "\t $stack<index>, $stack, $retval, $comm, $current\n"
#endif
"\t +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
"\t kernel return probes support: $retval, $arg<N>, $comm\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 726be9782775..4bdccd9bd7d1 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -718,6 +718,20 @@ static int parse_btf_arg(char *varname,
return -EOPNOTSUPP;
}
+ if (strcmp(varname, "$current") == 0) {
+ code->op = FETCH_OP_CURRENT;
+ /* If no typecast is specified for $current, use task_struct by default */
+ if (!ctx->struct_btf) {
+ tid = bpf_find_btf_id("task_struct", BTF_KIND_STRUCT, &ctx->struct_btf);
+ if (tid < 0) {
+ trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
+ return -ENOENT;
+ }
+ ctx->last_struct = btf_type_skip_modifiers(ctx->struct_btf, tid, &tid);
+ }
+ goto found;
+ }
+
if (ctx->flags & TPARG_FL_TEVENT) {
ret = parse_trace_event(varname, code, ctx);
if (ret < 0) {
@@ -756,8 +770,8 @@ static int parse_btf_arg(char *varname,
return -ENOENT;
}
}
- params = ctx->params;
+ params = ctx->params;
for (i = 0; i < ctx->nr_params; i++) {
const char *name = btf_name_by_offset(ctx->btf, params[i].name_off);
@@ -1246,6 +1260,19 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
return 0;
}
+ /* $current returns the address of the current task_struct. */
+ if (str_has_prefix(arg, "current")) {
+ arg += strlen("current");
+ if (*arg == '-' && IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS))
+ return parse_btf_arg(orig_arg, pcode, end, ctx);
+
+ if (*arg != '\0')
+ goto inval;
+
+ code->op = FETCH_OP_CURRENT;
+ return 0;
+ }
+
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
len = str_has_prefix(arg, "arg");
if (len) {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 44f113faae61..62645e847bd1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -96,6 +96,7 @@ enum fetch_op {
FETCH_OP_FOFFS, /* File offset: .immediate */
FETCH_OP_DATA, /* Allocated data: .data */
FETCH_OP_EDATA, /* Entry data: .offset */
+ FETCH_OP_CURRENT, /* Current task_struct address */
// Stage 2 (dereference) op
FETCH_OP_DEREF, /* Dereference: .offset */
FETCH_OP_UDEREF, /* User-space Dereference: .offset */
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index f39b37fcdb3b..f630930288d2 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -112,6 +112,9 @@ process_common_fetch_insn(struct fetch_insn *code, unsigned long *val)
case FETCH_OP_DATA:
*val = (unsigned long)code->data;
break;
+ case FETCH_OP_CURRENT:
+ *val = (unsigned long)current;
+ break;
default:
return -EILSEQ;
}
^ permalink raw reply related
* [RFC PATCH v2 4/7] tracing/probes: Support field specifier option for typecast
From: Masami Hiramatsu (Google) @ 2026-06-10 0:52 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178105268094.21760.13668249930524377840.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Add a field specifier option for the typecast. This works like
container_of() macro.
(STRUCT[,FIELD[.FIELD2...]])VAR
This is equivalent to :
container_of(VAR, struct STRUCT, FIELD[.FIELD2...])
For example:
echo "f tick_nohz_handler next_tick=(tick_sched,sched_timer)timer->next_tick" >> dynamic_events
This will trace tick_nohz_handler() with its tick_sched::next_tick which
is converted from @timer by container_of(timer, struct tick_sched, sched_timer).
So, if you enable both fprobes:tick_nohz_handler__entry and
timer:hrtimer_expire_entry events, we will see something like:
<idle>-0 [002] d.h1. 3778.087272: hrtimer_expire_entry: hrtimer=00000000d63db328 function=tick_nohz_handler now=3777450051040
<idle>-0 [002] d.h1. 3778.087281: tick_nohz_handler__entry: (tick_nohz_handler+0x4/0x140) next_tick=3777450000000
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v2:
- Use byteoffset for typecast field offset instead of bitoffset. This fixes negative modulo calculation.
- Check whether a field is specified after typecast.
- Reject if typecast field option has arrow operator.
---
Documentation/trace/eprobetrace.rst | 5 +
Documentation/trace/fprobetrace.rst | 8 +-
Documentation/trace/kprobetrace.rst | 8 +-
kernel/trace/trace.c | 4 -
kernel/trace/trace_probe.c | 178 ++++++++++++++++++++++++-----------
kernel/trace/trace_probe.h | 5 +
6 files changed, 141 insertions(+), 67 deletions(-)
diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index cd0b4aa7f896..680e0af43d5d 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -49,7 +49,10 @@ Synopsis of eprobe_events
(STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
a pointer to STRUCT and then derference the pointer defined by
->MEMBER. Note that when this is used, the FIELD name does not
- need to be prefixed with a '$'.
+ need to be prefixed with a '$'. ASGN can be specified optionally.
+ If ASGN is specified, FIELD will be cast to the same offset
+ position as the ASGN member, rather than to the beginning of
+ the STRUCT.
(STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
also be used with another FETCHARG instead of FIELD.
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 6b8bb27bb62d..290a9e6f7491 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -57,10 +57,12 @@ Synopsis of fprobe-events
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr"
and bitfield are supported.
- (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+ (STRUCT[,ASGN])FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
a pointer to STRUCT and then derference the pointer defined by
- ->MEMBER.
- (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ ->MEMBER. ASGN can be specified optionally. If ASGN is specified,
+ FIELD will be cast to the same offset position as the ASGN member,
+ rather than to the beginning of the STRUCT.
+ (STRUCT[,ASGN])(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
also be used with another FETCHARG instead of FIELD.
(\*1) This is available only when BTF is enabled.
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index c4382765d5b2..a62707e6a9f2 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -61,11 +61,13 @@ Synopsis of kprobe_events
(x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char",
"string", "ustring", "symbol", "symstr" and bitfield are
supported.
- (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+ (STRUCT[,ASGN])FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
a pointer to STRUCT and then derference the pointer defined by
->MEMBER. Note that this is available only when the probe is
- on function entry.
- (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ on function entry. ASGN can be specified optionally. If ASGN
+ is specified, FIELD will be cast to the same offset position
+ as the ASGN member, rather than to the beginning of the STRUCT.
+ (STRUCT[,ASGN])(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
also be used with another FETCHARG instead of FIELD.
(\*1) only for the probe on function entry (offs == 0). Note, this argument access
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f70318918c2..0e36af853199 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4325,8 +4325,8 @@ static const char readme_msg[] =
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
- "\t [(structname)]<argname>[->field[->field|.field...]],\n"
- "\t [(structname)](fetcharg)->field[->field|.field...],\n"
+ "\t [(structname[,field])]<argname>[->field[->field|.field...]],\n"
+ "\t [(structname[,field])](fetcharg)->field[->field|.field...],\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index dba73aaa8ade..726be9782775 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -574,6 +574,65 @@ static int split_next_field(char *varname, char **next_field,
return ret;
}
+/* Inner loop for solving dot operator ('.'). Return bit-offset of the given field */
+static int get_bitoffset_of_field(char **pfieldname, const struct btf_type **ptype,
+ struct traceprobe_parse_context *ctx)
+{
+ const struct btf_type *type = *ptype;
+ const struct btf_member *field;
+ struct btf *btf = ctx_btf(ctx);
+ char *fieldname = *pfieldname;
+ int bitoffs = 0;
+ u32 anon_offs;
+ char *next;
+ int is_ptr;
+ s32 tid;
+
+ do {
+ next = NULL;
+ is_ptr = split_next_field(fieldname, &next, ctx);
+ if (is_ptr < 0)
+ return is_ptr;
+
+ anon_offs = 0;
+ field = btf_find_struct_member(btf, type, fieldname,
+ &anon_offs);
+ if (IS_ERR(field)) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return PTR_ERR(field);
+ }
+ if (!field) {
+ trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
+ return -ENOENT;
+ }
+ /* Add anonymous structure/union offset */
+ bitoffs += anon_offs;
+
+ /* Accumulate the bit-offsets of the dot-connected fields */
+ if (btf_type_kflag(type)) {
+ bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
+ ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
+ } else {
+ bitoffs += field->offset;
+ ctx->last_bitsize = 0;
+ }
+
+ type = btf_type_skip_modifiers(btf, field->type, &tid);
+ if (!type) {
+ trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+ return -EINVAL;
+ }
+
+ if (next)
+ ctx->offset += next - fieldname;
+ fieldname = next;
+ } while (!is_ptr && fieldname);
+
+ *pfieldname = fieldname;
+ *ptype = type;
+
+ return bitoffs;
+}
/*
* Parse the field of data structure. The @type must be a pointer type
* pointing the target data structure type.
@@ -583,16 +642,14 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code = *pcode;
- const struct btf_member *field;
- u32 bitoffs, anon_offs;
- bool is_struct = ctx->struct_btf != NULL;
struct btf *btf = ctx_btf(ctx);
- char *next;
- int is_ptr;
+ bool is_first_field = true;
+ int bitoffs;
s32 tid;
do {
- if (!is_struct) {
+ /* For the first field of typecast, @type will be the target structure type. */
+ if (!(is_first_field && ctx->struct_btf)) {
/* Outer loop for solving arrow operator ('->') */
if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
@@ -606,60 +663,25 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
return -EINVAL;
}
}
- /* Only the first type can skip being a pointer */
- is_struct = false;
-
- bitoffs = 0;
- do {
- /* Inner loop for solving dot operator ('.') */
- next = NULL;
- is_ptr = split_next_field(fieldname, &next, ctx);
- if (is_ptr < 0)
- return is_ptr;
-
- anon_offs = 0;
- field = btf_find_struct_member(btf, type, fieldname,
- &anon_offs);
- if (IS_ERR(field)) {
- trace_probe_log_err(ctx->offset, BAD_BTF_TID);
- return PTR_ERR(field);
- }
- if (!field) {
- trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
- return -ENOENT;
- }
- /* Add anonymous structure/union offset */
- bitoffs += anon_offs;
-
- /* Accumulate the bit-offsets of the dot-connected fields */
- if (btf_type_kflag(type)) {
- bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
- ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
- } else {
- bitoffs += field->offset;
- ctx->last_bitsize = 0;
- }
-
- type = btf_type_skip_modifiers(btf, field->type, &tid);
- if (!type) {
- trace_probe_log_err(ctx->offset, BAD_BTF_TID);
- return -EINVAL;
- }
-
- ctx->offset += next - fieldname;
- fieldname = next;
- } while (!is_ptr && fieldname);
+ bitoffs = get_bitoffset_of_field(&fieldname, &type, ctx);
+ if (bitoffs < 0)
+ return bitoffs;
if (++code == end) {
trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
return -EINVAL;
}
code->op = FETCH_OP_DEREF; /* TODO: user deref support */
code->offset = bitoffs / 8;
+ if (is_first_field && ctx->struct_btf) {
+ /* The first field can be typecasted with field option. */
+ code->offset -= ctx->prefix_byteoffs;
+ }
*pcode = code;
ctx->last_bitoffs = bitoffs % 8;
ctx->last_type = type;
+ is_first_field = false;
} while (fieldname);
return 0;
@@ -690,6 +712,11 @@ static int parse_btf_arg(char *varname,
NOSUP_DAT_ARG);
return -EOPNOTSUPP;
}
+ if (!field && ctx->struct_btf) {
+ /* Typecast without field option is not supported */
+ trace_probe_log_err(ctx->offset, TYPECAST_REQ_FIELD);
+ return -EOPNOTSUPP;
+ }
if (ctx->flags & TPARG_FL_TEVENT) {
ret = parse_trace_event(varname, code, ctx);
@@ -700,8 +727,7 @@ static int parse_btf_arg(char *varname,
/* TEVENT is only here via a typecast */
if (WARN_ON_ONCE(ctx->struct_btf == NULL))
return -EINVAL;
- type = ctx->last_struct;
- goto found_type;
+ goto found;
}
if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
@@ -763,7 +789,6 @@ static int parse_btf_arg(char *varname,
type = ctx->last_struct;
else
type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
-found_type:
if (!type) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
return -EINVAL;
@@ -832,6 +857,45 @@ static int query_btf_struct(const char *sname, struct traceprobe_parse_context *
return 0;
}
+static int parse_btf_casttype(char *casttype, struct traceprobe_parse_context *ctx)
+{
+ char *field;
+ int ret;
+
+ /* Field option - evaluated later. */
+ field = strchr(casttype, ',');
+ if (field)
+ *field++ = '\0';
+
+ ret = query_btf_struct(casttype, ctx);
+ if (ret < 0) {
+ trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+ return -EINVAL;
+ }
+
+ if (field) {
+ struct btf_type *type = (struct btf_type *)ctx->last_struct;
+
+ ctx->offset += field - casttype;
+ ret = get_bitoffset_of_field(&field, &ctx->last_struct, ctx);
+ if (ret < 0)
+ return ret;
+ if (ret % 8) {
+ trace_probe_log_err(ctx->offset, TYPECAST_NOT_ALIGNED);
+ return -EINVAL;
+ }
+ if (field != NULL) {
+ trace_probe_log_err(ctx->offset + field - casttype, TYPECAST_BAD_ARROW);
+ return -EINVAL;
+ }
+ ctx->prefix_byteoffs = ret / 8;
+ /* Restore the original struct type (overwritten by get_bitoffset_of_field) */
+ ctx->last_struct = type;
+ }
+
+ return ret;
+}
+
/* Find the matching closing parenthesis for a given opening parenthesis. */
static char *find_matched_close_paren(char *s)
{
@@ -915,11 +979,10 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
nested = true;
}
- ret = query_btf_struct(arg + 1, ctx);
- if (ret < 0) {
- trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
- return -EINVAL;
- }
+ ctx->offset = orig_offset + 1; /* for the '(' */
+ ret = parse_btf_casttype(arg + 1, ctx);
+ if (ret < 0)
+ return ret;
ctx->offset = orig_offset + tmp - arg;
/* If it is nested, tmp points to the field name. */
@@ -927,6 +990,7 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
ret = parse_btf_field(tmp, ctx->last_struct, pcode, end, ctx);
else
ret = parse_btf_arg(tmp, pcode, end, ctx);
+ ctx->prefix_byteoffs = 0;
return ret;
}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 982d32a5df8b..44f113faae61 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -436,6 +436,7 @@ struct traceprobe_parse_context {
unsigned int flags;
int offset;
int nested_level;
+ int prefix_byteoffs; /* The byte offset of the prefix field of typecast */
};
#define TRACEPROBE_MAX_NESTED_LEVEL 3
@@ -576,7 +577,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(EVENT_TOO_BIG, "Event too big (too many fields?)"), \
C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"), \
C(TYPECAST_REQ_FIELD, "Typecast requires a field access"), \
- C(TOO_MANY_NESTED, "Too many nested typecasts/dereferences"),
+ C(TOO_MANY_NESTED, "Too many nested typecasts/dereferences"), \
+ C(TYPECAST_NOT_ALIGNED, "Typecast field option is not byte-aligned"), \
+ C(TYPECAST_BAD_ARROW, "Typecast field option does not support -> operator"),
#undef C
#define C(a, b) TP_ERR_##a
^ permalink raw reply related
* [RFC PATCH v2 3/7] tracing/probes: Support nested typecast
From: Masami Hiramatsu (Google) @ 2026-06-10 0:51 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178105268094.21760.13668249930524377840.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
When we hit an open parenthesis right after typecast closing
parenthesis, it means we have nested typecast. This allows us to
typecast a generic data member in a structure to a pointer to
another structure.
For example, the following casts DATA_MEMBER of the VAR structure to a
STRUCT pointer and fetches the MEMBER value:
(STRUCT)(VAR->DATA_MEMBER)->MEMBER
Also, we can nest typecast.
(STRUCT1)((STRUCT2)$ARG->FIELD2)->FIELD1
Currently the max nest level is limited to 3.
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v2:
- Fix to skip "->" after closing parenthesis.
---
Documentation/trace/eprobetrace.rst | 2 +
Documentation/trace/fprobetrace.rst | 2 +
Documentation/trace/kprobetrace.rst | 2 +
kernel/trace/trace.c | 1
kernel/trace/trace_probe.c | 76 ++++++++++++++++++++++++++++++++---
kernel/trace/trace_probe.h | 7 +++
6 files changed, 82 insertions(+), 8 deletions(-)
diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index fe3602540569..cd0b4aa7f896 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -50,6 +50,8 @@ Synopsis of eprobe_events
a pointer to STRUCT and then derference the pointer defined by
->MEMBER. Note that when this is used, the FIELD name does not
need to be prefixed with a '$'.
+ (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ also be used with another FETCHARG instead of FIELD.
Types
-----
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 7435ded2d66d..6b8bb27bb62d 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -60,6 +60,8 @@ Synopsis of fprobe-events
(STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
a pointer to STRUCT and then derference the pointer defined by
->MEMBER.
+ (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ also be used with another FETCHARG instead of FIELD.
(\*1) This is available only when BTF is enabled.
(\*2) only for the probe on function entry (offs == 0). Note, this argument access
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index f73614997d52..c4382765d5b2 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -65,6 +65,8 @@ Synopsis of kprobe_events
a pointer to STRUCT and then derference the pointer defined by
->MEMBER. Note that this is available only when the probe is
on function entry.
+ (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+ also be used with another FETCHARG instead of FIELD.
(\*1) only for the probe on function entry (offs == 0). Note, this argument access
is best effort, because depending on the argument type, it may be passed on
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aa93e7b01146..4f70318918c2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4326,6 +4326,7 @@ static const char readme_msg[] =
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
"\t [(structname)]<argname>[->field[->field|.field...]],\n"
+ "\t [(structname)](fetcharg)->field[->field|.field...],\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 9158f1f22a62..dba73aaa8ade 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -832,10 +832,35 @@ static int query_btf_struct(const char *sname, struct traceprobe_parse_context *
return 0;
}
+/* Find the matching closing parenthesis for a given opening parenthesis. */
+static char *find_matched_close_paren(char *s)
+{
+ char *p = s;
+ int count = 0;
+
+ while (*p) {
+ if (*p == '(')
+ count++;
+ else if (*p == ')') {
+ if (--count == 0)
+ return p;
+ }
+ p++;
+ }
+ return NULL;
+}
+
+static int
+parse_probe_arg(char *arg, const struct fetch_type *type,
+ struct fetch_insn **pcode, struct fetch_insn *end,
+ struct traceprobe_parse_context *ctx);
+
static int handle_typecast(char *arg, struct fetch_insn **pcode,
struct fetch_insn *end,
struct traceprobe_parse_context *ctx)
{
+ int orig_offset = ctx->offset;
+ bool nested = false;
char *tmp;
int ret;
@@ -852,19 +877,56 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
DEREF_OPEN_BRACE);
return -EINVAL;
}
- *tmp = '\0';
- ret = query_btf_struct(arg + 1, ctx);
- *tmp = ')';
+ *tmp++ = '\0';
+
+ /* Handle the nested structure like (STRUCT)(VAR->FIELD)->... */
+ if (*tmp == '(') {
+ char *close = find_matched_close_paren(tmp);
+
+ ctx->offset += tmp - arg;
+ if (!close) {
+ trace_probe_log_err(ctx->offset, DEREF_OPEN_BRACE);
+ return -EINVAL;
+ }
+ /* We expect a field access for typecast */
+ if (close[1] != '-' || close[2] != '>') {
+ trace_probe_log_err(ctx->offset + close - tmp + 1,
+ TYPECAST_REQ_FIELD);
+ return -EINVAL;
+ }
+ ctx->nested_level++;
+ if (ctx->nested_level > TRACEPROBE_MAX_NESTED_LEVEL) {
+ trace_probe_log_err(ctx->offset, TOO_MANY_NESTED);
+ return -E2BIG;
+ }
+ *close = '\0';
+
+ ctx->offset += 1; /* for the '(' */
+ /* We need to parse the nested one */
+ ret = parse_probe_arg(tmp + 1, find_fetch_type(NULL, ctx->flags),
+ pcode, end, ctx);
+ if (ret < 0)
+ return ret;
+ ctx->nested_level--;
+ clear_struct_btf(ctx);
+
+ tmp = close + 3;/* Skip "->" after closing parenthesis */
+ nested = true;
+ }
+
+ ret = query_btf_struct(arg + 1, ctx);
if (ret < 0) {
trace_probe_log_err(ctx->offset + 1, NO_PTR_STRCT);
return -EINVAL;
}
- tmp++;
-
- ctx->offset += tmp - arg;
- ret = parse_btf_arg(tmp, pcode, end, ctx);
+ ctx->offset = orig_offset + tmp - arg;
+ /* If it is nested, tmp points to the field name. */
+ if (nested)
+ ret = parse_btf_field(tmp, ctx->last_struct, pcode, end, ctx);
+ else
+ ret = parse_btf_arg(tmp, pcode, end, ctx);
return ret;
}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 883938a74aee..982d32a5df8b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -435,8 +435,11 @@ struct traceprobe_parse_context {
struct trace_probe *tp;
unsigned int flags;
int offset;
+ int nested_level;
};
+#define TRACEPROBE_MAX_NESTED_LEVEL 3
+
extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i,
const char *argv,
struct traceprobe_parse_context *ctx);
@@ -571,7 +574,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
C(TOO_MANY_ARGS, "Too many arguments are specified"), \
C(TOO_MANY_EARGS, "Too many entry arguments specified"), \
C(EVENT_TOO_BIG, "Event too big (too many fields?)"), \
- C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"),
+ C(TYPECAST_NOT_EVENT, "Typecasts are only for eprobe fields"), \
+ C(TYPECAST_REQ_FIELD, "Typecast requires a field access"), \
+ C(TOO_MANY_NESTED, "Too many nested typecasts/dereferences"),
#undef C
#define C(a, b) TP_ERR_##a
^ permalink raw reply related
* [RFC PATCH v2 2/7] tracing/probes: Support typecast for various probe events
From: Masami Hiramatsu (Google) @ 2026-06-10 0:51 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178105268094.21760.13668249930524377840.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Support the BTF typecast feature on other probe events (but only if it is
a kernel function entry or return).
To support other probe events, we just need to use last_struct type
when we find a function parameter in parse_btf_arg().
This also updates the <tracefs>/README file to show the struct typecast syntax.
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
Changes in v2:
- Fix to re-enable typecast on eprobe.
---
Documentation/trace/fprobetrace.rst | 3 +++
Documentation/trace/kprobetrace.rst | 4 ++++
kernel/trace/trace.c | 2 +-
kernel/trace/trace_probe.c | 14 +++++++++-----
kernel/trace/trace_probe.h | 5 +++++
5 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index b4c2ca3d02c1..7435ded2d66d 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -57,6 +57,9 @@ Synopsis of fprobe-events
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr"
and bitfield are supported.
+ (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+ a pointer to STRUCT and then derference the pointer defined by
+ ->MEMBER.
(\*1) This is available only when BTF is enabled.
(\*2) only for the probe on function entry (offs == 0). Note, this argument access
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 3b6791c17e9b..f73614997d52 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -61,6 +61,10 @@ Synopsis of kprobe_events
(x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char",
"string", "ustring", "symbol", "symstr" and bitfield are
supported.
+ (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+ a pointer to STRUCT and then derference the pointer defined by
+ ->MEMBER. Note that this is available only when the probe is
+ on function entry.
(\*1) only for the probe on function entry (offs == 0). Note, this argument access
is best effort, because depending on the argument type, it may be passed on
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d..aa93e7b01146 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4325,7 +4325,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
"\t $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
#ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
- "\t <argname>[->field[->field|.field...]],\n"
+ "\t [(structname)]<argname>[->field[->field|.field...]],\n"
#endif
#else
"\t $stack<index>, $stack, $retval, $comm,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index fd1caa1f9723..9158f1f22a62 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -759,7 +759,10 @@ static int parse_btf_arg(char *varname,
return -ENOENT;
found:
- type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
+ if (ctx->struct_btf)
+ type = ctx->last_struct;
+ else
+ type = btf_type_skip_modifiers(ctx->btf, tid, &tid);
found_type:
if (!type) {
trace_probe_log_err(ctx->offset, BAD_BTF_TID);
@@ -836,10 +839,11 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
char *tmp;
int ret;
- /* Currently this only works for eprobes */
- if (!(ctx->flags & TPARG_FL_TEVENT)) {
- trace_probe_log_err(ctx->offset, TYPECAST_NOT_EVENT);
- return -EINVAL;
+ if (!(tparg_is_event_probe(ctx->flags) ||
+ tparg_is_function_entry(ctx->flags) ||
+ tparg_is_function_return(ctx->flags))) {
+ trace_probe_log_err(ctx->offset, NOSUP_BTFARG);
+ return -EOPNOTSUPP;
}
tmp = strchr(arg, ')');
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 15758cc11fc6..883938a74aee 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -414,6 +414,11 @@ static inline bool tparg_is_function_return(unsigned int flags)
return (flags & TPARG_FL_LOC_MASK) == (TPARG_FL_KERNEL | TPARG_FL_RETURN);
}
+static inline bool tparg_is_event_probe(unsigned int flags)
+{
+ return !!(flags & TPARG_FL_TEVENT);
+}
+
struct traceprobe_parse_context {
struct trace_event_call *event;
/* BTF related parameters */
^ permalink raw reply related
* [RFC PATCH v2 1/7] tracing/events: Fix to check the simple_tsk_fn creation
From: Masami Hiramatsu (Google) @ 2026-06-10 0:51 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178105268094.21760.13668249930524377840.stgit@devnote2>
From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Sashiko pointed out that this sample code does not correctly handle the
failure of thread creation because kthread_run() can return -errno.
This removes the counter-based thread creation/stop logic and instead
just checks whether simple_tsk_fn is correctly initialized (created).
Link: https://sashiko.dev/#/patchset/178092865666.163648.10457567771536160909.stgit%40devnote2
Fixes: 9cfe06f8cd5c ("tracing/events: add trace-events-sample")
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
samples/trace_events/trace-events-sample.c | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index ecc7db237f2e..b61766864b54 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -92,12 +92,11 @@ static int simple_thread_fn(void *arg)
}
static DEFINE_MUTEX(thread_mutex);
-static int simple_thread_cnt;
int foo_bar_reg(void)
{
mutex_lock(&thread_mutex);
- if (simple_thread_cnt++)
+ if (!IS_ERR_OR_NULL(simple_tsk_fn))
goto out;
pr_info("Starting thread for foo_bar_fn\n");
@@ -115,14 +114,11 @@ int foo_bar_reg(void)
void foo_bar_unreg(void)
{
mutex_lock(&thread_mutex);
- if (--simple_thread_cnt)
- goto out;
-
- pr_info("Killing thread for foo_bar_fn\n");
- if (simple_tsk_fn)
+ if (!IS_ERR_OR_NULL(simple_tsk_fn)) {
+ pr_info("Killing thread for foo_bar_fn\n");
kthread_stop(simple_tsk_fn);
- simple_tsk_fn = NULL;
- out:
+ simple_tsk_fn = NULL;
+ }
mutex_unlock(&thread_mutex);
}
@@ -139,7 +135,7 @@ static void __exit trace_event_exit(void)
{
kthread_stop(simple_tsk);
mutex_lock(&thread_mutex);
- if (simple_tsk_fn)
+ if (!IS_ERR_OR_NULL(simple_tsk_fn))
kthread_stop(simple_tsk_fn);
simple_tsk_fn = NULL;
mutex_unlock(&thread_mutex);
^ permalink raw reply related
* [RFC PATCH v2 0/7] tracing/probes: Add more typecast features
From: Masami Hiramatsu (Google) @ 2026-06-10 0:51 UTC (permalink / raw)
To: Steven Rostedt, Mathieu Desnoyers
Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest
Hi,
Here is the 2nd version of the series to introduce more typecast features
to probe events. The previous version is here:
https://lore.kernel.org/all/178092865666.163648.10457567771536160909.stgit@devnote2/
In this version, I fixed various problems that Sashiko pointed out in
review and added a fix for the sample code. I also dropped +CPU/PCPU()
and introduced this_cpu_read().
Steve introduced the BTF typecast feature for eprobe[1].
This series extends it and adds more options:
1. Expanding BTF typecast to kprobe and fprobe.
(currently only function entry/exit)
2. Introduce a container_of-like typecast. This adds an "assigned
member" option to the typecast.
(STRUCT,MEMBER)VAR->ANOTHER_MEMBER
This casts VAR to the STRUCT type, treating VAR as the address
of STRUCT.MEMBER. In C, it is:
container_of(VAR, STRUCT, MEMBER)->ANOTHER_MEMBER
3. Support nested typecast, e.g.
(STRUCT)((STRUCT2)VAR->MEMBER2)->MEMBER
The nesting level must be smaller than 3.
4. Add the $current variable, which points to the "current" task_struct.
This is useful with typecast, e.g.
(task_struct)$current->pid
5. per-cpu dereference support.
Introduce this_cpu_read(VAR) and this_cpu_ptr(VAR) to
access per-cpu data on the current CPU (accessing other CPUs'
data is not stable, because it can change).
You can access the member of per-cpu data structure using
typecast like:
(STRUCT)this_cpu_ptr(VAR)->MEMBER
This series also adds a test script that covers part of them.
[1] https://lore.kernel.org/all/20260601130746.2139d926@gandalf.local.home/
---
Masami Hiramatsu (Google) (7):
tracing/events: Fix to check the simple_tsk_fn creation
tracing/probes: Support typecast for various probe events
tracing/probes: Support nested typecast
tracing/probes: Support field specifier option for typecast
tracing/probes: Add $current variable support
tracing/probes: Add this_cpu_read() and this_cpu_ptr() dereference method to fetcharg
tracing/probes: Add a new testcase for BTF typecasts
Documentation/trace/eprobetrace.rst | 10
Documentation/trace/fprobetrace.rst | 10
Documentation/trace/kprobetrace.rst | 11 +
kernel/trace/trace.c | 6
kernel/trace/trace_probe.c | 404 +++++++++++++++-----
kernel/trace/trace_probe.h | 18 +
kernel/trace/trace_probe_tmpl.h | 33 +-
samples/trace_events/trace-events-sample.c | 56 ++-
samples/trace_events/trace-events-sample.h | 34 ++
.../ftrace/test.d/dynevent/btf_probe_event.tc | 51 +++
10 files changed, 509 insertions(+), 124 deletions(-)
create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc
--
Masami Hiramatsu (Google) <mhiramat@kernel.org>
^ permalink raw reply
* Re: [PATCH v1] pnp: Documentation improvements
From: Randy Dunlap @ 2026-06-10 0:36 UTC (permalink / raw)
To: Uwe Kleine-König (The Capable Hub), Rafael J. Wysocki
Cc: Jonathan Corbet, Shuah Khan, linux-doc, linux-kernel
In-Reply-To: <20260609145117.1355753-2-u.kleine-koenig@baylibre.com>
On 6/9/26 7:51 AM, Uwe Kleine-König (The Capable Hub) wrote:
> - Consistently use named initializers and simplify sentinel
> - Skip assignment to .driver_data if all are 0
> - Use consistent spacing to match Linux coding style
> - Fix prototype of probe function
> - s/pnp_id/pnp_device_id/
> - Drop non-existing .card_id_table
>
> Signed-off-by: Uwe Kleine-König (The Capable Hub) <u.kleine-koenig@baylibre.com>
LGTM. Thanks.
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
The only issue I have with this file following this patch is
the use of "ex" for "Example" or "E.g.".
> ---
> Documentation/admin-guide/pnp.rst | 22 ++++++++++------------
> 1 file changed, 10 insertions(+), 12 deletions(-)
>
> diff --git a/Documentation/admin-guide/pnp.rst b/Documentation/admin-guide/pnp.rst
> index 24d80e3eb309..14a0bf400d2d 100644
> --- a/Documentation/admin-guide/pnp.rst
> +++ b/Documentation/admin-guide/pnp.rst
> @@ -203,12 +203,12 @@ The New Way
>
> ex::
>
> - static const struct pnp_id pnp_dev_table[] = {
> + static const struct pnp_device_id pnp_dev_table[] = {
> /* Standard LPT Printer Port */
> - {.id = "PNP0400", .driver_data = 0},
> + { .id = "PNP0400" },
> /* ECP Printer Port */
> - {.id = "PNP0401", .driver_data = 0},
> - {.id = ""}
> + { .id = "PNP0401" },
> + { }
> };
>
> Please note that the character 'X' can be used as a wild card in the function
> @@ -217,14 +217,14 @@ The New Way
> ex::
>
> /* Unknown PnP modems */
> - { "PNPCXXX", UNKNOWN_DEV },
> + { .id = "PNPCXXX", .driver_data = UNKNOWN_DEV },
>
> Supported PnP card IDs can optionally be defined.
> ex::
>
> - static const struct pnp_id pnp_card_table[] = {
> - { "ANYDEVS", 0 },
> - { "", 0 }
> + static const struct pnp_device_id pnp_card_table[] = {
> + { .id = "ANYDEVS" },
> + { }
> };
>
> 2. Optionally define probe and remove functions. It may make sense not to
> @@ -234,14 +234,13 @@ The New Way
> ex::
>
> static int
> - serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const
> - struct pnp_id *dev_id)
> + serial_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *dev_id)
> {
> . . .
>
> ex::
>
> - static void serial_pnp_remove(struct pnp_dev * dev)
> + static void serial_pnp_remove(struct pnp_dev *dev)
> {
> . . .
>
> @@ -253,7 +252,6 @@ The New Way
>
> static struct pnp_driver serial_pnp_driver = {
> .name = "serial",
> - .card_id_table = pnp_card_table,
> .id_table = pnp_dev_table,
> .probe = serial_pnp_probe,
> .remove = serial_pnp_remove,
>
> base-commit: a87737435cfa134f9cdcc696ba3080759d04cf72
--
~Randy
^ permalink raw reply
* [PATCH v4 6/6] kselftest: alloc_tag: extend the allocinfo ioctl kselftest
From: Abhishek Bapat @ 2026-06-10 0:12 UTC (permalink / raw)
To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781042698.git.abhishekbapat@google.com>
Add the following 2 scenarios to the allocinfo ioctl kselftest:
1. Validate size based filtering
2. Validate lineno based filtering
The first test uses "do_init_module" as the candidate function for the
test. This is because the associated site will only allocate memory when
a kernel module is loaded. The return value of get_content_id() changes
every time modules are loaded or unloaded. Hence, as long as
get_content_id() values at the start and the end of the test are the
same, the memory allocated by the do_init_module call site should also
remain the same. Consequently, the test can assume consistency between
the value returned by the ioctl and the procfs resulting in less
flakiness.
Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
---
.../alloc_tag/allocinfo_ioctl_test.c | 204 +++++++++++++++++-
1 file changed, 203 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c b/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
index cd9cf229ae1f..5d2f13900a47 100644
--- a/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
+++ b/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
@@ -311,11 +311,201 @@ static int test_function_filter(void)
return run_filter_test(&filter);
}
+static int test_size_filter(void)
+{
+ int fd;
+ struct allocinfo_tag_data_vec *tags = malloc(sizeof(*tags));
+ struct allocinfo_tag_data_vec *procfs_entries = malloc(sizeof(*procfs_entries));
+ struct allocinfo_filter filter;
+ int ret = KSFT_PASS;
+ __u64 target_size, i, pos;
+ bool found;
+ const char *target_function = "do_init_module";
+ struct allocinfo_content_id start_cont_id, end_cont_id;
+ int retry = 0;
+ const int max_retries = 10;
+
+ if (!tags || !procfs_entries) {
+ ksft_print_msg("Memory allocation failed.\n");
+ ret = KSFT_FAIL;
+ goto freemem;
+ }
+
+ fd = open(ALLOCINFO_PROC, O_RDONLY);
+ if (fd < 0) {
+ ksft_exit_skip("Failed to open " ALLOCINFO_PROC ": %s\n", strerror(errno));
+ ret = KSFT_FAIL;
+ goto freemem;
+ }
+
+ do {
+ found = false;
+ pos = 0;
+
+ if (__allocinfo_get_content_id(fd, &start_cont_id)) {
+ ksft_print_msg("allocinfo_get_content_id failed\n");
+ ret = KSFT_FAIL;
+ goto exit;
+ }
+
+ memset(&filter, 0, sizeof(filter));
+ filter.mask |= ALLOCINFO_FILTER_MASK_FUNCTION;
+ strncpy(filter.fields.function, target_function, ALLOCINFO_STR_SIZE);
+
+ if (get_filtered_procfs_entries(procfs_entries, &filter, fd)) {
+ ksft_print_msg("Error retrieving entries from " ALLOCINFO_PROC "\n");
+ ret = KSFT_FAIL;
+ goto exit;
+ }
+
+ if (procfs_entries->count == 0) {
+ ksft_print_msg("Function %s not found in procfs\n", target_function);
+ ret = KSFT_SKIP;
+ goto exit;
+ }
+
+ target_size = procfs_entries->tag[0].counter.bytes;
+
+ memset(&filter, 0, sizeof(filter));
+ filter.mask |= ALLOCINFO_FILTER_MASK_MIN_SIZE | ALLOCINFO_FILTER_MASK_MAX_SIZE;
+ filter.min_size = target_size;
+ filter.max_size = target_size;
+
+ while (1) {
+ struct allocinfo_get_at get_at_params;
+
+ memset(&get_at_params, 0, sizeof(get_at_params));
+ memcpy(&get_at_params.filter, &filter, sizeof(filter));
+ get_at_params.pos = pos;
+
+ if (__allocinfo_get_at(fd, &get_at_params))
+ break;
+
+ tags->count = 0;
+ memcpy(&tags->tag[tags->count++], &get_at_params.data,
+ sizeof(get_at_params.data));
+
+ while (tags->count < VEC_MAX_ENTRIES &&
+ __allocinfo_get_next(fd, &tags->tag[tags->count]) == 0)
+ tags->count++;
+
+ for (i = 0; i < tags->count; i++) {
+ if (strcmp(tags->tag[i].tag.function, target_function) == 0) {
+ found = true;
+ break;
+ }
+ }
+
+ if (found || tags->count < VEC_MAX_ENTRIES)
+ break;
+
+ pos += tags->count;
+ }
+
+ if (__allocinfo_get_content_id(fd, &end_cont_id)) {
+ ksft_print_msg("allocinfo_get_content_id failed\n");
+ ret = KSFT_FAIL;
+ goto exit;
+ }
+
+ if (start_cont_id.id == end_cont_id.id)
+ break;
+
+ ksft_print_msg("Module load detected during size verification, retrying...\n");
+ } while (retry++ < max_retries);
+
+ if (start_cont_id.id == end_cont_id.id && !found) {
+ ksft_print_msg("Entry with function %s not found in IOCTL results\n",
+ target_function);
+ ret = KSFT_FAIL;
+ }
+
+exit:
+ close(fd);
+freemem:
+ free(tags);
+ free(procfs_entries);
+ return ret;
+}
+
+static int test_lineno_filter(void)
+{
+ int fd;
+ struct allocinfo_tag_data_vec *tags = malloc(sizeof(*tags));
+ struct allocinfo_tag_data_vec *procfs_entries = malloc(sizeof(*procfs_entries));
+ struct allocinfo_filter filter;
+ enum ioctl_ret ioctl_status;
+ int ret = KSFT_PASS;
+ __u64 target_lineno, i;
+
+ if (!tags || !procfs_entries) {
+ ksft_print_msg("Memory allocation failed.\n");
+ ret = KSFT_FAIL;
+ goto freemem;
+ }
+
+ fd = open(ALLOCINFO_PROC, O_RDONLY);
+ if (fd < 0) {
+ ksft_exit_skip("Failed to open " ALLOCINFO_PROC ": %s\n", strerror(errno));
+ ret = KSFT_FAIL;
+ goto freemem;
+ }
+
+ memset(&filter, 0, sizeof(filter));
+
+ if (get_filtered_procfs_entries(procfs_entries, &filter, fd)) {
+ ksft_print_msg("Error retrieving entries from " ALLOCINFO_PROC "\n");
+ ret = KSFT_FAIL;
+ goto exit;
+ }
+ if (procfs_entries->count == 0) {
+ ksft_print_msg("Could not retrieve procfs entries\n");
+ ret = KSFT_SKIP;
+ goto exit;
+ }
+ /*
+ * We depend on the result of procfs entries to create the ioctl_filter. Hence we
+ * cannot recycle the run_filter_test function here.
+ */
+ target_lineno = procfs_entries->tag[0].tag.lineno;
+
+ filter.mask |= ALLOCINFO_FILTER_MASK_LINENO;
+ filter.fields.lineno = target_lineno;
+
+ ioctl_status = get_filtered_ioctl_entries(tags, &filter, fd, 0);
+ if (ioctl_status == IOCTL_INVALID_DATA) {
+ ksft_print_msg("Trouble retrieving valid IOCTL entries, skipping.\n");
+ ret = KSFT_SKIP;
+ goto exit;
+ }
+ if (ioctl_status == IOCTL_FAILURE) {
+ ksft_print_msg("Error retrieving IOCTL entries.\n");
+ ret = KSFT_FAIL;
+ goto exit;
+ }
+
+ for (i = 0; i < tags->count; i++) {
+ if (tags->tag[i].tag.lineno != target_lineno) {
+ ksft_print_msg("IOCTL entry %llu has incorrect lineno %llu.\n",
+ i, tags->tag[i].tag.lineno);
+ ret = KSFT_FAIL;
+ goto exit;
+ }
+ }
+
+exit:
+ close(fd);
+freemem:
+ free(tags);
+ free(procfs_entries);
+ return ret;
+}
+
int main(int argc, char *argv[])
{
int ret;
- ksft_set_plan(2);
+ ksft_set_plan(4);
ret = test_filename_filter();
if (ret == KSFT_SKIP)
@@ -329,5 +519,17 @@ int main(int argc, char *argv[])
else
ksft_test_result(ret == KSFT_PASS, "test_function_filter\n");
+ ret = test_size_filter();
+ if (ret == KSFT_SKIP)
+ ksft_test_result_skip("Skipping test_size_filter\n");
+ else
+ ksft_test_result(ret == KSFT_PASS, "test_size_filter\n");
+
+ ret = test_lineno_filter();
+ if (ret == KSFT_SKIP)
+ ksft_test_result_skip("Skipping test_lineno_filter\n");
+ else
+ ksft_test_result(ret == KSFT_PASS, "test_lineno_filter\n");
+
ksft_finished();
}
--
2.54.0.1099.g489fc7bff1-goog
^ permalink raw reply related
* [PATCH v4 5/6] kselftest: alloc_tag: add kselftest for ioctl interface
From: Abhishek Bapat @ 2026-06-10 0:12 UTC (permalink / raw)
To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781042698.git.abhishekbapat@google.com>
Introduce a kselftest to verify the new IOCTL-based interface for
/proc/allocinfo. The test covers:
1. Validation of the filename filter.
2. Validation of the function filter.
The first test validates the functionality of the filename filter. Using
"mm/memory.c" as the candidate filename filter, it retrieves filtered
entries from both procfs and ioctl and matches the first VEC_MAX_ENTRIES
entries.
The second test validates the functionality of the function filter.
It uses "dup_mm" as the candidate function, as we do not expect this
function name to change frequently and hence won't need to modify
this test often.
Note that both tests match the line number, function name and file name
fields. Bytes allocated and calls are not matched, as those values may
change between the reads from procfs and ioctl and hence can lead to
false negatives.
Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
---
MAINTAINERS | 1 +
tools/testing/selftests/alloc_tag/Makefile | 9 +
.../alloc_tag/allocinfo_ioctl_test.c | 333 ++++++++++++++++++
3 files changed, 343 insertions(+)
create mode 100644 tools/testing/selftests/alloc_tag/Makefile
create mode 100644 tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 019cc4c285a3..6610dd42e484 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16715,6 +16715,7 @@ F: include/linux/alloc_tag.h
F: include/linux/pgalloc_tag.h
F: include/uapi/linux/alloc_tag.h
F: lib/alloc_tag.c
+F: tools/testing/selftests/alloc_tag/
MEMORY CONTROLLER DRIVERS
M: Krzysztof Kozlowski <krzk@kernel.org>
diff --git a/tools/testing/selftests/alloc_tag/Makefile b/tools/testing/selftests/alloc_tag/Makefile
new file mode 100644
index 000000000000..f2b8fc022c3b
--- /dev/null
+++ b/tools/testing/selftests/alloc_tag/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_GEN_PROGS := allocinfo_ioctl_test
+
+CFLAGS += -Wall
+CFLAGS += -I../../../../usr/include
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c b/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
new file mode 100644
index 000000000000..cd9cf229ae1f
--- /dev/null
+++ b/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* kselftest for allocinfo ioctl
+ * allocinfo ioctl retrieves allocinfo data through ioctl
+ * Copyright (C) 2026 Google, Inc.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/alloc_tag.h>
+#include "../kselftest.h"
+
+#define MAX_LINE_LEN 512
+#define ALLOCINFO_PROC "/proc/allocinfo"
+
+enum ioctl_ret {
+ IOCTL_SUCCESS = 0,
+ IOCTL_FAILURE = 1,
+ IOCTL_INVALID_DATA = 2,
+};
+
+#define VEC_MAX_ENTRIES 32
+
+struct allocinfo_tag_data_vec {
+ struct allocinfo_tag_data tag[VEC_MAX_ENTRIES];
+ __u64 count;
+};
+
+static inline int __allocinfo_get_content_id(int dev_fd, struct allocinfo_content_id *params)
+{
+ return ioctl(dev_fd, ALLOCINFO_IOC_CONTENT_ID, params);
+}
+
+static inline int __allocinfo_get_at(int dev_fd, struct allocinfo_get_at *params)
+{
+ return ioctl(dev_fd, ALLOCINFO_IOC_GET_AT, params);
+}
+
+static inline int __allocinfo_get_next(int dev_fd, struct allocinfo_tag_data *params)
+{
+ return ioctl(dev_fd, ALLOCINFO_IOC_GET_NEXT, params);
+}
+
+static bool match_entry(const struct allocinfo_tag_data *procfs_entry,
+ const struct allocinfo_tag_data *tag_data,
+ bool match_bytes, bool match_calls, bool match_lineno,
+ bool match_function, bool match_filename)
+{
+ if (match_bytes && tag_data->counter.bytes != procfs_entry->counter.bytes) {
+ ksft_print_msg("size retrieved through ioctl does not match procfs\n");
+ return false;
+ }
+
+ if (match_calls && tag_data->counter.calls != procfs_entry->counter.calls) {
+ ksft_print_msg("call count retrieved through ioctl does not match procfs\n");
+ return false;
+ }
+
+ if (match_lineno && tag_data->tag.lineno != procfs_entry->tag.lineno) {
+ ksft_print_msg("lineno retrieved through ioctl does not match procfs\n");
+ return false;
+ }
+
+ if (match_function &&
+ strncmp(tag_data->tag.function, procfs_entry->tag.function, ALLOCINFO_STR_SIZE)) {
+ ksft_print_msg("function retrieved through ioctl does not match procfs\n");
+ return false;
+ }
+
+ if (match_filename &&
+ strncmp(tag_data->tag.filename, procfs_entry->tag.filename, ALLOCINFO_STR_SIZE)) {
+ ksft_print_msg("filename retrieved through ioctl does not match procfs\n");
+ return false;
+ }
+ return true;
+}
+
+static bool match_entries(const struct allocinfo_tag_data_vec *procfs_entries,
+ const struct allocinfo_tag_data_vec *tags,
+ bool match_bytes, bool match_calls, bool match_lineno,
+ bool match_function, bool match_filename)
+{
+ __u64 i;
+
+ if (procfs_entries->count != tags->count) {
+ ksft_print_msg("Entry count mismatch. ioctl entries: %llu, proc entries: %llu\n",
+ tags->count, procfs_entries->count);
+ return false;
+ }
+ for (i = 0; i < procfs_entries->count; i++) {
+ if (!match_entry(&procfs_entries->tag[i], &tags->tag[i],
+ match_bytes, match_calls, match_lineno,
+ match_function, match_filename)) {
+ ksft_print_msg("%lluth entry does not match.\n", i);
+ return false;
+ }
+ }
+ return true;
+}
+
+static const char *allocinfo_str(const char *str)
+{
+ size_t len = strlen(str);
+
+ if (len >= ALLOCINFO_STR_SIZE)
+ str += (len - ALLOCINFO_STR_SIZE) + 1;
+ return str;
+}
+
+static void allocinfo_copy_str(char *dest, const char *src)
+{
+ strncpy(dest, allocinfo_str(src), ALLOCINFO_STR_SIZE - 1);
+ dest[ALLOCINFO_STR_SIZE - 1] = '\0';
+}
+
+static int get_filtered_procfs_entries(struct allocinfo_tag_data_vec *procfs_entries,
+ const struct allocinfo_filter *filter, int fd)
+{
+ FILE *fp = fdopen(fd, "r");
+ char line[MAX_LINE_LEN];
+ int matches;
+ struct allocinfo_tag_data procfs_entry;
+
+ if (!fp) {
+ ksft_print_msg("Failed to open " ALLOCINFO_PROC " for reading\n");
+ return 1;
+ }
+ memset(procfs_entries, 0, sizeof(*procfs_entries));
+ while (fgets(line, sizeof(line), fp) && procfs_entries->count < VEC_MAX_ENTRIES) {
+ char filename[MAX_LINE_LEN];
+ char function[MAX_LINE_LEN];
+
+ memset(&procfs_entry, 0, sizeof(procfs_entry));
+ matches = sscanf(line, "%llu %llu %[^:]:%llu func:%s",
+ &procfs_entry.counter.bytes,
+ &procfs_entry.counter.calls,
+ filename,
+ &procfs_entry.tag.lineno,
+ function);
+
+ if (matches != 5)
+ continue;
+
+ allocinfo_copy_str(procfs_entry.tag.filename, filename);
+ allocinfo_copy_str(procfs_entry.tag.function, function);
+
+ if (filter->mask & ALLOCINFO_FILTER_MASK_FILENAME) {
+ if (strncmp(procfs_entry.tag.filename,
+ filter->fields.filename, ALLOCINFO_STR_SIZE))
+ continue;
+ }
+ if (filter->mask & ALLOCINFO_FILTER_MASK_FUNCTION) {
+ if (strncmp(procfs_entry.tag.function,
+ filter->fields.function, ALLOCINFO_STR_SIZE))
+ continue;
+ }
+ if (filter->mask & ALLOCINFO_FILTER_MASK_LINENO) {
+ if (procfs_entry.tag.lineno != filter->fields.lineno)
+ continue;
+ }
+ if (filter->mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) {
+ if (procfs_entry.counter.bytes < filter->min_size)
+ continue;
+ }
+ if (filter->mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) {
+ if (procfs_entry.counter.bytes > filter->max_size)
+ continue;
+ }
+
+ memcpy(&procfs_entries->tag[procfs_entries->count++], &procfs_entry,
+ sizeof(procfs_entry));
+ }
+ return 0;
+}
+
+static enum ioctl_ret get_filtered_ioctl_entries(struct allocinfo_tag_data_vec *tags,
+ const struct allocinfo_filter *filter, int fd,
+ __u64 start_pos)
+{
+ struct allocinfo_content_id start_cont_id, end_cont_id;
+ struct allocinfo_get_at get_at_params;
+ const int max_retries = 10;
+ int retry_count = 0;
+ int status;
+
+ /*
+ * __allocinfo_get_content_id may return different values if a kernel module was loaded
+ * between the two calls. If that happens, the data gathered cannot be considered consistent
+ * and hence needs to be fetched again to avoid flakiness.
+ */
+ do {
+ if (__allocinfo_get_content_id(fd, &start_cont_id)) {
+ ksft_print_msg("allocinfo_get_content_id failed\n");
+ return IOCTL_FAILURE;
+ }
+
+ memset(tags, 0, sizeof(*tags));
+ memset(&get_at_params, 0, sizeof(get_at_params));
+ memcpy(&get_at_params.filter, filter, sizeof(*filter));
+ get_at_params.pos = start_pos;
+ if (__allocinfo_get_at(fd, &get_at_params)) {
+ ksft_print_msg("allocinfo_get_at failed\n");
+ return IOCTL_FAILURE;
+ }
+ memcpy(&tags->tag[tags->count++], &get_at_params.data, sizeof(get_at_params.data));
+
+ while (tags->count < VEC_MAX_ENTRIES &&
+ __allocinfo_get_next(fd, &tags->tag[tags->count]) == 0)
+ tags->count++;
+
+ if (__allocinfo_get_content_id(fd, &end_cont_id)) {
+ ksft_print_msg("allocinfo_get_content_id failed\n");
+ return IOCTL_FAILURE;
+ }
+
+ if (start_cont_id.id == end_cont_id.id) {
+ status = IOCTL_SUCCESS;
+ } else {
+ ksft_print_msg("allocinfo_get_content_id mismatch, retrying...\n");
+ status = IOCTL_INVALID_DATA;
+ }
+ } while (status == IOCTL_INVALID_DATA && retry_count++ < max_retries);
+
+ return status;
+}
+
+static int run_filter_test(const struct allocinfo_filter *filter)
+{
+ int fd;
+ struct allocinfo_tag_data_vec *tags = malloc(sizeof(*tags));
+ struct allocinfo_tag_data_vec *procfs_entries = malloc(sizeof(*procfs_entries));
+ int ioctl_status;
+ int ret = KSFT_PASS;
+
+ if (!tags || !procfs_entries) {
+ ksft_print_msg("Memory allocation failed.\n");
+ ret = KSFT_FAIL;
+ goto freemem;
+ }
+
+ fd = open(ALLOCINFO_PROC, O_RDONLY);
+ if (fd < 0) {
+ ksft_print_msg("Failed to open " ALLOCINFO_PROC ": %s\n", strerror(errno));
+ ret = KSFT_SKIP;
+ goto freemem;
+ }
+
+ if (get_filtered_procfs_entries(procfs_entries, filter, fd)) {
+ ksft_print_msg("Error retrieving entries from " ALLOCINFO_PROC "\n");
+ ret = KSFT_FAIL;
+ goto exit;
+ }
+
+ if (procfs_entries->count == 0) {
+ ksft_print_msg("No entries found in " ALLOCINFO_PROC ", skipping test\n");
+ ret = KSFT_SKIP;
+ goto exit;
+ }
+
+ ioctl_status = get_filtered_ioctl_entries(tags, filter, fd, 0);
+ if (ioctl_status == IOCTL_INVALID_DATA) {
+ ksft_print_msg("Trouble retrieving valid IOCTL entries, skipping.\n");
+ ret = KSFT_SKIP;
+ goto exit;
+ }
+ if (ioctl_status == IOCTL_FAILURE) {
+ ksft_print_msg("Error retrieving IOCTL entries.\n");
+ ret = KSFT_FAIL;
+ goto exit;
+ }
+
+ if (!match_entries(procfs_entries, tags, false, false, true, true, true))
+ ret = KSFT_FAIL;
+
+exit:
+ close(fd);
+freemem:
+ free(tags);
+ free(procfs_entries);
+ return ret;
+}
+
+static int test_filename_filter(void)
+{
+ struct allocinfo_filter filter;
+ const char *target_filename = "mm/memory.c";
+
+ memset(&filter, 0, sizeof(filter));
+ filter.mask |= ALLOCINFO_FILTER_MASK_FILENAME;
+ strncpy(filter.fields.filename, target_filename, ALLOCINFO_STR_SIZE);
+
+ return run_filter_test(&filter);
+}
+
+static int test_function_filter(void)
+{
+ struct allocinfo_filter filter;
+ const char *target_function = "dup_mm";
+
+ memset(&filter, 0, sizeof(filter));
+ filter.mask |= ALLOCINFO_FILTER_MASK_FUNCTION;
+ strncpy(filter.fields.function, target_function, ALLOCINFO_STR_SIZE);
+
+ return run_filter_test(&filter);
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ ksft_set_plan(2);
+
+ ret = test_filename_filter();
+ if (ret == KSFT_SKIP)
+ ksft_test_result_skip("Skipping test_filename_filter\n");
+ else
+ ksft_test_result(ret == KSFT_PASS, "test_filename_filter\n");
+
+ ret = test_function_filter();
+ if (ret == KSFT_SKIP)
+ ksft_test_result_skip("Skipping test_function_filter\n");
+ else
+ ksft_test_result(ret == KSFT_PASS, "test_function_filter\n");
+
+ ksft_finished();
+}
--
2.54.0.1099.g489fc7bff1-goog
^ permalink raw reply related
* [PATCH v4 4/6] alloc_tag: add accuracy based filtering to ioctl
From: Abhishek Bapat @ 2026-06-10 0:12 UTC (permalink / raw)
To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781042698.git.abhishekbapat@google.com>
Extend the allocinfo filtering mechanism to allow users to filter tags
based on their accuracy.
Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
---
include/uapi/linux/alloc_tag.h | 4 ++++
lib/alloc_tag.c | 8 ++++++++
2 files changed, 12 insertions(+)
diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
index 7f5acbb44c14..6ea39c4869fe 100644
--- a/include/uapi/linux/alloc_tag.h
+++ b/include/uapi/linux/alloc_tag.h
@@ -26,6 +26,8 @@ struct allocinfo_tag {
char function[ALLOCINFO_STR_SIZE];
char filename[ALLOCINFO_STR_SIZE];
__u64 lineno;
+ /* filter criteria only; see allocinfo_counter.accurate for actual accuracy */
+ __u64 inaccurate;
};
/* The alignment ensures 32-bit compatible interfaces are not broken */
@@ -45,6 +47,7 @@ enum {
ALLOCINFO_FILTER_FUNCTION,
ALLOCINFO_FILTER_FILENAME,
ALLOCINFO_FILTER_LINENO,
+ ALLOCINFO_FILTER_INACCURATE,
ALLOCINFO_FILTER_MIN_SIZE,
ALLOCINFO_FILTER_MAX_SIZE,
__ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_MAX_SIZE
@@ -54,6 +57,7 @@ enum {
#define ALLOCINFO_FILTER_MASK_FUNCTION (1 << ALLOCINFO_FILTER_FUNCTION)
#define ALLOCINFO_FILTER_MASK_FILENAME (1 << ALLOCINFO_FILTER_FILENAME)
#define ALLOCINFO_FILTER_MASK_LINENO (1 << ALLOCINFO_FILTER_LINENO)
+#define ALLOCINFO_FILTER_MASK_INACCURATE (1 << ALLOCINFO_FILTER_INACCURATE)
#define ALLOCINFO_FILTER_MASK_MIN_SIZE (1 << ALLOCINFO_FILTER_MIN_SIZE)
#define ALLOCINFO_FILTER_MASK_MAX_SIZE (1 << ALLOCINFO_FILTER_MAX_SIZE)
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index a936cf18611a..73fb3d0ab821 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -249,6 +249,8 @@ static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter,
struct alloc_tag_counters *counters,
bool *fetched_counters)
{
+ bool inaccurate;
+
if (!filter || !filter->mask)
return true;
@@ -274,6 +276,12 @@ static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter,
ct->lineno != filter->fields.lineno)
return false;
+ if (filter->mask & ALLOCINFO_FILTER_MASK_INACCURATE) {
+ inaccurate = !!(ct->flags & CODETAG_FLAG_INACCURATE);
+ if (inaccurate != !!(filter->fields.inaccurate))
+ return false;
+ }
+
if (filter->mask & (ALLOCINFO_FILTER_MASK_MIN_SIZE | ALLOCINFO_FILTER_MASK_MAX_SIZE)) {
if (!*fetched_counters) {
*counters = allocinfo_prefetch_counters(ct);
--
2.54.0.1099.g489fc7bff1-goog
^ permalink raw reply related
* [PATCH v4 3/6] alloc_tag: add size-based filtering to ioctl
From: Abhishek Bapat @ 2026-06-10 0:12 UTC (permalink / raw)
To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781042698.git.abhishekbapat@google.com>
Extend the allocinfo filtering mechanism to allow users to filter tags
based on the total number of bytes allocated [min_size, max_size]. The
size range is inclusive.
Filtering by size involves retrieving allocinfo per-CPU counters, which
is an expensive operation. Hence, the performance of size-based
filtering will be worse than other filters.
Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
---
include/uapi/linux/alloc_tag.h | 8 ++++-
lib/alloc_tag.c | 63 ++++++++++++++++++++++++++++------
2 files changed, 59 insertions(+), 12 deletions(-)
diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
index 3b11877955b9..7f5acbb44c14 100644
--- a/include/uapi/linux/alloc_tag.h
+++ b/include/uapi/linux/alloc_tag.h
@@ -45,13 +45,17 @@ enum {
ALLOCINFO_FILTER_FUNCTION,
ALLOCINFO_FILTER_FILENAME,
ALLOCINFO_FILTER_LINENO,
- __ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_LINENO
+ ALLOCINFO_FILTER_MIN_SIZE,
+ ALLOCINFO_FILTER_MAX_SIZE,
+ __ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_MAX_SIZE
};
#define ALLOCINFO_FILTER_MASK_MODNAME (1 << ALLOCINFO_FILTER_MODNAME)
#define ALLOCINFO_FILTER_MASK_FUNCTION (1 << ALLOCINFO_FILTER_FUNCTION)
#define ALLOCINFO_FILTER_MASK_FILENAME (1 << ALLOCINFO_FILTER_FILENAME)
#define ALLOCINFO_FILTER_MASK_LINENO (1 << ALLOCINFO_FILTER_LINENO)
+#define ALLOCINFO_FILTER_MASK_MIN_SIZE (1 << ALLOCINFO_FILTER_MIN_SIZE)
+#define ALLOCINFO_FILTER_MASK_MAX_SIZE (1 << ALLOCINFO_FILTER_MAX_SIZE)
#define ALLOCINFO_FILTER_MASKS \
((1 << (__ALLOCINFO_FILTER_LAST + 1)) - 1)
@@ -59,6 +63,8 @@ enum {
struct allocinfo_filter {
__u64 mask; /* bitmask of the filter fields used */
struct allocinfo_tag fields;
+ __u64 min_size;
+ __u64 max_size;
};
struct allocinfo_get_at {
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 378fcd63b6c9..a936cf18611a 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -191,15 +191,26 @@ static int allocinfo_cmp_str(const char *str, const char *template)
return strncmp(allocinfo_str(str), template, ALLOCINFO_STR_SIZE);
}
+/* Fetch the per-CPU counters */
+static inline struct alloc_tag_counters allocinfo_prefetch_counters(struct codetag *ct)
+{
+ return alloc_tag_read(ct_to_alloc_tag(ct));
+}
+
/*
* Populates the UAPI allocinfo_tag_data structure with active runtime
* profiling counters extracted from the given kernel codetag.
*/
static void allocinfo_to_params(struct codetag *ct,
- struct allocinfo_tag_data *data)
+ struct allocinfo_tag_data *data,
+ struct alloc_tag_counters *counters)
{
- struct alloc_tag *tag = ct_to_alloc_tag(ct);
- struct alloc_tag_counters counter = alloc_tag_read(tag);
+ struct alloc_tag_counters local_counters;
+
+ if (!counters) {
+ local_counters = allocinfo_prefetch_counters(ct);
+ counters = &local_counters;
+ }
if (ct->modname)
allocinfo_copy_str(data->tag.modname, ct->modname);
@@ -208,9 +219,9 @@ static void allocinfo_to_params(struct codetag *ct,
allocinfo_copy_str(data->tag.function, ct->function);
allocinfo_copy_str(data->tag.filename, ct->filename);
data->tag.lineno = ct->lineno;
- data->counter.bytes = counter.bytes;
- data->counter.calls = counter.calls;
- data->counter.accurate = !alloc_tag_is_inaccurate(tag);
+ data->counter.bytes = counters->bytes;
+ data->counter.calls = counters->calls;
+ data->counter.accurate = !alloc_tag_is_inaccurate(ct_to_alloc_tag(ct));
}
/*
@@ -234,7 +245,9 @@ static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
* Verifies whether a given codetag satisfies the active filtering criteria by
* matching its characteristics against the specified filter.
*/
-static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter)
+static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter,
+ struct alloc_tag_counters *counters,
+ bool *fetched_counters)
{
if (!filter || !filter->mask)
return true;
@@ -261,6 +274,19 @@ static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter)
ct->lineno != filter->fields.lineno)
return false;
+ if (filter->mask & (ALLOCINFO_FILTER_MASK_MIN_SIZE | ALLOCINFO_FILTER_MASK_MAX_SIZE)) {
+ if (!*fetched_counters) {
+ *counters = allocinfo_prefetch_counters(ct);
+ *fetched_counters = true;
+ }
+ if ((filter->mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) &&
+ counters->bytes < filter->min_size)
+ return false;
+ if ((filter->mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) &&
+ counters->bytes > filter->max_size)
+ return false;
+ }
+
return true;
}
@@ -274,6 +300,8 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
struct codetag *ct;
struct allocinfo_get_at params = {0};
__u64 skip_count;
+ struct alloc_tag_counters counters;
+ bool fetched_counters;
if (copy_from_user(&params, arg, sizeof(params)))
return -EFAULT;
@@ -281,6 +309,11 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
if (params.filter.mask & ~ALLOCINFO_FILTER_MASKS)
return -EINVAL;
+ if ((params.filter.mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) &&
+ (params.filter.mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) &&
+ params.filter.min_size > params.filter.max_size)
+ return -EINVAL;
+
priv = m->private;
mutex_lock(&priv->ioctl_lock);
@@ -304,7 +337,8 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
ct = codetag_next_ct(&priv->ioctl_iter);
while (ct) {
- if (matches_filter(ct, &priv->filter)) {
+ fetched_counters = false;
+ if (matches_filter(ct, &priv->filter, &counters, &fetched_counters)) {
if (skip_count == 0)
break;
skip_count--;
@@ -313,7 +347,7 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
}
if (ct) {
- allocinfo_to_params(ct, &params.data);
+ allocinfo_to_params(ct, &params.data, fetched_counters ? &counters : NULL);
priv->positioned = true;
}
@@ -339,6 +373,8 @@ static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
struct codetag *ct;
struct allocinfo_tag_data params;
int ret = 0;
+ struct alloc_tag_counters counters;
+ bool fetched_counters;
memset(&params, 0, sizeof(params));
priv = m->private;
@@ -352,10 +388,15 @@ static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
}
ct = codetag_next_ct(&priv->ioctl_iter);
- while (ct && !matches_filter(ct, &priv->filter))
+ while (ct) {
+ fetched_counters = false;
+ if (matches_filter(ct, &priv->filter, &counters, &fetched_counters))
+ break;
ct = codetag_next_ct(&priv->ioctl_iter);
+ }
+
if (ct)
- allocinfo_to_params(ct, &params);
+ allocinfo_to_params(ct, &params, fetched_counters ? &counters : NULL);
if (!ct) {
priv->positioned = false;
--
2.54.0.1099.g489fc7bff1-goog
^ permalink raw reply related
* [PATCH v4 2/6] alloc_tag: add ioctl filters to /proc/allocinfo
From: Abhishek Bapat @ 2026-06-10 0:12 UTC (permalink / raw)
To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781042698.git.abhishekbapat@google.com>
Extend the capability of the IOCTL mechanism to filter allocations based
on tag's module name, function name, file name and line number.
Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
---
include/uapi/linux/alloc_tag.h | 26 ++++++++++++-
lib/alloc_tag.c | 68 ++++++++++++++++++++++++++++++++--
2 files changed, 89 insertions(+), 5 deletions(-)
diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
index 0928e1a48d49..3b11877955b9 100644
--- a/include/uapi/linux/alloc_tag.h
+++ b/include/uapi/linux/alloc_tag.h
@@ -40,8 +40,32 @@ struct allocinfo_tag_data {
struct allocinfo_counter counter;
};
+enum {
+ ALLOCINFO_FILTER_MODNAME,
+ ALLOCINFO_FILTER_FUNCTION,
+ ALLOCINFO_FILTER_FILENAME,
+ ALLOCINFO_FILTER_LINENO,
+ __ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_LINENO
+};
+
+#define ALLOCINFO_FILTER_MASK_MODNAME (1 << ALLOCINFO_FILTER_MODNAME)
+#define ALLOCINFO_FILTER_MASK_FUNCTION (1 << ALLOCINFO_FILTER_FUNCTION)
+#define ALLOCINFO_FILTER_MASK_FILENAME (1 << ALLOCINFO_FILTER_FILENAME)
+#define ALLOCINFO_FILTER_MASK_LINENO (1 << ALLOCINFO_FILTER_LINENO)
+
+#define ALLOCINFO_FILTER_MASKS \
+ ((1 << (__ALLOCINFO_FILTER_LAST + 1)) - 1)
+
+struct allocinfo_filter {
+ __u64 mask; /* bitmask of the filter fields used */
+ struct allocinfo_tag fields;
+};
+
struct allocinfo_get_at {
- __u64 pos; /* input */
+ /* inputs */
+ __u64 pos;
+ struct allocinfo_filter filter;
+ /* output */
struct allocinfo_tag_data data;
};
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index a0577215eb3d..378fcd63b6c9 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -49,6 +49,7 @@ struct allocinfo_private {
struct codetag_iterator iter;
struct codetag_iterator reported_iter;
bool print_header;
+ struct allocinfo_filter filter;
/* ioctl uses a separate iterator not to interfere with reads */
struct codetag_iterator ioctl_iter;
bool positioned; /* seq_open_private() sets to 0 */
@@ -184,6 +185,12 @@ static void allocinfo_copy_str(char *dest, const char *src)
strscpy_pad(dest, allocinfo_str(src), ALLOCINFO_STR_SIZE);
}
+/* Compare two strings and only consider the trimmed suffix if str is too long */
+static int allocinfo_cmp_str(const char *str, const char *template)
+{
+ return strncmp(allocinfo_str(str), template, ALLOCINFO_STR_SIZE);
+}
+
/*
* Populates the UAPI allocinfo_tag_data structure with active runtime
* profiling counters extracted from the given kernel codetag.
@@ -223,6 +230,40 @@ static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
return 0;
}
+/*
+ * Verifies whether a given codetag satisfies the active filtering criteria by
+ * matching its characteristics against the specified filter.
+ */
+static bool matches_filter(struct codetag *ct, struct allocinfo_filter *filter)
+{
+ if (!filter || !filter->mask)
+ return true;
+
+ if (filter->mask & ALLOCINFO_FILTER_MASK_MODNAME) {
+ /* user wants to filter by modname but ct->modname is NULL */
+ if (!ct->modname) {
+ /* validate if user was attempting to filter for built-in allocations */
+ if (filter->fields.modname[0] != '\0')
+ return false;
+ } else if (allocinfo_cmp_str(ct->modname, filter->fields.modname))
+ return false;
+ }
+
+ if ((filter->mask & ALLOCINFO_FILTER_MASK_FUNCTION) &&
+ ct->function && allocinfo_cmp_str(ct->function, filter->fields.function))
+ return false;
+
+ if ((filter->mask & ALLOCINFO_FILTER_MASK_FILENAME) &&
+ ct->filename && allocinfo_cmp_str(ct->filename, filter->fields.filename))
+ return false;
+
+ if ((filter->mask & ALLOCINFO_FILTER_MASK_LINENO) &&
+ ct->lineno != filter->fields.lineno)
+ return false;
+
+ return true;
+}
+
/*
* Seeks the ioctl iterator to the specified 0-indexed tag position, reads its
* profiling data and returns it to userspace.
@@ -231,29 +272,46 @@ static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
{
struct allocinfo_private *priv;
struct codetag *ct;
- __u64 pos;
struct allocinfo_get_at params = {0};
+ __u64 skip_count;
if (copy_from_user(&params, arg, sizeof(params)))
return -EFAULT;
+ if (params.filter.mask & ~ALLOCINFO_FILTER_MASKS)
+ return -EINVAL;
+
priv = m->private;
- pos = params.pos;
mutex_lock(&priv->ioctl_lock);
codetag_lock_module_list(alloc_tag_cttype);
- if (pos >= codetag_get_count(alloc_tag_cttype)) {
+ if (params.pos >= codetag_get_count(alloc_tag_cttype)) {
codetag_unlock_module_list(alloc_tag_cttype);
mutex_unlock(&priv->ioctl_lock);
return -ENOENT;
}
+ skip_count = params.pos;
+
+ if (params.filter.mask)
+ priv->filter = params.filter;
+ else
+ priv->filter.mask = 0;
+
/* Find the codetag */
priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
ct = codetag_next_ct(&priv->ioctl_iter);
- while (ct && pos--)
+
+ while (ct) {
+ if (matches_filter(ct, &priv->filter)) {
+ if (skip_count == 0)
+ break;
+ skip_count--;
+ }
ct = codetag_next_ct(&priv->ioctl_iter);
+ }
+
if (ct) {
allocinfo_to_params(ct, &params.data);
priv->positioned = true;
@@ -294,6 +352,8 @@ static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
}
ct = codetag_next_ct(&priv->ioctl_iter);
+ while (ct && !matches_filter(ct, &priv->filter))
+ ct = codetag_next_ct(&priv->ioctl_iter);
if (ct)
allocinfo_to_params(ct, &params);
--
2.54.0.1099.g489fc7bff1-goog
^ permalink raw reply related
* [PATCH v4 1/6] alloc_tag: add ioctl to /proc/allocinfo
From: Abhishek Bapat @ 2026-06-10 0:12 UTC (permalink / raw)
To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
Sourav Panda, Abhishek Bapat
In-Reply-To: <cover.1781042698.git.abhishekbapat@google.com>
From: Suren Baghdasaryan <surenb@google.com>
Add the following ioctl commands for /proc/allocinfo file:
ALLOCINFO_IOC_CONTENT_ID - gets content identifier which can be used
to check whether the file content has changed specifically due to module
load/unload. Every time a module is loaded / unloaded, the returned
value will be different. By comparing the identifier value at the
beginning and at the end of the content retrieval operation, users can
validate retrieved information for consistency.
ALLOCINFO_IOC_GET_AT - gets the record at the specified position. This
is the position of a record in /proc/allocinfo.
ALLOCINFO_IOC_GET_NEXT - gets the record next to the last retrieved
one. If no records were previously retrieved, returns the first
record.
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
---
Documentation/mm/allocation-profiling.rst | 5 +
.../userspace-api/ioctl/ioctl-number.rst | 2 +
MAINTAINERS | 1 +
include/linux/codetag.h | 2 +
include/uapi/linux/alloc_tag.h | 60 +++++
lib/alloc_tag.c | 232 +++++++++++++++++-
lib/codetag.c | 18 ++
7 files changed, 318 insertions(+), 2 deletions(-)
create mode 100644 include/uapi/linux/alloc_tag.h
diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst
index 5389d241176a..c3a28467955f 100644
--- a/Documentation/mm/allocation-profiling.rst
+++ b/Documentation/mm/allocation-profiling.rst
@@ -46,6 +46,11 @@ sysctl:
Runtime info:
/proc/allocinfo
+ Profiling data can be retrieved either by reading `/proc/allocinfo` directly as
+ text or programmatically via `ioctl()` calls defined in `<uapi/linux/alloc_tag.h>`.
+ The ioctl interface supports structured binary data extraction as well as filtering
+ by module name, function, file, line number, accuracy, or allocation size limits.
+
Example output::
root@moria-kvm:~# sort -g /proc/allocinfo|tail|numfmt --to=iec
diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 331223761fff..84f6808a8578 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -349,6 +349,8 @@ Code Seq# Include File Comments
<mailto:luzmaximilian@gmail.com>
0xA5 20-2F linux/surface_aggregator/dtx.h Microsoft Surface DTX driver
<mailto:luzmaximilian@gmail.com>
+0xA6 00-0F uapi/linux/alloc_tag.h Memory allocation profiling
+ <mailto:surenb@google.com>
0xAA 00-3F linux/uapi/linux/userfaultfd.h
0xAB 00-1F linux/nbd.h
0xAC 00-1F linux/raw.h
diff --git a/MAINTAINERS b/MAINTAINERS
index 65bd4328fe05..019cc4c285a3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16713,6 +16713,7 @@ S: Maintained
F: Documentation/mm/allocation-profiling.rst
F: include/linux/alloc_tag.h
F: include/linux/pgalloc_tag.h
+F: include/uapi/linux/alloc_tag.h
F: lib/alloc_tag.c
MEMORY CONTROLLER DRIVERS
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index ddae7484ca45..a25a085c2df1 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -77,6 +77,8 @@ struct codetag_iterator {
void codetag_lock_module_list(struct codetag_type *cttype);
bool codetag_trylock_module_list(struct codetag_type *cttype);
void codetag_unlock_module_list(struct codetag_type *cttype);
+unsigned long codetag_get_content_id(struct codetag_type *cttype);
+unsigned int codetag_get_count(struct codetag_type *cttype);
struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
struct codetag *codetag_next_ct(struct codetag_iterator *iter);
diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
new file mode 100644
index 000000000000..0928e1a48d49
--- /dev/null
+++ b/include/uapi/linux/alloc_tag.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * alloc_tag IOCTL API definition
+ *
+ * Copyright (C) 2026 Google, LLC. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _UAPI_ALLOC_TAG_H
+#define _UAPI_ALLOC_TAG_H
+
+#include <linux/types.h>
+
+#define ALLOCINFO_STR_SIZE 64
+
+struct allocinfo_content_id {
+ __u64 id;
+};
+
+struct allocinfo_tag {
+ /* Longer names are trimmed */
+ char modname[ALLOCINFO_STR_SIZE];
+ char function[ALLOCINFO_STR_SIZE];
+ char filename[ALLOCINFO_STR_SIZE];
+ __u64 lineno;
+};
+
+/* The alignment ensures 32-bit compatible interfaces are not broken */
+struct allocinfo_counter {
+ __u64 bytes;
+ __u64 calls;
+ __u8 accurate;
+} __attribute__((aligned(8)));
+
+struct allocinfo_tag_data {
+ struct allocinfo_tag tag;
+ struct allocinfo_counter counter;
+};
+
+struct allocinfo_get_at {
+ __u64 pos; /* input */
+ struct allocinfo_tag_data data;
+};
+
+#define _ALLOCINFO_IOC_CONTENT_ID 0
+#define _ALLOCINFO_IOC_GET_AT 1
+#define _ALLOCINFO_IOC_GET_NEXT 2
+
+#define ALLOCINFO_IOC_BASE 0xA6
+#define ALLOCINFO_IOC_CONTENT_ID _IOR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_CONTENT_ID, \
+ struct allocinfo_content_id)
+#define ALLOCINFO_IOC_GET_AT _IOWR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_GET_AT, \
+ struct allocinfo_get_at)
+#define ALLOCINFO_IOC_GET_NEXT _IOR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_GET_NEXT, \
+ struct allocinfo_tag_data)
+
+#endif /* _UAPI_ALLOC_TAG_H */
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index d9be1cf5187d..a0577215eb3d 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -5,6 +5,7 @@
#include <linux/gfp.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/page_ext.h>
#include <linux/pgalloc_tag.h>
#include <linux/proc_fs.h>
@@ -14,6 +15,7 @@
#include <linux/string_choices.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
+#include <uapi/linux/alloc_tag.h>
#define ALLOCINFO_FILE_NAME "allocinfo"
#define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag))
@@ -47,6 +49,10 @@ struct allocinfo_private {
struct codetag_iterator iter;
struct codetag_iterator reported_iter;
bool print_header;
+ /* ioctl uses a separate iterator not to interfere with reads */
+ struct codetag_iterator ioctl_iter;
+ bool positioned; /* seq_open_private() sets to 0 */
+ struct mutex ioctl_lock;
};
static void *allocinfo_start(struct seq_file *m, loff_t *pos)
@@ -130,6 +136,229 @@ static const struct seq_operations allocinfo_seq_op = {
.show = allocinfo_show,
};
+/*
+ * Initializes seq_file operations and allocates private state when opening
+ * the /proc/allocinfo procfs entry.
+ */
+static int allocinfo_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = seq_open_private(file, &allocinfo_seq_op,
+ sizeof(struct allocinfo_private));
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ struct allocinfo_private *priv = m->private;
+
+ mutex_init(&priv->ioctl_lock);
+ }
+ return ret;
+}
+
+/*
+ * Cleans up the seq_file state and frees up the private state allocated in
+ * allocinfo_open() when closing the /proc/allocinfo file descriptor.
+ */
+static int allocinfo_release(struct inode *inode, struct file *file)
+{
+ return seq_release_private(inode, file);
+}
+
+/*
+ * Returns a pointer to the suffix of a string so that its length fits within
+ * ALLOCINFO_STR_SIZE, preserving the trailing characters.
+ */
+static const char *allocinfo_str(const char *str)
+{
+ size_t len = strlen(str);
+
+ /* Keep an extra space for the trailing NUL. */
+ if (len >= ALLOCINFO_STR_SIZE)
+ str += (len - ALLOCINFO_STR_SIZE) + 1;
+ return str;
+}
+
+/* Copy a string and trim from the beginning if it's too long */
+static void allocinfo_copy_str(char *dest, const char *src)
+{
+ strscpy_pad(dest, allocinfo_str(src), ALLOCINFO_STR_SIZE);
+}
+
+/*
+ * Populates the UAPI allocinfo_tag_data structure with active runtime
+ * profiling counters extracted from the given kernel codetag.
+ */
+static void allocinfo_to_params(struct codetag *ct,
+ struct allocinfo_tag_data *data)
+{
+ struct alloc_tag *tag = ct_to_alloc_tag(ct);
+ struct alloc_tag_counters counter = alloc_tag_read(tag);
+
+ if (ct->modname)
+ allocinfo_copy_str(data->tag.modname, ct->modname);
+ else
+ data->tag.modname[0] = '\0';
+ allocinfo_copy_str(data->tag.function, ct->function);
+ allocinfo_copy_str(data->tag.filename, ct->filename);
+ data->tag.lineno = ct->lineno;
+ data->counter.bytes = counter.bytes;
+ data->counter.calls = counter.calls;
+ data->counter.accurate = !alloc_tag_is_inaccurate(tag);
+}
+
+/*
+ * Retrieves the unique content ID representing the current allocation tag module
+ * layout, allowing userspace to detect if modules were loaded / unloaded.
+ */
+static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
+{
+ struct allocinfo_content_id params;
+
+ codetag_lock_module_list(alloc_tag_cttype);
+ params.id = codetag_get_content_id(alloc_tag_cttype);
+ codetag_unlock_module_list(alloc_tag_cttype);
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Seeks the ioctl iterator to the specified 0-indexed tag position, reads its
+ * profiling data and returns it to userspace.
+ */
+static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
+{
+ struct allocinfo_private *priv;
+ struct codetag *ct;
+ __u64 pos;
+ struct allocinfo_get_at params = {0};
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ priv = m->private;
+ pos = params.pos;
+
+ mutex_lock(&priv->ioctl_lock);
+ codetag_lock_module_list(alloc_tag_cttype);
+
+ if (pos >= codetag_get_count(alloc_tag_cttype)) {
+ codetag_unlock_module_list(alloc_tag_cttype);
+ mutex_unlock(&priv->ioctl_lock);
+ return -ENOENT;
+ }
+
+ /* Find the codetag */
+ priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
+ ct = codetag_next_ct(&priv->ioctl_iter);
+ while (ct && pos--)
+ ct = codetag_next_ct(&priv->ioctl_iter);
+ if (ct) {
+ allocinfo_to_params(ct, &params.data);
+ priv->positioned = true;
+ }
+
+ codetag_unlock_module_list(alloc_tag_cttype);
+ mutex_unlock(&priv->ioctl_lock);
+
+ if (!ct)
+ return -ENOENT;
+
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/*
+ * Advances the ioctl iterator to the next allocation tag in the sequence and
+ * returns its profiling data to userspace.
+ */
+static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
+{
+ struct allocinfo_private *priv;
+ struct codetag *ct;
+ struct allocinfo_tag_data params;
+ int ret = 0;
+
+ memset(&params, 0, sizeof(params));
+ priv = m->private;
+
+ mutex_lock(&priv->ioctl_lock);
+ codetag_lock_module_list(alloc_tag_cttype);
+
+ if (!priv->positioned) {
+ priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
+ priv->positioned = true;
+ }
+
+ ct = codetag_next_ct(&priv->ioctl_iter);
+ if (ct)
+ allocinfo_to_params(ct, &params);
+
+ if (!ct) {
+ priv->positioned = false;
+ ret = -ENOENT;
+ }
+ codetag_unlock_module_list(alloc_tag_cttype);
+ mutex_unlock(&priv->ioctl_lock);
+
+ if (ret == 0) {
+ if (copy_to_user(arg, &params, sizeof(params)))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+/*
+ * Entry point ioctl function for /proc/allocinfo routing requests to fetch the
+ * layout content ID, seek to a specific tag, or read sequential tags.
+ */
+static long allocinfo_ioctl(struct file *file, unsigned int cmd,
+ unsigned long __arg)
+{
+ void __user *arg = (void __user *)__arg;
+ int ret;
+
+ switch (cmd) {
+ case ALLOCINFO_IOC_CONTENT_ID:
+ ret = allocinfo_ioctl_get_content_id(file->private_data, arg);
+ break;
+ case ALLOCINFO_IOC_GET_AT:
+ ret = allocinfo_ioctl_get_at(file->private_data, arg);
+ break;
+ case ALLOCINFO_IOC_GET_NEXT:
+ ret = allocinfo_ioctl_get_next(file->private_data, arg);
+ break;
+ default:
+ ret = -ENOIOCTLCMD;
+ break;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long allocinfo_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return allocinfo_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct proc_ops allocinfo_proc_ops = {
+ .proc_open = allocinfo_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = allocinfo_release,
+ .proc_ioctl = allocinfo_ioctl,
+#ifdef CONFIG_COMPAT
+ .proc_compat_ioctl = allocinfo_compat_ioctl,
+#endif
+
+};
+
size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep)
{
struct codetag_iterator iter;
@@ -993,8 +1222,7 @@ static int __init alloc_tag_init(void)
return 0;
}
- if (!proc_create_seq_private(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op,
- sizeof(struct allocinfo_private), NULL)) {
+ if (!proc_create(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_proc_ops)) {
pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
shutdown_mem_profiling(false);
return -ENOMEM;
diff --git a/lib/codetag.c b/lib/codetag.c
index 4001a7ea6675..a9cda4c962a3 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -19,6 +19,8 @@ struct codetag_type {
struct codetag_type_desc desc;
/* generates unique sequence number for module load */
unsigned long next_mod_seq;
+ /* bumped on every module load and unload */
+ unsigned long content_id;
};
struct codetag_range {
@@ -50,6 +52,20 @@ void codetag_unlock_module_list(struct codetag_type *cttype)
up_read(&cttype->mod_lock);
}
+unsigned long codetag_get_content_id(struct codetag_type *cttype)
+{
+ lockdep_assert_held(&cttype->mod_lock);
+
+ return cttype->content_id;
+}
+
+unsigned int codetag_get_count(struct codetag_type *cttype)
+{
+ lockdep_assert_held(&cttype->mod_lock);
+
+ return cttype->count;
+}
+
struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
{
struct codetag_iterator iter = {
@@ -204,6 +220,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
down_write(&cttype->mod_lock);
cmod->mod_seq = ++cttype->next_mod_seq;
+ ++cttype->content_id;
mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
if (mod_id >= 0) {
if (cttype->desc.module_load) {
@@ -368,6 +385,7 @@ void codetag_unload_module(struct module *mod)
cttype->count -= range_size(cttype, &cmod->range);
idr_remove(&cttype->mod_idr, mod_id);
kfree(cmod);
+ ++cttype->content_id;
}
up_write(&cttype->mod_lock);
if (found && cttype->desc.free_section_mem)
--
2.54.0.1099.g489fc7bff1-goog
^ permalink raw reply related
* [PATCH v4 0/6] alloc_tag: introduce IOCTL-based filtering for MAP
From: Abhishek Bapat @ 2026-06-10 0:12 UTC (permalink / raw)
To: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Hao Ge
Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
Sourav Panda, Abhishek Bapat
Currently, memory allocation profiling data is primarily exposed through
/proc/allocinfo. While useful for manual inspection, this text-based
interface poses challenges for production monitoring and large-scale
analysis:
1. Userspace must parse large amounts of text to extract specific
fields.
2. To find specific tags, userspace must read the entire dataset,
requiring many context switches and copying large amounts of data.
3. The kernel currently aggregates per-CPU counters for every allocation
size, even those the user intends to filter out immediately.
This series introduces a new IOCTL-based binary interface for allocinfo
that supports kernel-side filtering. By allowing the user to specify a
filter mask, we significantly reduce the work performed in-kernel and
the amount of data transferred to userspace. The IOCTL mechanism was
chosen for allocinfo to address the per-CPU counter aggregation
bottleneck. A traditional read() operation must report the total
allocation count and sizes for every code tag in the system. Doing so
requires iterating across all CPUs to sum their per-CPU counters for
thousands of tags, which introduces substantial runtime overhead.
The IOCTL interface allows userspace to push selective filtering
criteria directly into the kernel before the per-CPU counter
aggregation. The kernel aggregates per-CPU counters only for a small
subset of tags that match the filter. This results in significant
performance improvement.
Beyond fast filtered retrieval, the IOCTL foundation allows introducing
a context capture mechanism in the future to capture the context for
specific allocations.
Performance measurements were conducted on an Intel Xeon Platinum 8481C
(224 CPUs) with caches dropped before each run.
The IOCTL mechanism shows a ~20x performance improvement for
filtered queries. The kernel avoids the expensive per-CPU counter
aggregation (alloc_tag_read) for any tags that fail the initial string
or location filters.
Scenario 1: Specific File Filtering (arch/x86/events/rapl.c)
1. Traditional (cat /proc/allocinfo | grep): 22ms (sys)
2. IOCTL Interface: 1ms (sys)
Scenario 2: Compound Filtering (Filename + Size)
1. Traditional: (cat ... | grep | awk): 21ms (sys)
2. IOCTL Interface: 1ms (sys)
Scenario 3: Size-Based Filtering (min_size = 1MB)
1. Traditional: (cat ... | awk): 21ms (sys)
2. IOCTL Interface: 14ms (sys)
v4 changes:
- Patch 1/6: Fixed a copyright comment inside
include/uapi/linux/alloc_tag.h
- Patch 3/6: Among other nits, fixed the inadvertent build failure
introduced in v3.
- Patch 4/6: Included a comment stating that the accurate field in
struct allocinfo_tag is only used for filtering.
- Patch 5/6: Modified test to trim prefix and keep suffix for entries
with filenames exceeding the size limit.
- Patch 6/6: Modified test_size_filter such that if content_id changes
between the moment when procfs and ioctl entries are read, both
entries are invalidated and re-fetched. Removed the tags->count == 0
check from test_lineno_filter as it's virtually unreachable.
v3 changes:
- Patch 1/6: Modified Documentation to indicate that map supports
ioctl(). Modified struct allocinfo_count to use
__attribute__((aligned(8))) instead of manual padding. Removed
redundant type-casting. Added comments for static functions in
lib/alloc_tag.c. Introduced a new seq counter for content_id that gets
bumped every time a module is loaded / unloaded. Introduced logic to
validate that the user-specified position is not greater than the number
of allocation tags and return early if it is. Changed strscpy to
strscpy_pad to not echo arbitrary user data back to the user.
- Patch 2/6: Handled the case where user wants to specifically filter
for built-in modules. Included some comments for static functions.
- Patch 3/6: Modified logic to only fetch per-CPU counters for codetags
that satisfy other filters. Included some comments for static
functions.
v2 changes:
- Patch 1/6: Introduced locking for m->private. Also included the new uapi
header file in MAINTAINERS list.
- Patch 2/6: Handled the case where ALLOCINFO_FILTER_MASK_MODNAME is
passed but ct->modname is NULL.
- Patch 3/6: Moved min_size and max_size outside of struct allocinfo_tag
into struct allocinfo_filter. Added validation that min_size <=
max_size. Prefetched alloc_tag_counters if size based filter masks are
provided to avoid aggregating per-CPU counters twice.
- Patch 5/6: Removed the hardcoded logic to skip the header, instead the
test will skip lines that don't match the format. Also included the
newly added alloc_tag selftests directory in MAINTAINERS list.
Abhishek Bapat (5):
alloc_tag: add ioctl filters to /proc/allocinfo
alloc_tag: add size-based filtering to ioctl
alloc_tag: add accuracy based filtering to ioctl
kselftest: alloc_tag: add kselftest for ioctl interface
kselftest: alloc_tag: extend the allocinfo ioctl kselftest
Suren Baghdasaryan (1):
alloc_tag: add ioctl to /proc/allocinfo
Documentation/mm/allocation-profiling.rst | 5 +
.../userspace-api/ioctl/ioctl-number.rst | 2 +
MAINTAINERS | 2 +
include/linux/codetag.h | 2 +
include/uapi/linux/alloc_tag.h | 94 +++
lib/alloc_tag.c | 341 ++++++++++-
lib/codetag.c | 18 +
tools/testing/selftests/alloc_tag/Makefile | 9 +
.../alloc_tag/allocinfo_ioctl_test.c | 535 ++++++++++++++++++
9 files changed, 1006 insertions(+), 2 deletions(-)
create mode 100644 include/uapi/linux/alloc_tag.h
create mode 100644 tools/testing/selftests/alloc_tag/Makefile
create mode 100644 tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
--
2.54.0.1099.g489fc7bff1-goog
^ permalink raw reply
* Re: [PATCH v6 08/12] PCI: liveupdate: Inherit ACS flags in incoming preserved devices
From: Jason Gunthorpe @ 2026-06-10 0:07 UTC (permalink / raw)
To: Pranjal Shrivastava
Cc: David Matlack, kexec, linux-doc, linux-kernel, linux-mm,
linux-pci, Adithya Jayachandran, Alexander Graf, Alex Williamson,
Bjorn Helgaas, Chris Li, David Rientjes, Jacob Pan,
Jonathan Corbet, Josh Hilke, Leon Romanovsky, Lukas Wunner,
Mike Rapoport, Parav Pandit, Pasha Tatashin, Pratyush Yadav,
Saeed Mahameed, Samiullah Khawaja, Shuah Khan, Vipin Sharma,
William Tu, Yi Liu
In-Reply-To: <aihLTgs1Y49OXQaV@google.com>
On Tue, Jun 09, 2026 at 05:20:14PM +0000, Pranjal Shrivastava wrote:
> Now, the attacker has an opportunity with Liveupdate, since the devices
> are already assigned, if *somehow* it flips a bit like ACS_RR, the
If this is possible then your environment is already security broken,
no need to involve live update.
Jason
^ permalink raw reply
* Re: [PATCH v4 0/2] docs/mm/damon: fix docs and update zh_CN
From: SeongJae Park @ 2026-06-09 23:55 UTC (permalink / raw)
To: Doehyun Baek
Cc: SeongJae Park, Dongliang Mu, Jonathan Corbet, Shuah Khan,
Alex Shi, Yanteng Si, Hu Haowen, linux-doc, linux-kernel, damon
In-Reply-To: <cover.1781015560.git.doehyunbaek@gmail.com>
Hello Doehyun,
On Tue, 9 Jun 2026 14:34:24 +0000 Doehyun Baek <doehyunbaek@gmail.com> wrote:
> First of all, thank you very much, Dongliang, for your time and
> dedication in reviewing the previous versions.
>
> This v4 sends the original English DAMON documentation fixes as the
> first patch, and the Simplified Chinese translation update as the
> second patch.
>
> For zh_CN, I translated the current DAMON usage.rst paragraph by
> paragraph, and added missing pieces such as stat.rst and the related
> index/design references. The zh_TW changes from earlier versions are
> dropped from this series.
Thank you for sharing this patch series! However, to my understanding, the
paths to the mainline for English documents and Chinese documents are different.
Sending patches for the English document and the Chinese document as one series
is therefore making it complicated, in my opinion. Could you please rebase the
English document part onto mm-new [1] and send it as a separate patch?
[1] https://origin.kernel.org/doc/html/latest/mm/damon/maintainer-profile.html#scm-trees
Thanks,
SJ
[...]
^ permalink raw reply
* Re: [PATCH net-next v6 12/12] net: airoha: add phylink support
From: Christian Marangi @ 2026-06-09 23:51 UTC (permalink / raw)
To: Lorenzo Bianconi
Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Simon Horman, Jonathan Corbet, Shuah Khan, Heiner Kallweit,
Russell King, Saravana Kannan, Philipp Zabel, Nathan Chancellor,
Nick Desaulniers, Bill Wendling, Justin Stitt, netdev, devicetree,
linux-kernel, linux-doc, linux-arm-kernel, linux-mediatek, llvm
In-Reply-To: <aigxaDtZDnI-RTwN@lore-desk>
On Tue, Jun 09, 2026 at 05:29:44PM +0200, Lorenzo Bianconi wrote:
> > Add phylink support for each GDM port. For GDM1 add the internal interface
> > mode as the only supported mode. For GDM2/3/4 add the required
> > configuration of the PCS to make the external PHY or attached SFP cage
> > work.
> >
> > These needs to be defined in the GDM port node using the pcs-handle
> > property.
> >
> > Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
>
> Hi Christian,
>
> some nits inline.
>
> Regards,
> Lorenzo
>
> > ---
> > drivers/net/ethernet/airoha/Kconfig | 1 +
> > drivers/net/ethernet/airoha/airoha_eth.c | 167 +++++++++++++++++++++-
> > drivers/net/ethernet/airoha/airoha_eth.h | 3 +
> > drivers/net/ethernet/airoha/airoha_regs.h | 12 ++
> > 4 files changed, 181 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/airoha/Kconfig b/drivers/net/ethernet/airoha/Kconfig
> > index ad3ce501e7a5..38dcc76e5998 100644
> > --- a/drivers/net/ethernet/airoha/Kconfig
> > +++ b/drivers/net/ethernet/airoha/Kconfig
> > @@ -20,6 +20,7 @@ config NET_AIROHA
> > depends on NET_DSA || !NET_DSA
> > select NET_AIROHA_NPU
> > select PAGE_POOL
> > + select PHYLINK
> > help
> > This driver supports the gigabit ethernet MACs in the
> > Airoha SoC family.
> > diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
> > index 5a8e84fa9918..eabd7b058f82 100644
> > --- a/drivers/net/ethernet/airoha/airoha_eth.c
> > +++ b/drivers/net/ethernet/airoha/airoha_eth.c
> > @@ -8,6 +8,7 @@
> > #include <linux/of_reserved_mem.h>
> > #include <linux/platform_device.h>
> > #include <linux/tcp.h>
> > +#include <linux/pcs/pcs.h>
>
> Can you please respect the alphabetic order?
>
> > #include <linux/u64_stats_sync.h>
> > #include <net/dst_metadata.h>
> > #include <net/page_pool/helpers.h>
> > @@ -1779,6 +1780,15 @@ static int airoha_dev_open(struct net_device *netdev)
> > u32 cur_len, pse_port = FE_PSE_PORT_PPE1;
> > struct airoha_qdma *qdma = dev->qdma;
> >
> > + err = phylink_of_phy_connect(dev->phylink, netdev->dev.of_node, 0);
> > + if (err) {
> > + netdev_err(netdev, "%s: could not attach PHY: %d\n", __func__,
> > + err);
>
> Do we need specify the __func__ argument here?
>
I was following a pattern also used in other drivers. Maybe we should stop
following it?
> > + return err;
> > + }
> > +
> > + phylink_start(dev->phylink);
> > +
> > netif_tx_start_all_queues(netdev);
> > err = airoha_set_vip_for_gdm_port(dev, true);
> > if (err)
> > @@ -1876,6 +1886,9 @@ static int airoha_dev_stop(struct net_device *netdev)
> > }
> > }
> >
> > + phylink_stop(dev->phylink);
> > + phylink_disconnect_phy(dev->phylink);
> > +
> > return 0;
> > }
> >
> > @@ -3148,6 +3161,153 @@ bool airoha_is_valid_gdm_dev(struct airoha_eth *eth,
> > return false;
> > }
> >
> > +/* Nothing to do in MAC, everything is handled in PCS */
> > +static void airoha_mac_config(struct phylink_config *config, unsigned int mode,
> > + const struct phylink_link_state *state)
> > +{
> > +}
> > +
> > +static void airoha_mac_link_up(struct phylink_config *config, struct phy_device *phy,
> > + unsigned int mode, phy_interface_t interface,
> > + int speed, int duplex, bool tx_pause, bool rx_pause)
> > +{
> > + struct airoha_gdm_dev *dev = container_of(config, struct airoha_gdm_dev,
> > + phylink_config);
> > + struct airoha_gdm_port *port = dev->port;
> > + struct airoha_eth *eth = dev->eth;
> > + u32 frag_size_tx, frag_size_rx;
> > + u32 mask, val;
> > +
> > + /* TX/RX frag is configured only for GDM4 */
> > + if (port->id != 4)
>
> if (port->id != AIROHA_GDM4_IDX)
> ...
>
> > + return;
> > +
> > + switch (speed) {
> > + case SPEED_10000:
> > + case SPEED_5000:
> > + frag_size_tx = 8;
> > + frag_size_rx = 8;
> > + break;
> > + case SPEED_2500:
> > + frag_size_tx = 2;
> > + frag_size_rx = 1;
> > + break;
> > + default:
> > + frag_size_tx = 1;
> > + frag_size_rx = 0;
> > + }
> > +
> > + /* Configure TX/RX frag based on speed */
> > + if (dev->nbq == 1) {
> > + mask = GDMA4_SGMII1_TX_FRAG_SIZE_MASK;
> > + val = FIELD_PREP(GDMA4_SGMII1_TX_FRAG_SIZE_MASK,
> > + frag_size_tx);
> > + } else {
> > + mask = GDMA4_SGMII0_TX_FRAG_SIZE_MASK;
> > + val = FIELD_PREP(GDMA4_SGMII0_TX_FRAG_SIZE_MASK,
> > + frag_size_tx);
> > + }
> > + airoha_fe_rmw(eth, REG_GDMA4_TMBI_FRAG, mask, val);
> > +
> > + if (dev->nbq == 1) {
> > + mask = GDMA4_SGMII1_RX_FRAG_SIZE_MASK;
> > + val = FIELD_PREP(GDMA4_SGMII1_RX_FRAG_SIZE_MASK,
> > + frag_size_tx);
> > + } else {
> > + mask = GDMA4_SGMII0_RX_FRAG_SIZE_MASK;
> > + val = FIELD_PREP(GDMA4_SGMII0_RX_FRAG_SIZE_MASK,
> > + frag_size_tx);
> > + }
> > + airoha_fe_rmw(eth, REG_GDMA4_RMBI_FRAG, mask, val);
> > +}
> > +
> > +/* Nothing to do in MAC, everything is handled in PCS */
> > +static void airoha_mac_link_down(struct phylink_config *config, unsigned int mode,
> > + phy_interface_t interface)
> > +{
> > +}
> > +
> > +static const struct phylink_mac_ops airoha_phylink_ops = {
> > + .mac_config = airoha_mac_config,
> > + .mac_link_up = airoha_mac_link_up,
> > + .mac_link_down = airoha_mac_link_down,
> > +};
> > +
> > +static int airoha_fill_available_pcs(struct phylink_config *config,
> > + struct phylink_pcs **available_pcs,
> > + unsigned int num_available_pcs)
> > +{
> > + struct device *dev = config->dev;
> > +
> > + return fwnode_phylink_pcs_parse(dev_fwnode(dev), available_pcs,
> > + &num_available_pcs);
> > +}
> > +
> > +static int airoha_setup_phylink(struct net_device *netdev)
> > +{
> > + struct airoha_gdm_dev *dev = netdev_priv(netdev);
> > + struct device_node *np = netdev->dev.of_node;
> > + struct airoha_gdm_port *port = dev->port;
> > + struct phylink_config *config;
> > + phy_interface_t phy_mode;
> > + struct phylink *phylink;
> > + int err;
> > +
> > + err = of_get_phy_mode(np, &phy_mode);
> > + if (err) {
> > + dev_err(&netdev->dev, "incorrect phy-mode\n");
> > + return err;
> > + }
> > +
> > + config = &dev->phylink_config;
>
> remove new-line here.
>
> > +
> > + config->dev = &netdev->dev;
> > + config->type = PHYLINK_NETDEV;
> > + config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10000FD;
> > + if (port->id > AIROHA_GDM1_IDX)
>
> maybe
> if (port->id != AIROHA_GDM1_IDX)
> ...
>
> > + config->mac_capabilities |= MAC_10 | MAC_100 | MAC_1000 |
> > + MAC_2500FD | MAC_5000FD;
> > +
> > + err = fwnode_phylink_pcs_parse(dev_fwnode(&netdev->dev), NULL,
> > + &config->num_available_pcs);
> > + if (err)
> > + return err;
> > +
> > + config->fill_available_pcs = airoha_fill_available_pcs;
> > +
> > + /*
> > + * GDM1 only supports internal for Embedded Switch
> > + * and doesn't require a PCS.
> > + */
> > + if (port->id == AIROHA_GDM1_IDX) {
> > + __set_bit(PHY_INTERFACE_MODE_INTERNAL,
> > + config->supported_interfaces);
> > + } else {
> > + __set_bit(PHY_INTERFACE_MODE_SGMII,
> > + config->supported_interfaces);
> > + __set_bit(PHY_INTERFACE_MODE_1000BASEX,
> > + config->supported_interfaces);
> > + __set_bit(PHY_INTERFACE_MODE_2500BASEX,
> > + config->supported_interfaces);
> > + __set_bit(PHY_INTERFACE_MODE_10GBASER,
> > + config->supported_interfaces);
> > + __set_bit(PHY_INTERFACE_MODE_USXGMII,
> > + config->supported_interfaces);
> > +
> > + phy_interface_copy(config->pcs_interfaces,
> > + config->supported_interfaces);
> > + }
> > +
> > + phylink = phylink_create(config, of_fwnode_handle(np),
> > + phy_mode, &airoha_phylink_ops);
> > + if (IS_ERR(phylink))
> > + return PTR_ERR(phylink);
> > +
> > + dev->phylink = phylink;
> > +
> > + return 0;
> > +}
> > +
> > static int airoha_alloc_gdm_device(struct airoha_eth *eth,
> > struct airoha_gdm_port *port,
> > int nbq, struct device_node *np)
> > @@ -3210,7 +3370,7 @@ static int airoha_alloc_gdm_device(struct airoha_eth *eth,
> > dev->nbq = nbq;
> > port->devs[index] = dev;
> >
> > - return 0;
> > + return airoha_setup_phylink(netdev);
> > }
> >
> > static int airoha_alloc_gdm_port(struct airoha_eth *eth,
> > @@ -3435,8 +3595,10 @@ static int airoha_probe(struct platform_device *pdev)
> > continue;
> >
> > netdev = netdev_from_priv(dev);
> > - if (netdev->reg_state == NETREG_REGISTERED)
> > + if (netdev->reg_state == NETREG_REGISTERED) {
> > + phylink_destroy(dev->phylink);
> > unregister_netdev(netdev);
> > + }
> > of_node_put(netdev->dev.of_node);
> > }
> > airoha_metadata_dst_free(port);
> > @@ -3472,6 +3634,7 @@ static void airoha_remove(struct platform_device *pdev)
> > continue;
> >
> > netdev = netdev_from_priv(dev);
> > + phylink_destroy(dev->phylink);
> > unregister_netdev(netdev);
> > of_node_put(netdev->dev.of_node);
> > }
> > diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h
> > index 8f42973f9cf5..1b25603dc64d 100644
> > --- a/drivers/net/ethernet/airoha/airoha_eth.h
> > +++ b/drivers/net/ethernet/airoha/airoha_eth.h
> > @@ -554,6 +554,9 @@ struct airoha_gdm_dev {
> >
> > u32 flags;
> > int nbq;
> > +
> > + struct phylink *phylink;
> > + struct phylink_config phylink_config;
> > };
> >
> > struct airoha_gdm_port {
> > diff --git a/drivers/net/ethernet/airoha/airoha_regs.h b/drivers/net/ethernet/airoha/airoha_regs.h
> > index 436f3c8779c1..27f2583e143a 100644
> > --- a/drivers/net/ethernet/airoha/airoha_regs.h
> > +++ b/drivers/net/ethernet/airoha/airoha_regs.h
> > @@ -358,6 +358,18 @@
> > #define IP_FRAGMENT_PORT_MASK GENMASK(8, 5)
> > #define IP_FRAGMENT_NBQ_MASK GENMASK(4, 0)
> >
> > +#define REG_GDMA4_TMBI_FRAG 0x2028
> > +#define GDMA4_SGMII1_TX_WEIGHT_MASK GENMASK(31, 26)
> > +#define GDMA4_SGMII1_TX_FRAG_SIZE_MASK GENMASK(25, 16)
> > +#define GDMA4_SGMII0_TX_WEIGHT_MASK GENMASK(15, 10)
> > +#define GDMA4_SGMII0_TX_FRAG_SIZE_MASK GENMASK(9, 0)
> > +
> > +#define REG_GDMA4_RMBI_FRAG 0x202c
> > +#define GDMA4_SGMII1_RX_WEIGHT_MASK GENMASK(31, 26)
> > +#define GDMA4_SGMII1_RX_FRAG_SIZE_MASK GENMASK(25, 16)
> > +#define GDMA4_SGMII0_RX_WEIGHT_MASK GENMASK(15, 10)
> > +#define GDMA4_SGMII0_RX_FRAG_SIZE_MASK GENMASK(9, 0)
> > +
> > #define REG_MC_VLAN_EN 0x2100
> > #define MC_VLAN_EN_MASK BIT(0)
> >
> > --
> > 2.53.0
> >
--
Ansuel
^ permalink raw reply
* [PATCH v1] arm64: errata: Mitigate TLBI errata on NVIDIA Olympus CPU
From: Shanker Donthineni @ 2026-06-09 23:40 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, linux-arm-kernel, Mark Rutland
Cc: linux-kernel, linux-doc, Shanker Donthineni, Vikram Sethi,
Jason Sequeira, Alok Mooley, Rich Wiley
NVIDIA Olympus cores are affected by the TLBI completion issue tracked as
CVE-2025-10263. The existing ARM64_ERRATUM_4118414 handling already uses
ARM64_WORKAROUND_REPEAT_TLBI to issue an additional broadcast TLBI;DSB
sequence and ensure affected memory write effects are globally observed.
Add MIDR_NVIDIA_OLYMPUS to the repeat-TLBI match list so the same
mitigation is enabled on affected Olympus systems. Also document the
NVIDIA Olympus erratum in the arm64 silicon errata table and list it in
the Kconfig help text.
Signed-off-by: Shanker Donthineni <sdonthineni@nvidia.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
---
Note: This patch depends on the following series as a prerequisite:
https://lore.kernel.org/all/20260609101203.1512409-1-mark.rutland@arm.com/
Documentation/arch/arm64/silicon-errata.rst | 2 ++
arch/arm64/Kconfig | 3 ++-
arch/arm64/kernel/cpu_errata.c | 1 +
3 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index a01e916ede17..ad09bbb10da8 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -298,6 +298,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | Carmel Core | N/A | NVIDIA_CARMEL_CNP_ERRATUM |
+----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA | Olympus core | T410-OLY-1029 | ARM64_ERRATUM_4118414 |
++----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A |
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 48233b54c482..c65cef81be86 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1155,7 +1155,7 @@ config ARM64_ERRATUM_4193714
If unsure, say Y.
config ARM64_ERRATUM_4118414
- bool "Cortex-*/Neoverse-*/C1-*: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
+ bool "Cortex-*/Neoverse-*/C1-*/Olympus: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
default y
select ARM64_WORKAROUND_REPEAT_TLBI
help
@@ -1182,6 +1182,7 @@ config ARM64_ERRATUM_4118414
* ARM Neoverse-V2 erratum 4193787
* ARM Neoverse-V3 erratum 4193784
* ARM Neoverse-V3AE erratum 4193784
+ * NVIDIA Olympus erratum T410-OLY-1029
On affected cores, some memory accesses might not be completed by
broadcast TLB invalidation.
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index fe6fe5de495b..d597896b0f7f 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -364,6 +364,7 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = {
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3),
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE),
+ MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
{}
})),
},
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v2 00/16] Bump minimum version of LLVM for building the kernel to 17.0.1
From: Nathan Chancellor @ 2026-06-09 23:28 UTC (permalink / raw)
To: Nicolas Schier, Bill Wendling, Justin Stitt, Nick Desaulniers,
Nathan Chancellor
Cc: linux-kernel, llvm, linux-kbuild, Jonathan Corbet, Shuah Khan,
linux-doc, Kees Cook, Gustavo A. R. Silva, linux-hardening,
linux-security-module, Rong Xu, Han Shen, Russell King,
Arnd Bergmann, linux-arm-kernel, Paul Walmsley, Palmer Dabbelt,
Albert Ou, Alexandre Ghiti, linux-riscv, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Ard Biesheuvel, Peter Zijlstra
In-Reply-To: <20260517-bump-minimum-supported-llvm-version-to-17-v2-0-b3b8cda46bdd@kernel.org>
On Sun, 17 May 2026 13:05:03 -1000, Nathan Chancellor wrote:
> Bump minimum version of LLVM for building the kernel to 17.0.1
>
> The current minimum version of LLVM for building the kernel is 15.0.0.
> However, there are two deficiencies compared to GCC that were fixed in
> LLVM 17 that are starting to become more noticeable.
>
> The first was a bug in LLVM's scope checker [1], where all labels in a
> function were validated as potential targets of an asm goto statement,
> even if they were not listed in the asm goto statement as targets. This
> becomes particularly problematic when the cleanup attribute is used, as
>
> [...]
Applied to
https://git.kernel.org/pub/scm/linux/kernel/git/kbuild/linux.git kbuild-next
Thanks!
[01/16] kbuild: Bump minimum version of LLVM for building the kernel to 17.0.1
https://git.kernel.org/kbuild/c/ce3267a39a92b
[02/16] security/Kconfig.hardening: Remove tautological condition from CC_HAS_ZERO_CALL_USED_REGS
https://git.kernel.org/kbuild/c/813fe686e90b4
[03/16] security/Kconfig.hardening: Remove tautological condition from FORTIFY_SOURCE
https://git.kernel.org/kbuild/c/8ad2017578c99
[04/16] security/Kconfig.hardening: Remove tautological condition from CC_HAS_RANDSTRUCT
https://git.kernel.org/kbuild/c/9331258bc129a
[05/16] arch/Kconfig: Remove tautological conditions from HAS_LTO_CLANG
https://git.kernel.org/kbuild/c/2189cb1a80f06
[06/16] arch/Kconfig: Remove tautological condition from AUTOFDO_CLANG
https://git.kernel.org/kbuild/c/de0bf1e138fcd
[07/16] ARM: Drop tautological ld.lld conditions from ARCH_MULTI_V4{,T}
https://git.kernel.org/kbuild/c/48d229b6a48ae
[08/16] riscv: Remove tautological condition from selection of ARCH_SUPPORTS_CFI
https://git.kernel.org/kbuild/c/62c4af8689511
[09/16] riscv: Drop tautological condition from TOOLCHAIN_NEEDS_OLD_ISA_SPEC
https://git.kernel.org/kbuild/c/7e279976cf2a2
[10/16] scripts/Makefile.warn: Drop -Wformat handling for clang < 16
https://git.kernel.org/kbuild/c/2a35c63c6bc42
[11/16] x86/build: Drop unnecessary '-ffreestanding' addition to KBUILD_CFLAGS
https://git.kernel.org/kbuild/c/7b3281fcb43c5
[12/16] x86/module: Revert "Deal with GOT based stack cookie load on Clang < 17"
https://git.kernel.org/kbuild/c/12b7bf92bddd4
[13/16] x86/entry/vdso32: Remove conditional omission of '.cfi_offset eflags'
https://git.kernel.org/kbuild/c/4e7af20d0d104
[14/16] kbuild: Remove check for broken scoping with clang < 17 in CC_HAS_ASM_GOTO_OUTPUT
https://git.kernel.org/kbuild/c/f3de78cb19d12
[15/16] compiler-clang.h: Remove __cleanup -Wunused-variable workaround
https://git.kernel.org/kbuild/c/c69eaa687667e
[16/16] compiler-clang.h: Drop explicit version number from "all" diagnostic macro
https://git.kernel.org/kbuild/c/c919893eabb43
Please look out for regression or issue reports or other follow up
comments, as they may result in the patch/series getting dropped or
reverted. Patches applied to an "unstable" branch are accepted pending
wider testing in -next and any post-commit review; they will generally
be moved to the main branch in a week if no issues are found.
Best regards,
--
Cheers,
Nathan
^ permalink raw reply
* Re: [PATCH net-next] docs: networking: add guidance on what to push via extack
From: Randy Dunlap @ 2026-06-09 23:34 UTC (permalink / raw)
To: Jakub Kicinski, davem
Cc: netdev, edumazet, pabeni, andrew+netdev, horms, corbet, skhan,
linux-doc
In-Reply-To: <20260609190919.1139517-1-kuba@kernel.org>
On 6/9/26 12:09 PM, Jakub Kicinski wrote:
> Every now and then someone tries to duplicated extack
duplicate
> messages to dmesg. Document our guidance against this.
> Also indicate that system level faults should continue
> to go to system logs. The high level thinking is to try
> to distinguish between what's important to the user vs
> system admin.
>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> ---
> CC: corbet@lwn.net
> CC: skhan@linuxfoundation.org
> CC: linux-doc@vger.kernel.org
> ---
> Documentation/networking/driver.rst | 13 +++++++++++++
> 1 file changed, 13 insertions(+)
>
> diff --git a/Documentation/networking/driver.rst b/Documentation/networking/driver.rst
> index 195a916dc0de..abd366dd5e43 100644
> --- a/Documentation/networking/driver.rst
> +++ b/Documentation/networking/driver.rst
> @@ -128,3 +128,16 @@ to be freed up.
> If you return NETDEV_TX_BUSY from the ndo_start_xmit method, you
> must not keep any reference to that SKB and you must not attempt
> to free it up.
> +
> +Error message reporting
> +=======================
> +
> +Number of driver configuration interfaces pass a Netlink extended ACK
A number of ...
(agreeing with Joe)
> +(``extack``) object to the driver (either directly as an argument or
> +as a member of a parameter struct). The drivers should try to report
> +most errors via the ``extack`` object. System level exceptions,
> +indicating that system or device is misbehaving or is in bad state
bad state,
> +should continue to be reported to system logs.
> +
> +Messages should be passed **either** via ``extack`` **or** to system logs.
> +Drivers should not try to report the same information to both.
--
~Randy
^ permalink raw reply
* Re: [PATCH v5 08/34] KVM: x86: Add KVM_VCPU_TSC_SCALE and fix the documentation on TSC migration
From: Randy Dunlap @ 2026-06-09 23:26 UTC (permalink / raw)
To: David Woodhouse, Paolo Bonzini, Jonathan Corbet, Shuah Khan,
Sean Christopherson, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Vitaly Kuznetsov, Juergen Gross, Boris Ostrovsky, Paul Durrant,
Jonathan Cameron, Sascha Bischoff, Marc Zyngier, Joey Gouly,
Jack Allister, Dongli Zhang, joe.jin, kvm, linux-doc,
linux-kernel, xen-devel, linux-kselftest
In-Reply-To: <20260608145455.89187-9-dwmw2@infradead.org>
On 6/8/26 7:47 AM, David Woodhouse wrote:
> diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst
> index 5e3805820010..167aa4140d30 100644
> --- a/Documentation/virt/kvm/devices/vcpu.rst
> +++ b/Documentation/virt/kvm/devices/vcpu.rst
> @@ -243,7 +243,10 @@ Returns:
> Specifies the guest's TSC offset relative to the host's TSC. The guest's
> TSC is then derived by the following equation:
>
> - guest_tsc = host_tsc + KVM_VCPU_TSC_OFFSET
> + guest_tsc = ((host_tsc * tsc_scale_ratio) >> tsc_scale_bits) + KVM_VCPU_TSC_OFFSET
> +
> +The values of tsc_scale_ratio and tsc_scale_bits can be obtained using
> +the KVM_VCPU_TSC_SCALE attribute.
>
> This attribute is useful to adjust the guest's TSC on live migration,
> so that the TSC counts the time during which the VM was paused. The
> @@ -251,44 +254,100 @@ following describes a possible algorithm to use for this purpose.
>
> From the source VMM process:
>
> -1. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (tsc_src),
> +1. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (host_tsc_src),
> kvmclock nanoseconds (guest_src), and host CLOCK_REALTIME nanoseconds
> - (host_src).
> + (time_src) at a given moment (Tsrc).
> +
> +2. For each vCPU[i]:
> +
> + a. Read the KVM_VCPU_TSC_OFFSET attribute to record the guest TSC offset
> + (ofs_src[i]).
>
> -2. Read the KVM_VCPU_TSC_OFFSET attribute for every vCPU to record the
> - guest TSC offset (ofs_src[i]).
> + b. Read the KVM_VCPU_TSC_SCALE attribute to record the guest TSC scaling
> + ratio (ratio_src[i], frac_bits_src[i]).
>
> -3. Invoke the KVM_GET_TSC_KHZ ioctl to record the frequency of the
> - guest's TSC (freq).
> + c. Use host_tsc_src and the scaling/offset factors to calculate this
> + vCPU's TSC at time Tsrc:
> +
> + tsc_src[i] = ((host_tsc_src * ratio_src[i]) >> frac_bits_src[i]) + ofs_src[i]
> +
> +3. Invoke the KVM_GET_CLOCK_GUEST ioctl on the boot vCPU to return the KVM
> + clock as a function of the guest TSC (pvti_src). (This ioctl may not
> + succeed if the host and guest TSCs are not consistent and well-behaved.)
>
> From the destination VMM process:
>
> -4. Invoke the KVM_SET_CLOCK ioctl, providing the source nanoseconds from
> - kvmclock (guest_src) and CLOCK_REALTIME (host_src) in their respective
> - fields. Ensure that the KVM_CLOCK_REALTIME flag is set in the provided
> - structure.
> +4. Before creating the vCPUs, invoke the KVM_SET_TSC_KHZ ioctl on the VM, to
> + set the scaled frequency of the guest's TSC (freq).
> +
> +5. Invoke the KVM_GET_CLOCK ioctl to record the host TSC (host_tsc_dst) and
> + host CLOCK_REALTIME nanoseconds (time_dst) at a given moment (Tdst).
> +
> +6. Calculate the number of nanoseconds elapsed between Tsrc and Tdst:
> +
> + ΔT = time_dst - time_src
> +
> +7. As each vCPU[i] is created:
> +
> + a. Read the KVM_VCPU_TSC_SCALE attribute to record the guest TSC scaling
> + ratio (ratio_dst[i], frac_bits_dst[i]).
> +
> + b. Calculate the intended guest TSC value at time Tdst:
> +
> + tsc_dst[i] = tsc_src[i] + (ΔT * freq[i])
>
> - KVM will advance the VM's kvmclock to account for elapsed time since
> - recording the clock values. Note that this will cause problems in
> - the guest (e.g., timeouts) unless CLOCK_REALTIME is synchronized
> - between the source and destination, and a reasonably short time passes
> - between the source pausing the VMs and the destination executing
> - steps 4-7.
> + c. Use host_tsc_dst and the scaling factors to calculate this vCPU's
> + raw scaled TSC at time Tdst without offsetting:
> +
> + raw_dst[i] = ((host_tsc_dst * ratio_dst[i]) >> frac_bits_dst[i])
> +
> + d. Calculate ofs_dst[i] = tsc_dst[i] - raw_dst[i] and set the resulting
> + offset using the KVM_VCPU_TSC_OFFSET attribute.
> +
> +8. If pvti_src was provided, invoke the KVM_SET_CLOCK_GUEST ioctl on the boot
> + vCPU to restore the KVM clock as a precise function of the guest TSC.
> +
> +9. If KVM_SET_CLOCK_GUEST was not available or failed (e.g. because the
> + master clock is not active), fall back to the KVM_SET_CLOCK ioctl,
> + providing the source nanoseconds from kvmclock (guest_src) and
> + CLOCK_REALTIME (time_src) in their respective fields. Ensure that the
> + KVM_CLOCK_REALTIME flag is set in the provided structure.
> +
> + KVM will restore the VM's kvmclock, accounting for elapsed time since
> + the clock values were recorded. Note that this will cause problems in
> + the guest (e.g., timeouts) unless CLOCK_REALTIME is synchronized between
> + the source and destination, and a reasonably short time passes between
> + the source pausing the VMs and the destination resuming them.
> + Due to the KVM_[SG]ET_CLOCK API using CLOCK_REALTIME instead of
> + CLOCK_TAI, leap seconds during the migration may also introduce errors.
> +
> +4.2 ATTRIBUTE: KVM_VCPU_TSC_SCALE
> +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Documentation/virt/kvm/devices/vcpu.rst:327: ERROR: Inconsistent title style: skip from level 2 to 4.
4.2 ATTRIBUTE: KVM_VCPU_TSC_SCALE
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Established title styles: =/= = - [docutils]
Change this "underline" to use "--------------------------" (for whatever
width is needed) and also add the same to the 4.1 heading.
> +
> +:Parameters: struct kvm_vcpu_tsc_scale
> +
> +Returns:
> +
> + ======= ======================================
> + -EFAULT Error reading the provided parameter
> + address.
> + -ENXIO Attribute not supported (no TSC scaling)
> + -EINVAL Invalid request to write the attribute
> + ======= ======================================
--
~Randy
^ permalink raw reply
* Re: [PATCH v3 1/6] alloc_tag: add ioctl to /proc/allocinfo
From: Abhishek Bapat @ 2026-06-09 22:00 UTC (permalink / raw)
To: Hao Ge
Cc: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Shuah Khan,
Jonathan Corbet, linux-doc, linux-kernel, linux-mm, Sourav Panda
In-Reply-To: <CAL41Mv6XfbSMUvdU9btrtsh-HXejCH=HZmCuTL1gKRe2XAPvpQ@mail.gmail.com>
On Tue, Jun 9, 2026 at 2:54 PM Abhishek Bapat <abhishekbapat@google.com> wrote:
>
> On Tue, Jun 9, 2026 at 1:51 PM Abhishek Bapat <abhishekbapat@google.com> wrote:
> >
> > On Mon, Jun 8, 2026 at 6:44 PM Hao Ge <hao.ge@linux.dev> wrote:
> > >
> > > Hi Abhishek
> > >
> > >
> > > On 2026/6/9 08:19, Abhishek Bapat wrote:
> > > > On Sun, Jun 7, 2026 at 6:53 PM Hao Ge <hao.ge@linux.dev> wrote:
> > > >> Hi Suren and Abhishek
> > > >>
> > > >>
> > > >> Thanks for the new version.
> > > >>
> > > >>
> > > >> On 2026/6/6 07:36, Abhishek Bapat wrote:
> > > >>> From: Suren Baghdasaryan <surenb@google.com>
> > > >>>
> > > >>> Add the following ioctl commands for /proc/allocinfo file:
> > > >>>
> > > >>> ALLOCINFO_IOC_CONTENT_ID - gets content identifier which can be used
> > > >>> to check whether the file content has changed specifically due to module
> > > >>> load/unload. Every time a module is loaded / unloaded, the returned
> > > >>> value will be different. By comparing the identifier value at the
> > > >>> beginning and at the end of the content retrieval operation, users can
> > > >>> validate retrieved information for consistency.
> > > >>>
> > > >>> ALLOCINFO_IOC_GET_AT - gets the record at the specified position. This
> > > >>> is the position of a record in /proc/allocinfo.
> > > >>>
> > > >>> ALLOCINFO_IOC_GET_NEXT - gets the record next to the last retrieved
> > > >>> one. If no records were previously retrieved, returns the first
> > > >>> record.
> > > >>>
> > > >>> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > >>> Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
> > > >>> ---
> > > >>> Documentation/mm/allocation-profiling.rst | 5 +
> > > >>> .../userspace-api/ioctl/ioctl-number.rst | 2 +
> > > >>> MAINTAINERS | 1 +
> > > >>> include/linux/codetag.h | 2 +
> > > >>> include/uapi/linux/alloc_tag.h | 54 ++++
> > > >>> lib/alloc_tag.c | 232 +++++++++++++++++-
> > > >>> lib/codetag.c | 18 ++
> > > >>> 7 files changed, 312 insertions(+), 2 deletions(-)
> > > >>> create mode 100644 include/uapi/linux/alloc_tag.h
> > > >>>
> > > >>> diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst
> > > >>> index 5389d241176a..c3a28467955f 100644
> > > >>> --- a/Documentation/mm/allocation-profiling.rst
> > > >>> +++ b/Documentation/mm/allocation-profiling.rst
> > > >>> @@ -46,6 +46,11 @@ sysctl:
> > > >>> Runtime info:
> > > >>> /proc/allocinfo
> > > >>>
> > > >>> + Profiling data can be retrieved either by reading `/proc/allocinfo` directly as
> > > >>> + text or programmatically via `ioctl()` calls defined in `<uapi/linux/alloc_tag.h>`.
> > > >>> + The ioctl interface supports structured binary data extraction as well as filtering
> > > >>> + by module name, function, file, line number, accuracy, or allocation size limits.
> > > >>> +
> > > >>> Example output::
> > > >>>
> > > >>> root@moria-kvm:~# sort -g /proc/allocinfo|tail|numfmt --to=iec
> > > >>> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > > >>> index 331223761fff..84f6808a8578 100644
> > > >>> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > > >>> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > > >>> @@ -349,6 +349,8 @@ Code Seq# Include File Comments
> > > >>> <mailto:luzmaximilian@gmail.com>
> > > >>> 0xA5 20-2F linux/surface_aggregator/dtx.h Microsoft Surface DTX driver
> > > >>> <mailto:luzmaximilian@gmail.com>
> > > >>> +0xA6 00-0F uapi/linux/alloc_tag.h Memory allocation profiling
> > > >>> + <mailto:surenb@google.com>
> > > >>> 0xAA 00-3F linux/uapi/linux/userfaultfd.h
> > > >>> 0xAB 00-1F linux/nbd.h
> > > >>> 0xAC 00-1F linux/raw.h
> > > >>> diff --git a/MAINTAINERS b/MAINTAINERS
> > > >>> index a31f6f207afd..77f3fc487691 100644
> > > >>> --- a/MAINTAINERS
> > > >>> +++ b/MAINTAINERS
> > > >>> @@ -16711,6 +16711,7 @@ S: Maintained
> > > >>> F: Documentation/mm/allocation-profiling.rst
> > > >>> F: include/linux/alloc_tag.h
> > > >>> F: include/linux/pgalloc_tag.h
> > > >>> +F: include/uapi/linux/alloc_tag.h
> > > >>> F: lib/alloc_tag.c
> > > >>>
> > > >>> MEMORY CONTROLLER DRIVERS
> > > >>> diff --git a/include/linux/codetag.h b/include/linux/codetag.h
> > > >>> index ddae7484ca45..a25a085c2df1 100644
> > > >>> --- a/include/linux/codetag.h
> > > >>> +++ b/include/linux/codetag.h
> > > >>> @@ -77,6 +77,8 @@ struct codetag_iterator {
> > > >>> void codetag_lock_module_list(struct codetag_type *cttype);
> > > >>> bool codetag_trylock_module_list(struct codetag_type *cttype);
> > > >>> void codetag_unlock_module_list(struct codetag_type *cttype);
> > > >>> +unsigned long codetag_get_content_id(struct codetag_type *cttype);
> > > >>> +unsigned int codetag_get_count(struct codetag_type *cttype);
> > > >>> struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
> > > >>> struct codetag *codetag_next_ct(struct codetag_iterator *iter);
> > > >>>
> > > >>> diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
> > > >>> new file mode 100644
> > > >>> index 000000000000..901199bad514
> > > >>> --- /dev/null
> > > >>> +++ b/include/uapi/linux/alloc_tag.h
> > > >>> @@ -0,0 +1,54 @@
> > > >>> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > > >>> +/*
> > > >>> + * include/linux/alloc_tag.h
> > > >> nit: it should be include/uapi/linux/alloc_tag.h
> > > >>
> > > >> (I guess you may have missed the comment I brought up before. It is not
> > > >> a critical problem though.)
> > > >>
> > > > Apologies, I missed that comment earlier. Included in the v4 patchset.
> > > > Thanks for bringing this up.
> > > >
> > > >>> + */
> > > >>> +
> > > >>> +#ifndef _UAPI_ALLOC_TAG_H
> > > >>> +#define _UAPI_ALLOC_TAG_H
> > > >>> +
> > > >>> +#include <linux/types.h>
> > > >>> +
> > > >>> +#define ALLOCINFO_STR_SIZE 64
> > > >>> +
> > > >>> +struct allocinfo_content_id {
> > > >>> + __u64 id;
> > > >>> +};
> > > >>> +
> > > >>> +struct allocinfo_tag {
> > > >>> + /* Longer names are trimmed */
> > > >>> + char modname[ALLOCINFO_STR_SIZE];
> > > >>> + char function[ALLOCINFO_STR_SIZE];
> > > >>> + char filename[ALLOCINFO_STR_SIZE];
> > > >>> + __u64 lineno;
> > > >>> +};
> > > >>> +
> > > >>> +/* The alignment ensures 32-bit compatible interfaces are not broken */
> > > >>> +struct allocinfo_counter {
> > > >>> + __u64 bytes;
> > > >>> + __u64 calls;
> > > >>> + __u8 accurate;
> > > >>> +} __attribute__((aligned(8)));
> > > >>> +
> > > >>> +struct allocinfo_tag_data {
> > > >>> + struct allocinfo_tag tag;
> > > >>> + struct allocinfo_counter counter;
> > > >>> +};
> > > >>> +
> > > >>> +struct allocinfo_get_at {
> > > >>> + __u64 pos; /* input */
> > > >>> + struct allocinfo_tag_data data;
> > > >>> +};
> > > >>> +
> > > >>> +#define _ALLOCINFO_IOC_CONTENT_ID 0
> > > >>> +#define _ALLOCINFO_IOC_GET_AT 1
> > > >>> +#define _ALLOCINFO_IOC_GET_NEXT 2
> > > >>> +
> > > >>> +#define ALLOCINFO_IOC_BASE 0xA6
> > > >>> +#define ALLOCINFO_IOC_CONTENT_ID _IOR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_CONTENT_ID, \
> > > >>> + struct allocinfo_content_id)
> > > >>> +#define ALLOCINFO_IOC_GET_AT _IOWR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_GET_AT, \
> > > >>> + struct allocinfo_get_at)
> > > >>> +#define ALLOCINFO_IOC_GET_NEXT _IOR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_GET_NEXT, \
> > > >>> + struct allocinfo_tag_data)
> > > >>> +
> > > >>> +#endif /* _UAPI_ALLOC_TAG_H */
> > > >>> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > > >>> index d9be1cf5187d..a0577215eb3d 100644
> > > >>> --- a/lib/alloc_tag.c
> > > >>> +++ b/lib/alloc_tag.c
> > > >>> @@ -5,6 +5,7 @@
> > > >>> #include <linux/gfp.h>
> > > >>> #include <linux/kallsyms.h>
> > > >>> #include <linux/module.h>
> > > >>> +#include <linux/mutex.h>
> > > >>> #include <linux/page_ext.h>
> > > >>> #include <linux/pgalloc_tag.h>
> > > >>> #include <linux/proc_fs.h>
> > > >>> @@ -14,6 +15,7 @@
> > > >>> #include <linux/string_choices.h>
> > > >>> #include <linux/vmalloc.h>
> > > >>> #include <linux/kmemleak.h>
> > > >>> +#include <uapi/linux/alloc_tag.h>
> > > >>>
> > > >>> #define ALLOCINFO_FILE_NAME "allocinfo"
> > > >>> #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag))
> > > >>> @@ -47,6 +49,10 @@ struct allocinfo_private {
> > > >>> struct codetag_iterator iter;
> > > >>> struct codetag_iterator reported_iter;
> > > >>> bool print_header;
> > > >>> + /* ioctl uses a separate iterator not to interfere with reads */
> > > >>> + struct codetag_iterator ioctl_iter;
> > > >>> + bool positioned; /* seq_open_private() sets to 0 */
> > > >>> + struct mutex ioctl_lock;
> > > >>> };
> > > >>>
> > > >>> static void *allocinfo_start(struct seq_file *m, loff_t *pos)
> > > >>> @@ -130,6 +136,229 @@ static const struct seq_operations allocinfo_seq_op = {
> > > >>> .show = allocinfo_show,
> > > >>> };
> > > >>>
> > > >>> +/*
> > > >>> + * Initializes seq_file operations and allocates private state when opening
> > > >>> + * the /proc/allocinfo procfs entry.
> > > >>> + */
> > > >>> +static int allocinfo_open(struct inode *inode, struct file *file)
> > > >>> +{
> > > >>> + int ret;
> > > >>> +
> > > >>> + ret = seq_open_private(file, &allocinfo_seq_op,
> > > >>> + sizeof(struct allocinfo_private));
> > > >>> + if (!ret) {
> > > >>> + struct seq_file *m = file->private_data;
> > > >>> + struct allocinfo_private *priv = m->private;
> > > >>> +
> > > >>> + mutex_init(&priv->ioctl_lock);
> > > >>> + }
> > > >>> + return ret;
> > > >>> +}
> > > >>> +
> > > >>> +/*
> > > >>> + * Cleans up the seq_file state and frees up the private state allocated in
> > > >>> + * allocinfo_open() when closing the /proc/allocinfo file descriptor.
> > > >>> + */
> > > >>> +static int allocinfo_release(struct inode *inode, struct file *file)
> > > >>> +{
> > > >>> + return seq_release_private(inode, file);
> > > >>> +}
> > > >>> +
> > > >>> +/*
> > > >>> + * Returns a pointer to the suffix of a string so that its length fits within
> > > >>> + * ALLOCINFO_STR_SIZE, preserving the trailing characters.
> > > >>> + */
> > > >>> +static const char *allocinfo_str(const char *str)
> > > >>> +{
> > > >>> + size_t len = strlen(str);
> > > >>> +
> > > >>> + /* Keep an extra space for the trailing NULL. */
> > > >>> + if (len >= ALLOCINFO_STR_SIZE)
> > > >>> + str += (len - ALLOCINFO_STR_SIZE) + 1;
> > > >>> + return str;
> > > >>> +}
> > > >>> +
> > > >>> +/* Copy a string and trim from the beginning if it's too long */
> > > >>> +static void allocinfo_copy_str(char *dest, const char *src)
> > > >>> +{
> > > >>> + strscpy_pad(dest, allocinfo_str(src), ALLOCINFO_STR_SIZE);
> > > >>> +}
> > > >>> +
> > > >>> +/*
> > > >>> + * Populates the UAPI allocinfo_tag_data structure with active runtime
> > > >>> + * profiling counters extracted from the given kernel codetag.
> > > >>> + */
> > > >>> +static void allocinfo_to_params(struct codetag *ct,
> > > >>> + struct allocinfo_tag_data *data)
> > > >>> +{
> > > >>> + struct alloc_tag *tag = ct_to_alloc_tag(ct);
> > > >>> + struct alloc_tag_counters counter = alloc_tag_read(tag);
> > > >>> +
> > > >>> + if (ct->modname)
> > > >>> + allocinfo_copy_str(data->tag.modname, ct->modname);
> > > >>> + else
> > > >>> + data->tag.modname[0] = '\0';
> > > >>> + allocinfo_copy_str(data->tag.function, ct->function);
> > > >>> + allocinfo_copy_str(data->tag.filename, ct->filename);
> > > >>> + data->tag.lineno = ct->lineno;
> > > >>> + data->counter.bytes = counter.bytes;
> > > >>> + data->counter.calls = counter.calls;
> > > >>> + data->counter.accurate = !alloc_tag_is_inaccurate(tag);
> > > >>> +}
> > > >>> +
> > > >>> +/*
> > > >>> + * Retrieves the unique content ID representing the current allocation tag module
> > > >>> + * layout, allowing userspace to detect if modules were loaded / unloaded.
> > > >>> + */
> > > >>> +static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
> > > >>> +{
> > > >>> + struct allocinfo_content_id params;
> > > >>> +
> > > >>> + codetag_lock_module_list(alloc_tag_cttype);
> > > >>> + params.id = codetag_get_content_id(alloc_tag_cttype);
> > > >>> + codetag_unlock_module_list(alloc_tag_cttype);
> > > >>> + if (copy_to_user(arg, ¶ms, sizeof(params)))
> > > >>> + return -EFAULT;
> > > >>> +
> > > >>> + return 0;
> > > >>> +}
> > > >>> +
> > > >>> +/*
> > > >>> + * Seeks the ioctl iterator to the specified 0-indexed tag position, reads its
> > > >>> + * profiling data and returns it to userspace.
> > > >>> + */
> > > >>> +static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
> > > >>> +{
> > > >>> + struct allocinfo_private *priv;
> > > >>> + struct codetag *ct;
> > > >>> + __u64 pos;
> > > >>> + struct allocinfo_get_at params = {0};
> > > >>> +
> > > >>> + if (copy_from_user(¶ms, arg, sizeof(params)))
> > > >>> + return -EFAULT;
> > > >>> +
> > > >>> + priv = m->private;
> > > >>> + pos = params.pos;
> > > >>> +
> > > >>> + mutex_lock(&priv->ioctl_lock);
> > > >>> + codetag_lock_module_list(alloc_tag_cttype);
> > > >>> +
> > > >>> + if (pos >= codetag_get_count(alloc_tag_cttype)) {
> > > >>> + codetag_unlock_module_list(alloc_tag_cttype);
> > > >>> + mutex_unlock(&priv->ioctl_lock);
> > > >>> + return -ENOENT;
> > > >>> + }
> > > >>> +
> > > >>> + /* Find the codetag */
> > > >>> + priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
> > > >>> + ct = codetag_next_ct(&priv->ioctl_iter);
> > > >>> + while (ct && pos--)
> > > >>> + ct = codetag_next_ct(&priv->ioctl_iter);
> > > >>> + if (ct) {
> > > >>> + allocinfo_to_params(ct, ¶ms.data);
> > > >>> + priv->positioned = true;
> > > >>> + }
> > > >>> +
> > > >>> + codetag_unlock_module_list(alloc_tag_cttype);
> > > >>> + mutex_unlock(&priv->ioctl_lock);
> > > >>> +
> > > >>> + if (!ct)
> > > >>> + return -ENOENT;
> > > >>> +
> > > >>> + if (copy_to_user(arg, ¶ms, sizeof(params)))
> > > >>> + return -EFAULT;
> > > >>> +
> > > >>> + return 0;
> > > >>> +}
> > > >>> +
> > > >>> +/*
> > > >>> + * Advances the ioctl iterator to the next allocation tag in the sequence and
> > > >>> + * returns its profiling data to userspace.
> > > >>> + */
> > > >>> +static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
> > > >>> +{
> > > >>> + struct allocinfo_private *priv;
> > > >>> + struct codetag *ct;
> > > >>> + struct allocinfo_tag_data params;
> > > >>> + int ret = 0;
> > > >>> +
> > > >>> + memset(¶ms, 0, sizeof(params));
> > > >>> + priv = m->private;
> > > >>> +
> > > >>> + mutex_lock(&priv->ioctl_lock);
> > > >>> + codetag_lock_module_list(alloc_tag_cttype);
> > > >>> +
> > > >>> + if (!priv->positioned) {
> > > >>> + priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
> > > >>> + priv->positioned = true;
> > > >>> + }
> > > >>> +
> > > >>> + ct = codetag_next_ct(&priv->ioctl_iter);
> > > >>> + if (ct)
> > > >>> + allocinfo_to_params(ct, ¶ms);
> > > >>> +
> > > >>> + if (!ct) {
> > > >>> + priv->positioned = false;
> > > >>> + ret = -ENOENT;
> > > >>> + }
> > > >>> + codetag_unlock_module_list(alloc_tag_cttype);
> > > >>> + mutex_unlock(&priv->ioctl_lock);
> > > >>> +
> > > >>> + if (ret == 0) {
> > > >>> + if (copy_to_user(arg, ¶ms, sizeof(params)))
> > > >>> + return -EFAULT;
> > > >>> + }
> > > >>> + return ret;
> > > >>> +}
> > > >>> +
> > > >>> +/*
> > > >>> + * Entry point ioctl function for /proc/allocinfo routing requests to fetch the
> > > >>> + * layout content ID, seek to a specific tag, or read sequential tags.
> > > >>> + */
> > > >>> +static long allocinfo_ioctl(struct file *file, unsigned int cmd,
> > > >>> + unsigned long __arg)
> > > >>> +{
> > > >>> + void __user *arg = (void __user *)__arg;
> > > >>> + int ret;
> > > >>> +
> > > >>> + switch (cmd) {
> > > >>> + case ALLOCINFO_IOC_CONTENT_ID:
> > > >>> + ret = allocinfo_ioctl_get_content_id(file->private_data, arg);
> > > >>> + break;
> > > >>> + case ALLOCINFO_IOC_GET_AT:
> > > >>> + ret = allocinfo_ioctl_get_at(file->private_data, arg);
> > > >>> + break;
> > > >>> + case ALLOCINFO_IOC_GET_NEXT:
> > > >>> + ret = allocinfo_ioctl_get_next(file->private_data, arg);
> > > >>> + break;
> > > >>> + default:
> > > >>> + ret = -ENOIOCTLCMD;
> > > >>> + break;
> > > >>> + }
> > > >>> +
> > > >>> + return ret;
> > > >>> +}
> > > >>> +
> > > >>> +#ifdef CONFIG_COMPAT
> > > >>> +static long allocinfo_compat_ioctl(struct file *file, unsigned int cmd,
> > > >>> + unsigned long arg)
> > > >>> +{
> > > >>> + return allocinfo_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
> > > >>> +}
> > > >>> +#endif
> > > >>> +
> > > >>> +static const struct proc_ops allocinfo_proc_ops = {
> > > >>> + .proc_open = allocinfo_open,
> > > >>> + .proc_read_iter = seq_read_iter,
> > > >>> + .proc_lseek = seq_lseek,
> > > >>> + .proc_release = allocinfo_release,
> > > >>> + .proc_ioctl = allocinfo_ioctl,
> > > >>> +#ifdef CONFIG_COMPAT
> > > >>> + .proc_compat_ioctl = allocinfo_compat_ioctl,
> > > >>> +#endif
> > > >>> +
> > > >>> +};
> > > >>> +
> > > >>> size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep)
> > > >>> {
> > > >>> struct codetag_iterator iter;
> > > >>> @@ -993,8 +1222,7 @@ static int __init alloc_tag_init(void)
> > > >>> return 0;
> > > >>> }
> > > >>>
> > > >>> - if (!proc_create_seq_private(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op,
> > > >>> - sizeof(struct allocinfo_private), NULL)) {
> > > >>> + if (!proc_create(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_proc_ops)) {
> > > >>> pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
> > > >>> shutdown_mem_profiling(false);
> > > >>> return -ENOMEM;
> > > >>> diff --git a/lib/codetag.c b/lib/codetag.c
> > > >>> index 4001a7ea6675..a9cda4c962a3 100644
> > > >>> --- a/lib/codetag.c
> > > >>> +++ b/lib/codetag.c
> > > >>> @@ -19,6 +19,8 @@ struct codetag_type {
> > > >>> struct codetag_type_desc desc;
> > > >>> /* generates unique sequence number for module load */
> > > >>> unsigned long next_mod_seq;
> > > >>> + /* bumped on every module load and unload */
> > > >>> + unsigned long content_id;
> > > >>> };
> > > >>>
> > > >>> struct codetag_range {
> > > >>> @@ -50,6 +52,20 @@ void codetag_unlock_module_list(struct codetag_type *cttype)
> > > >>> up_read(&cttype->mod_lock);
> > > >>> }
> > > >>>
> > > >>> +unsigned long codetag_get_content_id(struct codetag_type *cttype)
> > > >>> +{
> > > >>> + lockdep_assert_held(&cttype->mod_lock);
> > > >>> +
> > > >>> + return cttype->content_id;
> > > >>> +}
> > > >>> +
> > > >>> +unsigned int codetag_get_count(struct codetag_type *cttype)
> > > >>> +{
> > > >>> + lockdep_assert_held(&cttype->mod_lock);
> > > >>> +
> > > >>> + return cttype->count;
> > > >>> +}
> > > >>> +
> > > >>> struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
> > > >>> {
> > > >>> struct codetag_iterator iter = {
> > > >>> @@ -204,6 +220,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
> > > >>>
> > > >>> down_write(&cttype->mod_lock);
> > > >>> cmod->mod_seq = ++cttype->next_mod_seq;
> > > >>> + ++cttype->content_id;
> > > >> I have a comment on the content_id bump placement.
> > > >>
> > > >> ++cttype->content_id is placed before idr_alloc and the module_load
> > > >>
> > > >> callback. If idr_alloc fails or module_load returns an error
> > > >>
> > > >> (though the chance of this occurring is very low), the idr entry gets
> > > >>
> > > >> rolled back but content_id has already been bumped. The actual
> > > >>
> > > >> content didn't change in this case, so userspace would see a
> > > >>
> > > >> different content_id and assume the data is inconsistent when it
> > > >>
> > > >> isn't.
> > > >>
> > > >>
> > > >> Thanks
> > > >>
> > > >> Best Regards
> > > >>
> > > >> Hao
> > > > While I agree with your comment, I decided to place the counter
> > > > increment there because the chance of failure is low. Furthermore,
> > > > even if it falsely invalidates user data, the user will simply query
> > > > the content again. This placement also aligns with where the
> > > > previously used field (cttype->next_mod_seq) was incremented. Let me
> > > > know if you still think I should move it. Thanks!
> > >
> > > Sorry, I should have marked this as a nit when I raised the comment.
> > >
> > > Given its low probability of occurring, it doesn't block anything for now.
> > >
> > > The reason I raised this comment was just in case someone adds new logic
> > >
> > > in the future that could fail. But if that happens, we can move both
> > > next_mod_seq
> > >
> > > and content_id down together.
> > >
> > >
> > > Thanks
> > >
> > > Best Regards
> > >
> > > Hao
> > >
> >
> > Sounds good, so for now I am not touching this patch and keeping it as
> > is other than the file path nit inside the UAPI header file.
>
> Actually, checkpatch is complaining about using the same filename
> inside of the file and hence I am dropping that comment completely.
>
Actually, I realized the header file lacked a proper copyright notice,
so I'm including that.
> > > >>> mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
> > > >>> if (mod_id >= 0) {
> > > >>> if (cttype->desc.module_load) {
> > > >>> @@ -368,6 +385,7 @@ void codetag_unload_module(struct module *mod)
> > > >>> cttype->count -= range_size(cttype, &cmod->range);
> > > >>> idr_remove(&cttype->mod_idr, mod_id);
> > > >>> kfree(cmod);
> > > >>> + ++cttype->content_id;
> > > >>> }
> > > >>> up_write(&cttype->mod_lock);
> > > >>> if (found && cttype->desc.free_section_mem)
^ permalink raw reply
* Re: [PATCH v3 1/6] alloc_tag: add ioctl to /proc/allocinfo
From: Abhishek Bapat @ 2026-06-09 21:54 UTC (permalink / raw)
To: Hao Ge
Cc: Suren Baghdasaryan, Andrew Morton, Kent Overstreet, Shuah Khan,
Jonathan Corbet, linux-doc, linux-kernel, linux-mm, Sourav Panda
In-Reply-To: <CAL41Mv5ZzuVRy5cjp_Ozn=9QCpCe1mFS36o25Z1dnbYCvNs9+Q@mail.gmail.com>
On Tue, Jun 9, 2026 at 1:51 PM Abhishek Bapat <abhishekbapat@google.com> wrote:
>
> On Mon, Jun 8, 2026 at 6:44 PM Hao Ge <hao.ge@linux.dev> wrote:
> >
> > Hi Abhishek
> >
> >
> > On 2026/6/9 08:19, Abhishek Bapat wrote:
> > > On Sun, Jun 7, 2026 at 6:53 PM Hao Ge <hao.ge@linux.dev> wrote:
> > >> Hi Suren and Abhishek
> > >>
> > >>
> > >> Thanks for the new version.
> > >>
> > >>
> > >> On 2026/6/6 07:36, Abhishek Bapat wrote:
> > >>> From: Suren Baghdasaryan <surenb@google.com>
> > >>>
> > >>> Add the following ioctl commands for /proc/allocinfo file:
> > >>>
> > >>> ALLOCINFO_IOC_CONTENT_ID - gets content identifier which can be used
> > >>> to check whether the file content has changed specifically due to module
> > >>> load/unload. Every time a module is loaded / unloaded, the returned
> > >>> value will be different. By comparing the identifier value at the
> > >>> beginning and at the end of the content retrieval operation, users can
> > >>> validate retrieved information for consistency.
> > >>>
> > >>> ALLOCINFO_IOC_GET_AT - gets the record at the specified position. This
> > >>> is the position of a record in /proc/allocinfo.
> > >>>
> > >>> ALLOCINFO_IOC_GET_NEXT - gets the record next to the last retrieved
> > >>> one. If no records were previously retrieved, returns the first
> > >>> record.
> > >>>
> > >>> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > >>> Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
> > >>> ---
> > >>> Documentation/mm/allocation-profiling.rst | 5 +
> > >>> .../userspace-api/ioctl/ioctl-number.rst | 2 +
> > >>> MAINTAINERS | 1 +
> > >>> include/linux/codetag.h | 2 +
> > >>> include/uapi/linux/alloc_tag.h | 54 ++++
> > >>> lib/alloc_tag.c | 232 +++++++++++++++++-
> > >>> lib/codetag.c | 18 ++
> > >>> 7 files changed, 312 insertions(+), 2 deletions(-)
> > >>> create mode 100644 include/uapi/linux/alloc_tag.h
> > >>>
> > >>> diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst
> > >>> index 5389d241176a..c3a28467955f 100644
> > >>> --- a/Documentation/mm/allocation-profiling.rst
> > >>> +++ b/Documentation/mm/allocation-profiling.rst
> > >>> @@ -46,6 +46,11 @@ sysctl:
> > >>> Runtime info:
> > >>> /proc/allocinfo
> > >>>
> > >>> + Profiling data can be retrieved either by reading `/proc/allocinfo` directly as
> > >>> + text or programmatically via `ioctl()` calls defined in `<uapi/linux/alloc_tag.h>`.
> > >>> + The ioctl interface supports structured binary data extraction as well as filtering
> > >>> + by module name, function, file, line number, accuracy, or allocation size limits.
> > >>> +
> > >>> Example output::
> > >>>
> > >>> root@moria-kvm:~# sort -g /proc/allocinfo|tail|numfmt --to=iec
> > >>> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > >>> index 331223761fff..84f6808a8578 100644
> > >>> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> > >>> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> > >>> @@ -349,6 +349,8 @@ Code Seq# Include File Comments
> > >>> <mailto:luzmaximilian@gmail.com>
> > >>> 0xA5 20-2F linux/surface_aggregator/dtx.h Microsoft Surface DTX driver
> > >>> <mailto:luzmaximilian@gmail.com>
> > >>> +0xA6 00-0F uapi/linux/alloc_tag.h Memory allocation profiling
> > >>> + <mailto:surenb@google.com>
> > >>> 0xAA 00-3F linux/uapi/linux/userfaultfd.h
> > >>> 0xAB 00-1F linux/nbd.h
> > >>> 0xAC 00-1F linux/raw.h
> > >>> diff --git a/MAINTAINERS b/MAINTAINERS
> > >>> index a31f6f207afd..77f3fc487691 100644
> > >>> --- a/MAINTAINERS
> > >>> +++ b/MAINTAINERS
> > >>> @@ -16711,6 +16711,7 @@ S: Maintained
> > >>> F: Documentation/mm/allocation-profiling.rst
> > >>> F: include/linux/alloc_tag.h
> > >>> F: include/linux/pgalloc_tag.h
> > >>> +F: include/uapi/linux/alloc_tag.h
> > >>> F: lib/alloc_tag.c
> > >>>
> > >>> MEMORY CONTROLLER DRIVERS
> > >>> diff --git a/include/linux/codetag.h b/include/linux/codetag.h
> > >>> index ddae7484ca45..a25a085c2df1 100644
> > >>> --- a/include/linux/codetag.h
> > >>> +++ b/include/linux/codetag.h
> > >>> @@ -77,6 +77,8 @@ struct codetag_iterator {
> > >>> void codetag_lock_module_list(struct codetag_type *cttype);
> > >>> bool codetag_trylock_module_list(struct codetag_type *cttype);
> > >>> void codetag_unlock_module_list(struct codetag_type *cttype);
> > >>> +unsigned long codetag_get_content_id(struct codetag_type *cttype);
> > >>> +unsigned int codetag_get_count(struct codetag_type *cttype);
> > >>> struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
> > >>> struct codetag *codetag_next_ct(struct codetag_iterator *iter);
> > >>>
> > >>> diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
> > >>> new file mode 100644
> > >>> index 000000000000..901199bad514
> > >>> --- /dev/null
> > >>> +++ b/include/uapi/linux/alloc_tag.h
> > >>> @@ -0,0 +1,54 @@
> > >>> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > >>> +/*
> > >>> + * include/linux/alloc_tag.h
> > >> nit: it should be include/uapi/linux/alloc_tag.h
> > >>
> > >> (I guess you may have missed the comment I brought up before. It is not
> > >> a critical problem though.)
> > >>
> > > Apologies, I missed that comment earlier. Included in the v4 patchset.
> > > Thanks for bringing this up.
> > >
> > >>> + */
> > >>> +
> > >>> +#ifndef _UAPI_ALLOC_TAG_H
> > >>> +#define _UAPI_ALLOC_TAG_H
> > >>> +
> > >>> +#include <linux/types.h>
> > >>> +
> > >>> +#define ALLOCINFO_STR_SIZE 64
> > >>> +
> > >>> +struct allocinfo_content_id {
> > >>> + __u64 id;
> > >>> +};
> > >>> +
> > >>> +struct allocinfo_tag {
> > >>> + /* Longer names are trimmed */
> > >>> + char modname[ALLOCINFO_STR_SIZE];
> > >>> + char function[ALLOCINFO_STR_SIZE];
> > >>> + char filename[ALLOCINFO_STR_SIZE];
> > >>> + __u64 lineno;
> > >>> +};
> > >>> +
> > >>> +/* The alignment ensures 32-bit compatible interfaces are not broken */
> > >>> +struct allocinfo_counter {
> > >>> + __u64 bytes;
> > >>> + __u64 calls;
> > >>> + __u8 accurate;
> > >>> +} __attribute__((aligned(8)));
> > >>> +
> > >>> +struct allocinfo_tag_data {
> > >>> + struct allocinfo_tag tag;
> > >>> + struct allocinfo_counter counter;
> > >>> +};
> > >>> +
> > >>> +struct allocinfo_get_at {
> > >>> + __u64 pos; /* input */
> > >>> + struct allocinfo_tag_data data;
> > >>> +};
> > >>> +
> > >>> +#define _ALLOCINFO_IOC_CONTENT_ID 0
> > >>> +#define _ALLOCINFO_IOC_GET_AT 1
> > >>> +#define _ALLOCINFO_IOC_GET_NEXT 2
> > >>> +
> > >>> +#define ALLOCINFO_IOC_BASE 0xA6
> > >>> +#define ALLOCINFO_IOC_CONTENT_ID _IOR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_CONTENT_ID, \
> > >>> + struct allocinfo_content_id)
> > >>> +#define ALLOCINFO_IOC_GET_AT _IOWR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_GET_AT, \
> > >>> + struct allocinfo_get_at)
> > >>> +#define ALLOCINFO_IOC_GET_NEXT _IOR(ALLOCINFO_IOC_BASE, _ALLOCINFO_IOC_GET_NEXT, \
> > >>> + struct allocinfo_tag_data)
> > >>> +
> > >>> +#endif /* _UAPI_ALLOC_TAG_H */
> > >>> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > >>> index d9be1cf5187d..a0577215eb3d 100644
> > >>> --- a/lib/alloc_tag.c
> > >>> +++ b/lib/alloc_tag.c
> > >>> @@ -5,6 +5,7 @@
> > >>> #include <linux/gfp.h>
> > >>> #include <linux/kallsyms.h>
> > >>> #include <linux/module.h>
> > >>> +#include <linux/mutex.h>
> > >>> #include <linux/page_ext.h>
> > >>> #include <linux/pgalloc_tag.h>
> > >>> #include <linux/proc_fs.h>
> > >>> @@ -14,6 +15,7 @@
> > >>> #include <linux/string_choices.h>
> > >>> #include <linux/vmalloc.h>
> > >>> #include <linux/kmemleak.h>
> > >>> +#include <uapi/linux/alloc_tag.h>
> > >>>
> > >>> #define ALLOCINFO_FILE_NAME "allocinfo"
> > >>> #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag))
> > >>> @@ -47,6 +49,10 @@ struct allocinfo_private {
> > >>> struct codetag_iterator iter;
> > >>> struct codetag_iterator reported_iter;
> > >>> bool print_header;
> > >>> + /* ioctl uses a separate iterator not to interfere with reads */
> > >>> + struct codetag_iterator ioctl_iter;
> > >>> + bool positioned; /* seq_open_private() sets to 0 */
> > >>> + struct mutex ioctl_lock;
> > >>> };
> > >>>
> > >>> static void *allocinfo_start(struct seq_file *m, loff_t *pos)
> > >>> @@ -130,6 +136,229 @@ static const struct seq_operations allocinfo_seq_op = {
> > >>> .show = allocinfo_show,
> > >>> };
> > >>>
> > >>> +/*
> > >>> + * Initializes seq_file operations and allocates private state when opening
> > >>> + * the /proc/allocinfo procfs entry.
> > >>> + */
> > >>> +static int allocinfo_open(struct inode *inode, struct file *file)
> > >>> +{
> > >>> + int ret;
> > >>> +
> > >>> + ret = seq_open_private(file, &allocinfo_seq_op,
> > >>> + sizeof(struct allocinfo_private));
> > >>> + if (!ret) {
> > >>> + struct seq_file *m = file->private_data;
> > >>> + struct allocinfo_private *priv = m->private;
> > >>> +
> > >>> + mutex_init(&priv->ioctl_lock);
> > >>> + }
> > >>> + return ret;
> > >>> +}
> > >>> +
> > >>> +/*
> > >>> + * Cleans up the seq_file state and frees up the private state allocated in
> > >>> + * allocinfo_open() when closing the /proc/allocinfo file descriptor.
> > >>> + */
> > >>> +static int allocinfo_release(struct inode *inode, struct file *file)
> > >>> +{
> > >>> + return seq_release_private(inode, file);
> > >>> +}
> > >>> +
> > >>> +/*
> > >>> + * Returns a pointer to the suffix of a string so that its length fits within
> > >>> + * ALLOCINFO_STR_SIZE, preserving the trailing characters.
> > >>> + */
> > >>> +static const char *allocinfo_str(const char *str)
> > >>> +{
> > >>> + size_t len = strlen(str);
> > >>> +
> > >>> + /* Keep an extra space for the trailing NULL. */
> > >>> + if (len >= ALLOCINFO_STR_SIZE)
> > >>> + str += (len - ALLOCINFO_STR_SIZE) + 1;
> > >>> + return str;
> > >>> +}
> > >>> +
> > >>> +/* Copy a string and trim from the beginning if it's too long */
> > >>> +static void allocinfo_copy_str(char *dest, const char *src)
> > >>> +{
> > >>> + strscpy_pad(dest, allocinfo_str(src), ALLOCINFO_STR_SIZE);
> > >>> +}
> > >>> +
> > >>> +/*
> > >>> + * Populates the UAPI allocinfo_tag_data structure with active runtime
> > >>> + * profiling counters extracted from the given kernel codetag.
> > >>> + */
> > >>> +static void allocinfo_to_params(struct codetag *ct,
> > >>> + struct allocinfo_tag_data *data)
> > >>> +{
> > >>> + struct alloc_tag *tag = ct_to_alloc_tag(ct);
> > >>> + struct alloc_tag_counters counter = alloc_tag_read(tag);
> > >>> +
> > >>> + if (ct->modname)
> > >>> + allocinfo_copy_str(data->tag.modname, ct->modname);
> > >>> + else
> > >>> + data->tag.modname[0] = '\0';
> > >>> + allocinfo_copy_str(data->tag.function, ct->function);
> > >>> + allocinfo_copy_str(data->tag.filename, ct->filename);
> > >>> + data->tag.lineno = ct->lineno;
> > >>> + data->counter.bytes = counter.bytes;
> > >>> + data->counter.calls = counter.calls;
> > >>> + data->counter.accurate = !alloc_tag_is_inaccurate(tag);
> > >>> +}
> > >>> +
> > >>> +/*
> > >>> + * Retrieves the unique content ID representing the current allocation tag module
> > >>> + * layout, allowing userspace to detect if modules were loaded / unloaded.
> > >>> + */
> > >>> +static int allocinfo_ioctl_get_content_id(struct seq_file *m, void __user *arg)
> > >>> +{
> > >>> + struct allocinfo_content_id params;
> > >>> +
> > >>> + codetag_lock_module_list(alloc_tag_cttype);
> > >>> + params.id = codetag_get_content_id(alloc_tag_cttype);
> > >>> + codetag_unlock_module_list(alloc_tag_cttype);
> > >>> + if (copy_to_user(arg, &params, sizeof(params)))
> > >>> + return -EFAULT;
> > >>> +
> > >>> + return 0;
> > >>> +}
> > >>> +
> > >>> +/*
> > >>> + * Seeks the ioctl iterator to the specified 0-indexed tag position, reads its
> > >>> + * profiling data and returns it to userspace.
> > >>> + */
> > >>> +static int allocinfo_ioctl_get_at(struct seq_file *m, void __user *arg)
> > >>> +{
> > >>> + struct allocinfo_private *priv;
> > >>> + struct codetag *ct;
> > >>> + __u64 pos;
> > >>> + struct allocinfo_get_at params = {0};
> > >>> +
> > >>> + if (copy_from_user(&params, arg, sizeof(params)))
> > >>> + return -EFAULT;
> > >>> +
> > >>> + priv = m->private;
> > >>> + pos = params.pos;
> > >>> +
> > >>> + mutex_lock(&priv->ioctl_lock);
> > >>> + codetag_lock_module_list(alloc_tag_cttype);
> > >>> +
> > >>> + if (pos >= codetag_get_count(alloc_tag_cttype)) {
> > >>> + codetag_unlock_module_list(alloc_tag_cttype);
> > >>> + mutex_unlock(&priv->ioctl_lock);
> > >>> + return -ENOENT;
> > >>> + }
> > >>> +
> > >>> + /* Find the codetag */
> > >>> + priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
> > >>> + ct = codetag_next_ct(&priv->ioctl_iter);
> > >>> + while (ct && pos--)
> > >>> + ct = codetag_next_ct(&priv->ioctl_iter);
> > >>> + if (ct) {
> > >>> + allocinfo_to_params(ct, &params.data);
> > >>> + priv->positioned = true;
> > >>> + }
> > >>> +
> > >>> + codetag_unlock_module_list(alloc_tag_cttype);
> > >>> + mutex_unlock(&priv->ioctl_lock);
> > >>> +
> > >>> + if (!ct)
> > >>> + return -ENOENT;
> > >>> +
> > >>> + if (copy_to_user(arg, &params, sizeof(params)))
> > >>> + return -EFAULT;
> > >>> +
> > >>> + return 0;
> > >>> +}
> > >>> +
> > >>> +/*
> > >>> + * Advances the ioctl iterator to the next allocation tag in the sequence and
> > >>> + * returns its profiling data to userspace.
> > >>> + */
> > >>> +static int allocinfo_ioctl_get_next(struct seq_file *m, void __user *arg)
> > >>> +{
> > >>> + struct allocinfo_private *priv;
> > >>> + struct codetag *ct;
> > >>> + struct allocinfo_tag_data params;
> > >>> + int ret = 0;
> > >>> +
> > >>> + memset(&params, 0, sizeof(params));
> > >>> + priv = m->private;
> > >>> +
> > >>> + mutex_lock(&priv->ioctl_lock);
> > >>> + codetag_lock_module_list(alloc_tag_cttype);
> > >>> +
> > >>> + if (!priv->positioned) {
> > >>> + priv->ioctl_iter = codetag_get_ct_iter(alloc_tag_cttype);
> > >>> + priv->positioned = true;
> > >>> + }
> > >>> +
> > >>> + ct = codetag_next_ct(&priv->ioctl_iter);
> > >>> + if (ct)
> > >>> + allocinfo_to_params(ct, &params);
> > >>> +
> > >>> + if (!ct) {
> > >>> + priv->positioned = false;
> > >>> + ret = -ENOENT;
> > >>> + }
> > >>> + codetag_unlock_module_list(alloc_tag_cttype);
> > >>> + mutex_unlock(&priv->ioctl_lock);
> > >>> +
> > >>> + if (ret == 0) {
> > >>> + if (copy_to_user(arg, &params, sizeof(params)))
> > >>> + return -EFAULT;
> > >>> + }
> > >>> + return ret;
> > >>> +}
> > >>> +
> > >>> +/*
> > >>> + * Entry point ioctl function for /proc/allocinfo routing requests to fetch the
> > >>> + * layout content ID, seek to a specific tag, or read sequential tags.
> > >>> + */
> > >>> +static long allocinfo_ioctl(struct file *file, unsigned int cmd,
> > >>> + unsigned long __arg)
> > >>> +{
> > >>> + void __user *arg = (void __user *)__arg;
> > >>> + int ret;
> > >>> +
> > >>> + switch (cmd) {
> > >>> + case ALLOCINFO_IOC_CONTENT_ID:
> > >>> + ret = allocinfo_ioctl_get_content_id(file->private_data, arg);
> > >>> + break;
> > >>> + case ALLOCINFO_IOC_GET_AT:
> > >>> + ret = allocinfo_ioctl_get_at(file->private_data, arg);
> > >>> + break;
> > >>> + case ALLOCINFO_IOC_GET_NEXT:
> > >>> + ret = allocinfo_ioctl_get_next(file->private_data, arg);
> > >>> + break;
> > >>> + default:
> > >>> + ret = -ENOIOCTLCMD;
> > >>> + break;
> > >>> + }
> > >>> +
> > >>> + return ret;
> > >>> +}
> > >>> +
> > >>> +#ifdef CONFIG_COMPAT
> > >>> +static long allocinfo_compat_ioctl(struct file *file, unsigned int cmd,
> > >>> + unsigned long arg)
> > >>> +{
> > >>> + return allocinfo_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
> > >>> +}
> > >>> +#endif
> > >>> +
> > >>> +static const struct proc_ops allocinfo_proc_ops = {
> > >>> + .proc_open = allocinfo_open,
> > >>> + .proc_read_iter = seq_read_iter,
> > >>> + .proc_lseek = seq_lseek,
> > >>> + .proc_release = allocinfo_release,
> > >>> + .proc_ioctl = allocinfo_ioctl,
> > >>> +#ifdef CONFIG_COMPAT
> > >>> + .proc_compat_ioctl = allocinfo_compat_ioctl,
> > >>> +#endif
> > >>> +
> > >>> +};
> > >>> +
> > >>> size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep)
> > >>> {
> > >>> struct codetag_iterator iter;
> > >>> @@ -993,8 +1222,7 @@ static int __init alloc_tag_init(void)
> > >>> return 0;
> > >>> }
> > >>>
> > >>> - if (!proc_create_seq_private(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op,
> > >>> - sizeof(struct allocinfo_private), NULL)) {
> > >>> + if (!proc_create(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_proc_ops)) {
> > >>> pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME);
> > >>> shutdown_mem_profiling(false);
> > >>> return -ENOMEM;
> > >>> diff --git a/lib/codetag.c b/lib/codetag.c
> > >>> index 4001a7ea6675..a9cda4c962a3 100644
> > >>> --- a/lib/codetag.c
> > >>> +++ b/lib/codetag.c
> > >>> @@ -19,6 +19,8 @@ struct codetag_type {
> > >>> struct codetag_type_desc desc;
> > >>> /* generates unique sequence number for module load */
> > >>> unsigned long next_mod_seq;
> > >>> + /* bumped on every module load and unload */
> > >>> + unsigned long content_id;
> > >>> };
> > >>>
> > >>> struct codetag_range {
> > >>> @@ -50,6 +52,20 @@ void codetag_unlock_module_list(struct codetag_type *cttype)
> > >>> up_read(&cttype->mod_lock);
> > >>> }
> > >>>
> > >>> +unsigned long codetag_get_content_id(struct codetag_type *cttype)
> > >>> +{
> > >>> + lockdep_assert_held(&cttype->mod_lock);
> > >>> +
> > >>> + return cttype->content_id;
> > >>> +}
> > >>> +
> > >>> +unsigned int codetag_get_count(struct codetag_type *cttype)
> > >>> +{
> > >>> + lockdep_assert_held(&cttype->mod_lock);
> > >>> +
> > >>> + return cttype->count;
> > >>> +}
> > >>> +
> > >>> struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
> > >>> {
> > >>> struct codetag_iterator iter = {
> > >>> @@ -204,6 +220,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
> > >>>
> > >>> down_write(&cttype->mod_lock);
> > >>> cmod->mod_seq = ++cttype->next_mod_seq;
> > >>> + ++cttype->content_id;
> > >> I have a comment on the content_id bump placement.
> > >>
> > >> ++cttype->content_id is placed before idr_alloc and the module_load
> > >>
> > >> callback. If idr_alloc fails or module_load returns an error
> > >>
> > >> (While the chance of this occurring is very low.), the idr entry gets
> > >>
> > >> rolled back but content_id has already been bumped. The actual
> > >>
> > >> content didn't change in this case, so userspace would see a
> > >>
> > >> different content_id and assume the data is inconsistent when it
> > >>
> > >> isn't.
> > >>
> > >>
> > >> Thanks
> > >>
> > >> Best Regards
> > >>
> > >> Hao
> > > While I agree with your comment, I decided to place the counter
> > > increment there because the chance of failure is low. Furthermore,
> > > even if it falsely invalidates user data, the user will simply query
> > > the content again. This placement also aligns with where the
> > > previously used field (cttype->next_mod_seq) was incremented. Let me
> > > know if you still think I should move it. Thanks!
> >
> > Sorry, I should have marked this as a nit when I raised the comment.
> >
> > Given its low probability of occurring, it doesn't block anything for now.
> >
> > The reason I raised this comment was just in case someone adds new logic
> >
> > in the future that could fail. But if that happens, we can move both
> > next_mod_seq
> >
> > and content_id down together.
> >
> >
> > Thanks
> >
> > Best Regards
> >
> > Hao
> >
>
> Sounds good, so for now I am not touching this patch and keeping it as
> is other than the file path nit inside the UAPI header file.
Actually, checkpatch is complaining about using the same filename
inside of the file and hence I am dropping that comment completely.
> > >>> mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
> > >>> if (mod_id >= 0) {
> > >>> if (cttype->desc.module_load) {
> > >>> @@ -368,6 +385,7 @@ void codetag_unload_module(struct module *mod)
> > >>> cttype->count -= range_size(cttype, &cmod->range);
> > >>> idr_remove(&cttype->mod_idr, mod_id);
> > >>> kfree(cmod);
> > >>> + ++cttype->content_id;
> > >>> }
> > >>> up_write(&cttype->mod_lock);
> > >>> if (found && cttype->desc.free_section_mem)
^ permalink raw reply
* Re: [PATCH v3 5/6] kselftest: alloc_tag: add kselftest for ioctl interface
From: Abhishek Bapat @ 2026-06-09 20:54 UTC (permalink / raw)
To: Hao Ge
Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
Sourav Panda, Suren Baghdasaryan, Andrew Morton, Kent Overstreet
In-Reply-To: <26193499-e9dd-45e7-afc2-365685d6a749@linux.dev>
On Mon, Jun 8, 2026 at 11:27 PM Hao Ge <hao.ge@linux.dev> wrote:
>
>
> On 2026/6/9 14:09, Hao Ge wrote:
> > Hi Abhishek
> >
> >
> > On 2026/6/6 07:36, Abhishek Bapat wrote:
> >> Introduce a kselftest to verify the new IOCTL-based interface for
> >> /proc/allocinfo. The test covers:
> >>
> >> 1. Validation of the filename filter.
> >> 2. Validation of the function filter.
> >>
> >> The first test validates the functionality of the filename filter. Using
> >> "mm/memory.c" as the candidate filename filter, it retrieves filtered
> >> entries from both procfs and ioctl and matches the first VEC_MAX_ENTRIES
> >> entries.
> >>
> >> The second test validates the functionality of the function filter.
> >> It uses "dup_mm" as the candidate function as we do not expect this
> >> function name to change frequently and hence won't be needing to modify
> >> this test often.
> >>
> >> Note that both the tests match line no, function name and file name
> >> fields. Bytes allocated and calls are not matched as those values may
> >> change in the time when the data is being read from procfs and ioctl and
> >> hence can lead to false negatives.
> >>
> >> Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
> >> ---
> >> MAINTAINERS | 1 +
> >> tools/testing/selftests/alloc_tag/Makefile | 9 +
> >> .../alloc_tag/allocinfo_ioctl_test.c | 313 ++++++++++++++++++
> >> 3 files changed, 323 insertions(+)
> >> create mode 100644 tools/testing/selftests/alloc_tag/Makefile
> >> create mode 100644
> >> tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
> >>
> >> diff --git a/MAINTAINERS b/MAINTAINERS
> >> index 77f3fc487691..80560f5f1292 100644
> >> --- a/MAINTAINERS
> >> +++ b/MAINTAINERS
> >> @@ -16713,6 +16713,7 @@ F: include/linux/alloc_tag.h
> >> F: include/linux/pgalloc_tag.h
> >> F: include/uapi/linux/alloc_tag.h
> >> F: lib/alloc_tag.c
> >> +F: tools/testing/selftests/alloc_tag/
> >> MEMORY CONTROLLER DRIVERS
> >> M: Krzysztof Kozlowski <krzk@kernel.org>
> >> diff --git a/tools/testing/selftests/alloc_tag/Makefile
> >> b/tools/testing/selftests/alloc_tag/Makefile
> >> new file mode 100644
> >> index 000000000000..f2b8fc022c3b
> >> --- /dev/null
> >> +++ b/tools/testing/selftests/alloc_tag/Makefile
> >> @@ -0,0 +1,9 @@
> >> +# SPDX-License-Identifier: GPL-2.0
> >> +
> >> +TEST_GEN_PROGS := allocinfo_ioctl_test
> >> +
> >> +CFLAGS += -Wall
> >> +CFLAGS += -I../../../../usr/include
> >> +
> >> +include ../lib.mk
> >> +
> >> diff --git a/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
> >> b/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
> >> new file mode 100644
> >> index 000000000000..5c3c16e86c23
> >> --- /dev/null
> >> +++ b/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
> >> @@ -0,0 +1,313 @@
> >> +// SPDX-License-Identifier: GPL-2.0-only
> >> +
> >> +/* kselftest for allocinfo ioctl
> >> + * allocinfo ioctl retrives allocinfo data through ioctl
> >
> >
> > nit: s/retrives/retrieves/
> >
> >
> > I've applied the full patch series locally and ran the kselftest, all
> > 4 tests pass:
> >
> > [root@localhost alloc_tag]# ./allocinfo_ioctl_test
> > 1..4
> > ok 1 test_filename_filter
> > ok 2 test_function_filter
> > ok 3 test_size_filter
> > ok 4 test_lineno_filter
> > # Totals: pass:4 fail:0 xfail:0 xpass:0 skip:0 error:0
> >
> > But there are no tests for ALLOCINFO_FILTER_MASK_MODNAME and
> >
> > ALLOCINFO_FILTER_MASK_INACCURATE.
>
>
> Sorry, please disregard my suggestion about adding tests for
>
> ALLOCINFO_FILTER_MASK_MODNAME and ALLOCINFO_FILTER_MASK_INACCURATE.
>
> ALLOCINFO_FILTER_MASK_MODNAME depends on kernel config and also requires
>
> the module to be loaded. ALLOCINFO_FILTER_MASK_INACCURATE entries may not
>
> be common, unless we can find a stable way to produce them.
>
>
Ack, those are non-trivial to repro and hence we left them out.
> >
> >
> > Thanks
> >
> > Best Regards
> >
> > Hao
> >
> >> + * Copyright (C) 2026 Google, Inc.
> >> + */
> >> +
> >> +#include <errno.h>
> >> +#include <fcntl.h>
> >> +#include <stdio.h>
> >> +#include <stdlib.h>
> >> +#include <string.h>
> >> +#include <stdbool.h>
> >> +#include <unistd.h>
> >> +#include <sys/ioctl.h>
> >> +#include <linux/types.h>
> >> +#include <linux/alloc_tag.h>
> >> +#include "../kselftest.h"
> >> +
> >> +#define MAX_LINE_LEN 512
> >> +#define ALLOCINFO_PROC "/proc/allocinfo"
> >> +
> >> +enum ioctl_ret {
> >> + IOCTL_SUCCESS = 0,
> >> + IOCTL_FAILURE = 1,
> >> + IOCTL_INVALID_DATA = 2,
> >> +};
> >> +
> >> +#define VEC_MAX_ENTRIES 32
> >> +
> >> +struct allocinfo_tag_data_vec {
> >> + struct allocinfo_tag_data tag[VEC_MAX_ENTRIES];
> >> + __u64 count;
> >> +};
> >> +
> >> +static inline int __allocinfo_get_content_id(int dev_fd, struct
> >> allocinfo_content_id *params)
> >> +{
> >> + return ioctl(dev_fd, ALLOCINFO_IOC_CONTENT_ID, params);
> >> +}
> >> +
> >> +static inline int __allocinfo_get_at(int dev_fd, struct
> >> allocinfo_get_at *params)
> >> +{
> >> + return ioctl(dev_fd, ALLOCINFO_IOC_GET_AT, params);
> >> +}
> >> +
> >> +static inline int __allocinfo_get_next(int dev_fd, struct
> >> allocinfo_tag_data *params)
> >> +{
> >> + return ioctl(dev_fd, ALLOCINFO_IOC_GET_NEXT, params);
> >> +}
> >> +
> >> +static bool match_entry(const struct allocinfo_tag_data *procfs_entry,
> >> + const struct allocinfo_tag_data *tag_data,
> >> + bool match_bytes, bool match_calls, bool match_lineno,
> >> + bool match_function, bool match_filename)
> >> +{
> >> + if (match_bytes && tag_data->counter.bytes !=
> >> procfs_entry->counter.bytes) {
> >> + ksft_print_msg("size retrieved through ioctl does not match
> >> procfs\n");
> >> + return false;
> >> + }
> >> +
> >> + if (match_calls && tag_data->counter.calls !=
> >> procfs_entry->counter.calls) {
> >> + ksft_print_msg("call count retrieved through ioctl does not
> >> match procfs\n");
> >> + return false;
> >> + }
> >> +
> >> + if (match_lineno && tag_data->tag.lineno !=
> >> procfs_entry->tag.lineno) {
> >> + ksft_print_msg("lineno retrieved through ioctl does not
> >> match procfs\n");
> >> + return false;
> >> + }
> >> +
> >> + if (match_function &&
> >> + strncmp(tag_data->tag.function, procfs_entry->tag.function,
> >> ALLOCINFO_STR_SIZE)) {
> >> + ksft_print_msg("function retrieved through ioctl does not
> >> match procfs\n");
> >> + return false;
> >> + }
> >> +
> >> + if (match_filename &&
> >> + strncmp(tag_data->tag.filename, procfs_entry->tag.filename,
> >> ALLOCINFO_STR_SIZE)) {
> >> + ksft_print_msg("filename retrieved through ioctl does not
> >> match procfs\n");
> >> + return false;
> >> + }
> >> + return true;
> >> +}
> >> +
> >> +static bool match_entries(const struct allocinfo_tag_data_vec
> >> *procfs_entries,
> >> + const struct allocinfo_tag_data_vec *tags,
> >> + bool match_bytes, bool match_calls, bool match_lineno,
> >> + bool match_function, bool match_filename)
> >> +{
> >> + __u64 i;
> >> +
> >> + if (procfs_entries->count != tags->count) {
> >> + ksft_print_msg("Entry count mismatch. ioctl entries: %llu,
> >> proc entries: %llu\n",
> >> + tags->count, procfs_entries->count);
> >> + return false;
> >> + }
> >> + for (i = 0; i < procfs_entries->count; i++) {
> >> + if (!match_entry(&procfs_entries->tag[i], &tags->tag[i],
> >> + match_bytes, match_calls, match_lineno,
> >> + match_function, match_filename)) {
> >> + ksft_print_msg("%lluth entry does not match.\n", i);
> >> + return false;
> >> + }
> >> + }
> >> + return true;
> >> +}
> >> +
> >> +static int get_filtered_procfs_entries(struct allocinfo_tag_data_vec
> >> *procfs_entries,
> >> + const struct allocinfo_filter *filter, int fd)
> >> +{
> >> + FILE *fp = fdopen(fd, "r");
> >> + char line[MAX_LINE_LEN];
> >> + int matches;
> >> + struct allocinfo_tag_data procfs_entry;
> >> +
> >> + if (!fp) {
> >> + ksft_print_msg("Failed to open " ALLOCINFO_PROC " for
> >> reading\n");
> >> + return 1;
> >> + }
> >> + memset(procfs_entries, 0, sizeof(*procfs_entries));
> >> + while (fgets(line, sizeof(line), fp) && procfs_entries->count <
> >> VEC_MAX_ENTRIES) {
> >> +
> >> + memset(&procfs_entry, 0, sizeof(procfs_entry));
> >> + matches = sscanf(line, "%llu %llu %[^:]:%llu func:%s",
> >> + &procfs_entry.counter.bytes,
> >> + &procfs_entry.counter.calls,
> >> + procfs_entry.tag.filename,
> >> + &procfs_entry.tag.lineno,
> >> + procfs_entry.tag.function);
> >> +
> >> + if (matches != 5)
> >> + continue;
> >> +
> >> + if (filter->mask & ALLOCINFO_FILTER_MASK_FILENAME) {
> >> + if (strncmp(procfs_entry.tag.filename,
> >> + filter->fields.filename, ALLOCINFO_STR_SIZE))
> >> + continue;
> >> + }
> >> + if (filter->mask & ALLOCINFO_FILTER_MASK_FUNCTION) {
> >> + if (strncmp(procfs_entry.tag.function,
> >> + filter->fields.function, ALLOCINFO_STR_SIZE))
> >> + continue;
> >> + }
> >> + if (filter->mask & ALLOCINFO_FILTER_MASK_LINENO) {
> >> + if (procfs_entry.tag.lineno != filter->fields.lineno)
> >> + continue;
> >> + }
> >> + if (filter->mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) {
> >> + if (procfs_entry.counter.bytes < filter->min_size)
> >> + continue;
> >> + }
> >> + if (filter->mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) {
> >> + if (procfs_entry.counter.bytes > filter->max_size)
> >> + continue;
> >> + }
> >> +
> >> + memcpy(&procfs_entries->tag[procfs_entries->count++], &procfs_entry,
> >> + sizeof(procfs_entry));
> >> + }
> >> + return 0;
> >> +}
> >> +
> >> +static enum ioctl_ret get_filtered_ioctl_entries(struct
> >> allocinfo_tag_data_vec *tags,
> >> + const struct allocinfo_filter *filter, int fd,
> >> + __u64 start_pos)
> >> +{
> >> + struct allocinfo_content_id start_cont_id, end_cont_id;
> >> + struct allocinfo_get_at get_at_params;
> >> + const int max_retries = 10;
> >> + int retry_count = 0;
> >> + int status;
> >> +
> >> + /*
> >> + * __allocinfo_get_content_id may return different values if a
> >> kernel module was loaded
> >> + * between the two calls. If that happens, the data gathered
> >> cannot be considered consistent
> >> + * and hence needs to be fetched again to avoid flakiness.
> >> + */
> >> + do {
> >> + if (__allocinfo_get_content_id(fd, &start_cont_id)) {
> >> + ksft_print_msg("allocinfo_get_content_id failed\n");
> >> + return IOCTL_FAILURE;
> >> + }
> >> +
> >> + memset(tags, 0, sizeof(*tags));
> >> + memset(&get_at_params, 0, sizeof(get_at_params));
> >> + memcpy(&get_at_params.filter, filter, sizeof(*filter));
> >> + get_at_params.pos = start_pos;
> >> + if (__allocinfo_get_at(fd, &get_at_params)) {
> >> + ksft_print_msg("allocinfo_get_at failed\n");
> >> + return IOCTL_FAILURE;
> >> + }
> >> + memcpy(&tags->tag[tags->count++], &get_at_params.data,
> >> sizeof(get_at_params.data));
> >> +
> >> + while (tags->count < VEC_MAX_ENTRIES &&
> >> + __allocinfo_get_next(fd, &tags->tag[tags->count]) == 0)
> >> + tags->count++;
> >> +
> >> + if (__allocinfo_get_content_id(fd, &end_cont_id)) {
> >> + ksft_print_msg("allocinfo_get_content_id failed\n");
> >> + return IOCTL_FAILURE;
> >> + }
> >> +
> >> + if (start_cont_id.id == end_cont_id.id) {
> >> + status = IOCTL_SUCCESS;
> >> + } else {
> >> + ksft_print_msg("allocinfo_get_content_id mismatch,
> >> retrying...\n");
> >> + status = IOCTL_INVALID_DATA;
> >> + }
> >> + } while (status == IOCTL_INVALID_DATA && retry_count++ <
> >> max_retries);
> >> +
> >> + return status;
> >> +}
> >> +
> >> +static int run_filter_test(const struct allocinfo_filter *filter)
> >> +{
> >> + int fd;
> >> + struct allocinfo_tag_data_vec *tags = malloc(sizeof(*tags));
> >> + struct allocinfo_tag_data_vec *procfs_entries =
> >> malloc(sizeof(*procfs_entries));
> >> + int ioctl_status;
> >> + int ret = KSFT_PASS;
> >> +
> >> + if (!tags || !procfs_entries) {
> >> + ksft_print_msg("Memory allocation failed.\n");
> >> + ret = KSFT_FAIL;
> >> + goto freemem;
> >> + }
> >> +
> >> + fd = open(ALLOCINFO_PROC, O_RDONLY);
> >> + if (fd < 0) {
> >> + ksft_exit_skip("Failed to open " ALLOCINFO_PROC ": %s\n",
> >> strerror(errno));
> >> + ret = KSFT_FAIL;
> >> + goto freemem;
> >> + }
> >> +
> >> + if (get_filtered_procfs_entries(procfs_entries, filter, fd)) {
> >> + ksft_print_msg("Error retrieving entries from "
> >> ALLOCINFO_PROC "\n");
> >> + ret = KSFT_FAIL;
> >> + goto exit;
> >> + }
> >> +
> >> + if (procfs_entries->count == 0) {
> >> + ksft_print_msg("No entries found in " ALLOCINFO_PROC ",
> >> skipping test\n");
> >> + ret = KSFT_SKIP;
> >> + goto exit;
> >> + }
> >> +
> >> + ioctl_status = get_filtered_ioctl_entries(tags, filter, fd, 0);
> >> + if (ioctl_status == IOCTL_INVALID_DATA) {
> >> + ksft_print_msg("Trouble retrieving valid IOCTL entries,
> >> skipping.\n");
> >> + ret = KSFT_SKIP;
> >> + goto exit;
> >> + }
> >> + if (ioctl_status == IOCTL_FAILURE) {
> >> + ksft_print_msg("Error retrieving IOCTL entries.\n");
> >> + ret = KSFT_FAIL;
> >> + goto exit;
> >> + }
> >> +
> >> + if (!match_entries(procfs_entries, tags, false, false, true,
> >> true, true))
> >> + ret = KSFT_FAIL;
> >> +
> >> +exit:
> >> + close(fd);
> >> +freemem:
> >> + free(tags);
> >> + free(procfs_entries);
> >> + return ret;
> >> +}
> >> +
> >> +static int test_filename_filter(void)
> >> +{
> >> + struct allocinfo_filter filter;
> >> + const char *target_filename = "mm/memory.c";
> >> +
> >> + memset(&filter, 0, sizeof(filter));
> >> + filter.mask |= ALLOCINFO_FILTER_MASK_FILENAME;
> >> + strncpy(filter.fields.filename, target_filename,
> >> ALLOCINFO_STR_SIZE);
> >> +
> >> + return run_filter_test(&filter);
> >> +}
> >> +
> >> +static int test_function_filter(void)
> >> +{
> >> + struct allocinfo_filter filter;
> >> + const char *target_function = "dup_mm";
> >> +
> >> + memset(&filter, 0, sizeof(filter));
> >> + filter.mask |= ALLOCINFO_FILTER_MASK_FUNCTION;
> >> + strncpy(filter.fields.function, target_function,
> >> ALLOCINFO_STR_SIZE);
> >> +
> >> + return run_filter_test(&filter);
> >> +}
> >> +
> >> +int main(int argc, char *argv[])
> >> +{
> >> + int ret;
> >> +
> >> + ksft_set_plan(2);
> >> +
> >> + ret = test_filename_filter();
> >> + if (ret == KSFT_SKIP)
> >> + ksft_test_result_skip("Skipping test_filename_filter\n");
> >> + else
> >> + ksft_test_result(ret == KSFT_PASS, "test_filename_filter\n");
> >> +
> >> + ret = test_function_filter();
> >> + if (ret == KSFT_SKIP)
> >> + ksft_test_result_skip("Skipping test_function_filter\n");
> >> + else
> >> + ksft_test_result(ret == KSFT_PASS, "test_function_filter\n");
> >> +
> >> + ksft_finished();
> >> +}
^ permalink raw reply
* Re: [PATCH v3 4/6] alloc_tag: add accuracy based filtering to ioctl
From: Abhishek Bapat @ 2026-06-09 20:53 UTC (permalink / raw)
To: Suren Baghdasaryan
Cc: Hao Ge, Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel,
linux-mm, Sourav Panda, Andrew Morton, Kent Overstreet
In-Reply-To: <CAJuCfpGbxO0zu_UAWCYNNv8RHgT=E4AF0tgG-kWoLXECOL3byA@mail.gmail.com>
On Tue, Jun 9, 2026 at 7:41 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, Jun 8, 2026 at 6:26 PM Hao Ge <hao.ge@linux.dev> wrote:
> >
> > Hi Suren
> >
> >
> > On 2026/6/9 04:55, Suren Baghdasaryan wrote:
> > > On Mon, Jun 8, 2026 at 1:25 AM Hao Ge <hao.ge@linux.dev> wrote:
> > >>
> > >> On 2026/6/8 14:22, Hao Ge wrote:
> > >>> Hi Abhishek
> > >>>
> > >>>
> > >>> On 2026/6/6 07:36, Abhishek Bapat wrote:
> > >>>> Extend the allocinfo filtering mechanism to allow users to filter tags
> > >>>> based on their accuracy.
> > >>>>
> > >>>> Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
> > >>>> ---
> > >>>> include/uapi/linux/alloc_tag.h | 3 +++
> > >>>> lib/alloc_tag.c | 8 ++++++++
> > >>>> 2 files changed, 11 insertions(+)
> > >>>>
> > >>>> diff --git a/include/uapi/linux/alloc_tag.h
> > >>>> b/include/uapi/linux/alloc_tag.h
> > >>>> index 0e648192df4d..42445bdb11c5 100644
> > >>>> --- a/include/uapi/linux/alloc_tag.h
> > >>>> +++ b/include/uapi/linux/alloc_tag.h
> > >>>> @@ -20,6 +20,7 @@ struct allocinfo_tag {
> > >>>> char function[ALLOCINFO_STR_SIZE];
> > >>>> char filename[ALLOCINFO_STR_SIZE];
> > >>>> __u64 lineno;
> > >>>> + __u64 inaccurate;
> > >>>
> > >>> I was wondering if it would make sense to define inaccurate as a flags
> > >>> field
> > >>>
> > >>> (e.g. __u64 flags with ALLOCINFO_TAG_F_INACCURATE (1 <<0)),
> > >>>
> > >>> so that only bit 0 is used today and the upper bits are reserved for
> > >>> future use,
> > >>>
> > >>> aligning with current kernel codebase.
> > >>>
> > >>> This design also allows for better extensibility if we need to
> > >>>
> > >>> add new flags for any reason in the future.
> > >>>
> > >>> We also need to add flag validity checks if we go this route.
> > >>>
> > >> And I've reviewed the issue reported by Sashiko, and I think it's valid.
> > >>
> > >> When we expand the allocinfo_tag_data structure
> > >>
> > >> struct allocinfo_tag_data{
> > >>
> > >> char modname[64];
> > >>
> > >> char function[64];
> > >>
> > >> char filename[64];
> > >>
> > >> __u64 lineno;
> > >>
> > >> __u64 inaccurate;
> > >>
> > >> __u64 bytes;
> > >>
> > >> __u64 calls;
> > >>
> > >> __u8 accurate;
> > >> /* padding */
> > >>
> > >> }
> > >>
> > >> I think user space may see two fields related to inaccuracy.
> > > Yes but one field (inside allocinfo_tag) is the input parameter which
> > > user provides to specify the filtering criteria and the other is the
> > > returned tag information. It's similar to any other tag attribute
> > > > which can be included in the filters.
> > >
> > >> How do you like these modifications?
> > >>
> > >>
> > >> diff --git a/include/uapi/linux/alloc_tag.h b/include/uapi/linux/alloc_tag.h
> > >> --- a/include/uapi/linux/alloc_tag.h
> > >> +++ b/include/uapi/linux/alloc_tag.h
> > >> @@ -20,7 +20,6 @@ struct allocinfo_tag {
> > >> char function[ALLOCINFO_STR_SIZE];
> > >> char filename[ALLOCINFO_STR_SIZE];
> > >> __u64 lineno;
> > >> - __u64 inaccurate;
> > >> };
> > >>
> > >> /* The alignment ensures 32-bit compatible interfaces are not broken */
> > >> @@ -40,7 +39,7 @@ enum {
> > >> ALLOCINFO_FILTER_FUNCTION,
> > >> ALLOCINFO_FILTER_FILENAME,
> > >> ALLOCINFO_FILTER_LINENO,
> > >> - ALLOCINFO_FILTER_INACCURATE,
> > >> + ALLOCINFO_FILTER_FLAGS,
> > >> ALLOCINFO_FILTER_MIN_SIZE,
> > >> ALLOCINFO_FILTER_MAX_SIZE,
> > >> __ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_MAX_SIZE
> > >> @@ -50,16 +49,20 @@ enum {
> > >> #define ALLOCINFO_FILTER_MASK_FUNCTION (1 <<
> > >> ALLOCINFO_FILTER_FUNCTION)
> > >> #define ALLOCINFO_FILTER_MASK_FILENAME (1 <<
> > >> ALLOCINFO_FILTER_FILENAME)
> > >> #define ALLOCINFO_FILTER_MASK_LINENO (1 << ALLOCINFO_FILTER_LINENO)
> > >> -#define ALLOCINFO_FILTER_MASK_INACCURATE (1 <<
> > >> ALLOCINFO_FILTER_INACCURATE)
> > >> +#define ALLOCINFO_FILTER_MASK_FLAGS (1 << ALLOCINFO_FILTER_FLAGS)
> > >> #define ALLOCINFO_FILTER_MASK_MIN_SIZE (1 <<
> > >> ALLOCINFO_FILTER_MIN_SIZE)
> > >> #define ALLOCINFO_FILTER_MASK_MAX_SIZE (1 <<
> > >> ALLOCINFO_FILTER_MAX_SIZE)
> > >>
> > >> #define ALLOCINFO_FILTER_MASKS \
> > >> ((1 << (__ALLOCINFO_FILTER_LAST + 1)) - 1)
> > >>
> > >> +#define ALLOCINFO_FILTER_F_INACCURATE (1ULL << 0)
> > >> +#define ALLOCINFO_FILTER_FLAGS_ALL ALLOCINFO_FILTER_F_INACCURATE
> > >> +
> > >> struct allocinfo_filter {
> > >> __u64 mask; /* bitmask of the filter fields used */
> > >> struct allocinfo_tag fields;
> > >> + __u64 flags; /* bitmask of ALLOCINFO_FILTER_F_* */
> > >> __u64 min_size;
> > >> __u64 max_size;
> > >> };
> > >> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > >> --- a/lib/alloc_tag.c
> > >> +++ b/lib/alloc_tag.c
> > >> @@ -249,8 +249,6 @@ static bool matches_filter(struct codetag *ct,
> > >> struct allocinfo_filter *filter,
> > >> struct alloc_tag_counters *counters,
> > >> bool *fetched_counters)
> > >> {
> > >> - bool inaccurate;
> > >> -
> > >> if (!filter || !filter->mask)
> > >> return true;
> > >>
> > >> @@ -277,10 +275,11 @@ static bool matches_filter(struct codetag *ct,
> > >> struct allocinfo_filter *filter,
> > >> ct->lineno != filter->fields.lineno)
> > >> return false;
> > >>
> > >> - if (filter->mask & ALLOCINFO_FILTER_MASK_INACCURATE) {
> > >> - inaccurate = !!(ct->flags & CODETAG_FLAG_INACCURATE);
> > >> - if (inaccurate != !!(filter->fields.inaccurate))
> > >> - return false;
> > >> + if (filter->mask & ALLOCINFO_FILTER_MASK_FLAGS) {
> > >> + if (filter->flags & ALLOCINFO_FILTER_F_INACCURATE) {
> > >> + if (!(ct->flags & CODETAG_FLAG_INACCURATE))
> > > How would you filter records which have only accurate data?
> >
> >
> > Sorry, I overlooked this case.
> >
> > Since allocinfo_tag_data exposes both inaccurate (from allocinfo_tag) and
> >
> > accurate (from allocinfo_counter), userspace developers might mistakenly
> > read
> >
> > inaccurate instead of accurate when checking accuracy.
> >
> > How about we add a comment to clarify?
> >
> > struct allocinfo_tag {
> >
> > /* ... */
> >
> > __u64 lineno;
> >
> > /* filter criteria only; see allocinfo_counter.accurate for actual
> > accuracy */
> >
> > __u64 inaccurate;
>
> I think we had comments showing which block of parameters are inputs
> and which ones are outputs but I'm not opposed to an additional
> reminder here.
>
Ack, I'll include the recommended comment.
> >
> > };
> >
> >
> > LGTM for the rest.
> >
> >
> > Thanks
> >
> > Best Regards
> >
> > Hao
> >
> > > Overall I would prefer ALLOCINFO_FILTER_MASK_INACCURATE rather than
> > > ALLOCINFO_FILTER_MASK_FLAGS. The fact that this attribute is a
> > > > single-bit flag is a technical detail. It's still a tag attribute
> > > like file and module names and IMO deserves its own filter.
> > >
> > >
> > >
> > >> + return false;
> > >> + }
> > >> }
> > >>
> > >> if (filter->mask & (ALLOCINFO_FILTER_MASK_MIN_SIZE |
> > >> ALLOCINFO_FILTER_MASK_MAX_SIZE)) {
> > >> @@ -318,6 +317,10 @@ static int allocinfo_ioctl_get_at(struct seq_file
> > >> *m, void __user *arg)
> > >> if (params.filter.mask & ~ALLOCINFO_FILTER_MASKS)
> > >> return -EINVAL;
> > >>
> > >> + if ((params.filter.mask & ALLOCINFO_FILTER_MASK_FLAGS) &&
> > >> + (params.filter.flags & ~ALLOCINFO_FILTER_FLAGS_ALL))
> > >> + return -EINVAL;
> > >> +
> > >> if ((params.filter.mask & ALLOCINFO_FILTER_MASK_MIN_SIZE) &&
> > >> (params.filter.mask & ALLOCINFO_FILTER_MASK_MAX_SIZE) &&
> > >> params.filter.min_size > params.filter.max_size)
> > >>
> > >>
> > >> Thanks
> > >>
> > >> Best Regards
> > >>
> > >> Hao
> > >>
> > >>
> > >>> Thanks
> > >>>
> > >>> Best Regards
> > >>>
> > >>> Hao
> > >>>
> > >>>
> > >>>> };
> > >>>> /* The alignment ensures 32-bit compatible interfaces are not
> > >>>> broken */
> > >>>> @@ -39,6 +40,7 @@ enum {
> > >>>> ALLOCINFO_FILTER_FUNCTION,
> > >>>> ALLOCINFO_FILTER_FILENAME,
> > >>>> ALLOCINFO_FILTER_LINENO,
> > >>>> + ALLOCINFO_FILTER_INACCURATE,
> > >>>> ALLOCINFO_FILTER_MIN_SIZE,
> > >>>> ALLOCINFO_FILTER_MAX_SIZE,
> > >>>> __ALLOCINFO_FILTER_LAST = ALLOCINFO_FILTER_MAX_SIZE
> > >>>> @@ -48,6 +50,7 @@ enum {
> > >>>> #define ALLOCINFO_FILTER_MASK_FUNCTION (1 <<
> > >>>> ALLOCINFO_FILTER_FUNCTION)
> > >>>> #define ALLOCINFO_FILTER_MASK_FILENAME (1 <<
> > >>>> ALLOCINFO_FILTER_FILENAME)
> > >>>> #define ALLOCINFO_FILTER_MASK_LINENO (1 <<
> > >>>> ALLOCINFO_FILTER_LINENO)
> > >>>> +#define ALLOCINFO_FILTER_MASK_INACCURATE (1 <<
> > >>>> ALLOCINFO_FILTER_INACCURATE)
> > >>>> #define ALLOCINFO_FILTER_MASK_MIN_SIZE (1 <<
> > >>>> ALLOCINFO_FILTER_MIN_SIZE)
> > >>>> #define ALLOCINFO_FILTER_MASK_MAX_SIZE (1 <<
> > >>>> ALLOCINFO_FILTER_MAX_SIZE)
> > >>>> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > >>>> index ddc6946f56ab..cbcd12c4ef9c 100644
> > >>>> --- a/lib/alloc_tag.c
> > >>>> +++ b/lib/alloc_tag.c
> > >>>> @@ -249,6 +249,8 @@ static bool matches_filter(struct codetag *ct,
> > >>>> struct allocinfo_filter *filter,
> > >>>> struct alloc_tag_counters *counters,
> > >>>> bool *fetched_counters)
> > >>>> {
> > >>>> + bool inaccurate;
> > >>>> +
> > >>>> if (!filter || !filter->mask)
> > >>>> return true;
> > >>>> @@ -275,6 +277,12 @@ static bool matches_filter(struct codetag *ct,
> > >>>> struct allocinfo_filter *filter,
> > >>>> ct->lineno != filter->fields.lineno)
> > >>>> return false;
> > >>>> + if (filter->mask & ALLOCINFO_FILTER_MASK_INACCURATE) {
> > >>>> + inaccurate = !!(ct->flags & CODETAG_FLAG_INACCURATE);
> > >>>> + if (inaccurate != !!(filter->fields.inaccurate))
> > >>>> + return false;
> > >>>> + }
> > >>>> +
> > >>>> if (filter->mask & (ALLOCINFO_FILTER_MASK_MIN_SIZE |
> > >>>> ALLOCINFO_FILTER_MASK_MAX_SIZE)) {
> > >>>> if (!*fetched_counters) {
> > >>>> *counters = allocinfo_prefetch_counters(ct);
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox