* [PATCH] tracing: Switch trace_recursion_record.c code over to use guard()
From: Yash Suthar @ 2026-05-02 17:47 UTC (permalink / raw)
To: rostedt
Cc: mhiramat, mathieu.desnoyers, linux-kernel, linux-trace-kernel,
skhan, me, Yash Suthar
Switch mutex_lock()/mutex_unlock() to guard().
also drop the ret local variable and return directly.
Signed-off-by: Yash Suthar <yashsuthar983@gmail.com>
---
kernel/trace/trace_recursion_record.c | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c
index 784fe1fbb866..bac4bc844ccd 100644
--- a/kernel/trace/trace_recursion_record.c
+++ b/kernel/trace/trace_recursion_record.c
@@ -180,9 +180,8 @@ static const struct seq_operations recursed_function_seq_ops = {
static int recursed_function_open(struct inode *inode, struct file *file)
{
- int ret = 0;
+ guard(mutex)(&recursed_function_lock);
- mutex_lock(&recursed_function_lock);
/* If this file was opened for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
/* disable updating records */
@@ -194,10 +193,9 @@ static int recursed_function_open(struct inode *inode, struct file *file)
atomic_set(&nr_records, 0);
}
if (file->f_mode & FMODE_READ)
- ret = seq_open(file, &recursed_function_seq_ops);
- mutex_unlock(&recursed_function_lock);
+ return seq_open(file, &recursed_function_seq_ops);
- return ret;
+ return 0;
}
static ssize_t recursed_function_write(struct file *file,
--
2.43.0
^ permalink raw reply related
* [PATCH] tracing: probes: remove unused variable
From: Martin Kaiser @ 2026-05-02 13:57 UTC (permalink / raw)
To: Steven Rostedt, Masami Hiramatsu
Cc: Markus Schneider-Pargmann, linux-trace-kernel, linux-kernel,
Martin Kaiser
params is always NULL in traceprobe_expand_meta_args, it can be removed.
Signed-off-by: Martin Kaiser <martin@kaiser.cx>
---
Would it be better to return ERR_PTR(-EOPNOTSUPP) instead of NULL, similar to
what is done in other places where NOSUP_BTFARG is logged? This would make the
parsing fail if $arg* is used without BTF support. At the moment, we skip the
$arg*...
kernel/trace/trace_probe.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index e0d3a0da26af..b627093a941e 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1729,7 +1729,6 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
int *new_argc, char *buf, int bufsize,
struct traceprobe_parse_context *ctx)
{
- const struct btf_param *params = NULL;
int i, j, n, used, ret, args_idx = -1;
const char **new_argv __free(kfree) = NULL;
@@ -1747,7 +1746,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
if (args_idx != -1) {
/* $arg* requires BTF info */
trace_probe_log_err(0, NOSUP_BTFARG);
- return (const char **)params;
+ return NULL;
}
*new_argc = argc;
return NULL;
--
2.43.7
^ permalink raw reply related
* [PATCH v3] trace_printk: remove local variable for argument detection
From: Qian-Yu Lin @ 2026-05-02 7:55 UTC (permalink / raw)
To: rostedt
Cc: mhiramat, david.laight.linux, linux-kernel, linux-trace-kernel,
Qian-Yu Lin
In-Reply-To: <20260429165707.7020-1-tiffany019230@gmail.com>
The trace_printk() macro uses a local variable _______STR to detect
whether variadic arguments are present. This name can shadow outer
variables.
Replace the local variable with sizeof applied directly to the
stringified arguments:
if (sizeof __stringify((__VA_ARGS__)) > 3)
This eliminates the shadowing risk entirely without introducing
any additional includes or local variables.
Verified with objdump on samples/trace_printk that all four cases
branch correctly: __trace_bputs, __trace_puts, __trace_bprintk,
and __trace_printk.
Suggested-by: David Laight <david.laight.linux@gmail.com>
Signed-off-by: Qian-Yu Lin <tiffany019230@gmail.com>
---
include/linux/trace_printk.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
index 2670ec7f4262..3d54f440dccf 100644
--- a/include/linux/trace_printk.h
+++ b/include/linux/trace_printk.h
@@ -86,8 +86,7 @@ do { \
#define trace_printk(fmt, ...) \
do { \
- char _______STR[] = __stringify((__VA_ARGS__)); \
- if (sizeof(_______STR) > 3) \
+ if (sizeof __stringify((__VA_ARGS__)) > 3) \
do_trace_printk(fmt, ##__VA_ARGS__); \
else \
trace_puts(fmt); \
--
2.43.0
^ permalink raw reply related
* Re: [PATCH] trace_printk: replace _______STR with __UNIQUE_ID(STR)
From: Qian-Yu Lin @ 2026-05-02 7:37 UTC (permalink / raw)
To: David Laight; +Cc: rostedt, mhiramat, linux-kernel, linux-trace-kernel
In-Reply-To: <20260501221315.1f709d6d@pumpkin>
On Fri, May 01, 2026 at 10:13:15PM +0100, David Laight wrote:
> On Fri, 1 May 2026 22:40:17 +0800
> Qian-Yu Lin <tiffany019230@gmail.com> wrote:
>
> ...
> > Yes. I measured compile time of kernel/trace/ring_buffer_benchmark.o
> > after make clean on an x86_64 machine running Ubuntu 24.04 LTS:
> >
> > - Original _______STR: 49.8s
> > - v1 with __UNIQUE_ID (compiler.h): 53.5s
> > - compound literal (no extra include): 33.2s
>
> That difference looks far to big to me.
> And the times are far too large to be measuring the actual compile time.
>
You're right, my earlier measurements included dependency rebuilds
after make clean. I re-measured using touch to isolate the actual
compile time of ring_buffer_benchmark.o on x86_64:
- Original ___STR: 1.757s
- v1 with __UNIQUE_ID (compiler.h): 1.836s
- sizeof __stringify (your suggestion): 1.781s
> >
> > I propose using a compound literal in v2, which eliminates the local
> > variable entirely and requires no extra include:
> >
> > #define trace_printk(fmt, ...) \
> > do { \
> > if (sizeof((char[]) \
> > {__stringify((__VA_ARGS__))}) > 3) \
> > do_trace_printk(fmt, ##__VA_ARGS__); \
>
> There has to be a better way to align that code.
> Although you should be able to use:
> if (sizeof __stringify((__VA_ARGS__)) > 3)
> (I've omitted one set of parenthesis for clarity)
>
> You could change __stringify() to work with __VA_ARGS__ the you don't need
> the extra (); this works fine:
> #define _x(...) #__VA_ARGS__
> #define x(...) _x(__VA_ARGS__)
> #define z abcd
> int a = sizeof x(z, v); /* 8 */
> See: https://godbolt.org/z/zo4h4nr9b
>
> -- David
>
Yes, this works. I verified with objdump on the
samples/trace_printk module that all four cases branch correctly:
__trace_bputs, __trace_puts, __trace_bprintk, and __trace_printk.
I'll use this form in v3 since it's simpler than the compound literal.
> > else \
> > trace_puts(fmt); \
> > } while (0)
> >
> > This fully eliminates the shadowing risk without any compile overhead.
> >
> > Qian-Yu
> >
> > >
> > >
> > > > #include <linux/compiler_attributes.h>
> > > > #include <linux/instruction_pointer.h>
> > > > #include <linux/stddef.h>
> > > > @@ -84,15 +85,18 @@ do { \
> > > > * let gcc optimize the rest.
> > > > */
> > > >
> > > > -#define trace_printk(fmt, ...) \
> > > > +#define ___trace_printk(fmt, str, ...) \
> > > > do { \
> > > > - char _______STR[] = __stringify((__VA_ARGS__)); \
> > > > - if (sizeof(_______STR) > 3) \
> > > > + char str[] = __stringify((__VA_ARGS__)); \
> > > > + if (sizeof(str) > 3) \
> > > > do_trace_printk(fmt, ##__VA_ARGS__); \
> > > > else \
> > > > trace_puts(fmt); \
> > > > } while (0)
> > > >
> > > > +#define trace_printk(fmt, ...) \
> > > > + ___trace_printk(fmt, __UNIQUE_ID(str), ##__VA_ARGS__)
> > > > +
> > > > #define do_trace_printk(fmt, args...) \
> > > > do { \
> > > > static const char *trace_printk_fmt __used \
> > >
>
^ permalink raw reply
* Re: [PATCH RFC v5 00/53] guest_memfd: In-place conversion support
From: Ackerley Tng @ 2026-05-01 22:21 UTC (permalink / raw)
To: Michael Roth
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, oupton, pankaj.gupta, qperret,
rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
aneesh.kumar, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
Jonathan Corbet, Shuah Khan, Shuah Khan, Vishal Annapurve,
Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
Baoquan He, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
linux-coco, Jacob Xu, Darwin Guo
In-Reply-To: <CAEvNRgHRpvsEjtr1A_Qz3d4oMEaffTxESavrZ73Jtt6OobCwhA@mail.gmail.com>
Ackerley Tng <ackerleytng@google.com> writes:
>
> [...snip...]
>
>
> TLDR:
>
> + PRESERVE == guarantee that the process of setting memory attributes
> doesn't change memory contents.
> + implementation == do nothing in most cases, except -EOPNOTSUPP for
> to-shared on TDX, since unmapping is a required part of setting
> memory attributes to private, and a TDX side effect of unmapping
> is zeroing memory,
-EOPNOTSUPP will only be for TDX, not SNP.
> + ZERO == guarantee that the process of setting memory attributes zeroes
> memory contents.
> + implementation == memset(zero) in most cases. For TDX, a future
> optimization exists, where memset() can be skipped for pages that
> were mapped in Secure EPTs before conversion
> + UNSPECIFIED == no guarantees
> + implementation == guest_memfd does nothing explicitly about memory
> contents. The implementation is pretty much the same as PRESERVE
> except guest_memfd won't take into account vendor-specific side
> effects of the process of conversion. Except for the test vehicle
> KVM_X86_SW_PROTECTED_VMS, where memory is scrambled.
>
Found another use case internally for pre-finalize, SNP, to-shared,
PRESERVE, which works with the above smaller scope.
During SNP_LAUNCH_UPDATE, when inserting a CPUID page, the firmware will
check that the CPUID values would not lead to an insecure guest
state. SNP_LAUNCH_UPDATE will fail with an error and the page remains
shared in the RMP table.
Here's the proposed flow in the userspace VMM:
1. Load CPUID in shared guest_memfd memory
2. SET_MEMORY_ATTRIBUTES(PRIVATE, PRESERVE)
3. SNP_LAUNCH_UPDATE => get error since CPUID was insecure
4. SET_MEMORY_ATTRIBUTES(SHARED, PRESERVE)
5. Read shared guest_memfd memory, error if VMM disagrees
6. SET_MEMORY_ATTRIBUTES(PRIVATE, PRESERVE)
7. SNP_LAUNCH_UPDATE => successful, since CPUID is now corrected
Does that seem ok?
>>>
>>> [...snip...]
>>>
^ permalink raw reply
* Re: [PATCH] trace_printk: replace _______STR with __UNIQUE_ID(STR)
From: David Laight @ 2026-05-01 21:13 UTC (permalink / raw)
To: Qian-Yu Lin; +Cc: Steven Rostedt, mhiramat, linux-kernel, linux-trace-kernel
In-Reply-To: <afS7UUAfzgiSzEHv@nova>
On Fri, 1 May 2026 22:40:17 +0800
Qian-Yu Lin <tiffany019230@gmail.com> wrote:
...
> Yes. I measured compile time of kernel/trace/ring_buffer_benchmark.o
> after make clean on an x86_64 machine running Ubuntu 24.04 LTS:
>
> - Original _______STR: 49.8s
> - v1 with __UNIQUE_ID (compiler.h): 53.5s
> - compound literal (no extra include): 33.2s
That difference looks far to big to me.
And the times are far too large to be measuring the actual compile time.
>
> I propose using a compound literal in v2, which eliminates the local
> variable entirely and requires no extra include:
>
> #define trace_printk(fmt, ...) \
> do { \
> if (sizeof((char[]) \
> {__stringify((__VA_ARGS__))}) > 3) \
> do_trace_printk(fmt, ##__VA_ARGS__); \
There has to be a better way to align that code.
Although you should be able to use:
if (sizeof __stringify((__VA_ARGS__)) > 3)
(I've omitted one set of parenthesis for clarity)
You could change __stringify() to work with __VA_ARGS__ the you don't need
the extra (); this works fine:
#define _x(...) #__VA_ARGS__
#define x(...) _x(__VA_ARGS__)
#define z abcd
int a = sizeof x(z, v); /* 8 */
See: https://godbolt.org/z/zo4h4nr9b
-- David
> else \
> trace_puts(fmt); \
> } while (0)
>
> This fully eliminates the shadowing risk without any compile overhead.
>
> Qian-Yu
>
> >
> >
> > > #include <linux/compiler_attributes.h>
> > > #include <linux/instruction_pointer.h>
> > > #include <linux/stddef.h>
> > > @@ -84,15 +85,18 @@ do { \
> > > * let gcc optimize the rest.
> > > */
> > >
> > > -#define trace_printk(fmt, ...) \
> > > +#define ___trace_printk(fmt, str, ...) \
> > > do { \
> > > - char _______STR[] = __stringify((__VA_ARGS__)); \
> > > - if (sizeof(_______STR) > 3) \
> > > + char str[] = __stringify((__VA_ARGS__)); \
> > > + if (sizeof(str) > 3) \
> > > do_trace_printk(fmt, ##__VA_ARGS__); \
> > > else \
> > > trace_puts(fmt); \
> > > } while (0)
> > >
> > > +#define trace_printk(fmt, ...) \
> > > + ___trace_printk(fmt, __UNIQUE_ID(str), ##__VA_ARGS__)
> > > +
> > > #define do_trace_printk(fmt, args...) \
> > > do { \
> > > static const char *trace_printk_fmt __used \
> >
^ permalink raw reply
* Re: [PATCH] mm/madvise: preserve uprobe breakpoints across MADV_DONTNEED
From: David Hildenbrand (Arm) @ 2026-05-01 19:25 UTC (permalink / raw)
To: Daniel Walker (danielwa), Jann Horn
Cc: Oleg Nesterov,
Darko Tominac -X (dtominac - GLOBALLOGIC INC at Cisco),
Masami Hiramatsu, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
James Clark, Andrew Morton, Liam R. Howlett, Lorenzo Stoakes,
Vlastimil Babka, xe-linux-external(mailer list),
linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
linux-perf-users@vger.kernel.org, linux-mm@kvack.org
In-Reply-To: <afOsxsC6MKhkuqKD@goliath>
On 4/30/26 21:25, Daniel Walker (danielwa) wrote:
> On Thu, Apr 30, 2026 at 05:22:18PM +0200, Jann Horn wrote:
>> On Wed, Apr 29, 2026 at 11:11 PM Daniel Walker (danielwa)
>> <danielwa@cisco.com> wrote:
>>>
>>>
>>> Shouldn't there be some sort of compensation or notification for this, or is each person that
>>> hits this suppose to just scratch their head and send a patch that's rejected?
>>
>> I guess we could add a pr_warn_once() that warns when
>> madvise(MADV_DONTNEED) is called on a read+execute file mapping,
Private mapping, yes. MADV_DONTNEED indeed interacts poorly here.
uprobe registration indeed fails if VM_WRITE is set (valid_vma), but nothing
stops the VMA from getting mprotect'ed later I guess, to allow for write access.
We could try marking a VMA that has uprobes, to then pr_warn_once() of
MADV_DONTNEED is done on such a VMA. It wouldn't sort out ptrace access.
>> and/or (as David said) add an explicit note in the madvise() manpage
>> about how that can interfere with software breakpoints and uprobes?
I guess not just software breakpoints, but any modifications done by a debugger.
I guess for read+execute file mapping we would expect these to be software
breakpoints.
>
> It does feel like it's the debuggers problem. The application doesn't know it's
> getting debugged. So the application does whatever it does. If GDB is debugging
> an application it should assume there's a problematic madvise() call which will
> hurt/stop the debugging from happening. It should endeavor to prevent that
> from happening. There are options in userspace to prevent it from happening. I'm
> sure madvise() is not the only thing GDB has to worry about w.r.t. screwing up
> the debugging.
>
> Noting it in the man page seems reasonable.
I assume the bigger problem here is that MADV_DONTNEED was used for memory
reclaim, when in fact, it shouldn't be used for that.
For shared mappings, the man page even documents: "MADV_DONTNEED might not lead
to immediate freeing of the pages in the range. The kernel is free to delay
freeing the pages until an appropriate moment.".
MADV_PAGEOUT is better for reclaim, but it has its limitations when it comes to
pages shared with other processes.
--
Cheers,
David
^ permalink raw reply
* Re: [PATCH 01/11] media: Move visl traces to v4l2-core
From: Steven Rostedt @ 2026-05-01 19:07 UTC (permalink / raw)
To: Detlev Casanova
Cc: linux-kernel, Nicolas Dufresne, Benjamin Gaignard, Philipp Zabel,
Mauro Carvalho Chehab, Heiko Stuebner, Daniel Almeida,
Masami Hiramatsu, Mathieu Desnoyers, Hans Verkuil,
Laurent Pinchart, Ricardo Ribalda, Yunke Cao, Sakari Ailus,
Pavan Bobba, James Cowgill, Ma Ke, Jacopo Mondi, Daniel Scally,
linux-media, linux-rockchip, linux-arm-kernel, linux-trace-kernel,
kernel
In-Reply-To: <20260212162328.192217-2-detlev.casanova@collabora.com>
Sorry, I didn't look at these patches at the time they were posted, but
when I saw you reply recently with:
"On the userspace side, each trace events can be enabled separately and
even filtered (although I'm not sure filtering is good for performance,
has it has to match each event with a regex)."
I had to see what you meant by regex, because no tracer should be using
regex for filtering.
On Thu, 12 Feb 2026 11:23:18 -0500
Detlev Casanova <detlev.casanova@collabora.com> wrote:
> +
> +DECLARE_EVENT_CLASS(v4l2_ctrl_mpeg2_seq_tmpl,
> + TP_PROTO(const struct v4l2_ctrl_mpeg2_sequence *s),
> + TP_ARGS(s),
> + TP_STRUCT__entry(__field_struct(struct v4l2_ctrl_mpeg2_sequence, s)),
> + TP_fast_assign(__entry->s = *s;),
What the heck! You are copying an entire structure onto the ring buffer to
print just a portion of it? This is really a waste of ring buffer, and also
prevents you from doing any real filtering.
You do realize that you can filter on fields (if they are defined as normal
fields).
For example,
# cd /sys/kernel/tracing
# echo 'height > 20 && width > 30' > events/visl_fwht_controls/v4l2_ctrl_fwht_params/filter
# echo 1 > events/visl_fwht_controls/v4l2_ctrl_fwht_params/enable
And that will only trace that event where the height field is greater than
20 and the width field is greater than 30. It's converted into a fast array
to do the filtering. No regex involved.
Also, there's a library to read the raw events from the ring buffers where
you can do filtering on the read side too:
https://trace-cmd.org/Documentation/libtracefs/
https://trace-cmd.org/Documentation/libtraceevent/
-- Steve
> + TP_printk("\nhorizontal_size %u\nvertical_size %u\nvbv_buffer_size %u\n"
> + "profile_and_level_indication %u\nchroma_format %u\nflags %s\n",
> + __entry->s.horizontal_size,
> + __entry->s.vertical_size,
> + __entry->s.vbv_buffer_size,
> + __entry->s.profile_and_level_indication,
> + __entry->s.chroma_format,
> + __print_flags(__entry->s.flags, "|",
> + {V4L2_MPEG2_SEQ_FLAG_PROGRESSIVE, "PROGRESSIVE"})
> + )
> +);
> +
^ permalink raw reply
* Re: [PATCH 00/11] v4l2: Add tracing for stateless codecs
From: Detlev Casanova @ 2026-05-01 18:07 UTC (permalink / raw)
To: Nicolas Dufresne, linux-kernel
Cc: Benjamin Gaignard, Philipp Zabel, Mauro Carvalho Chehab,
Heiko Stuebner, Daniel Almeida, Steven Rostedt, Masami Hiramatsu,
Mathieu Desnoyers, Hans Verkuil, Laurent Pinchart,
Ricardo Ribalda, Yunke Cao, Sakari Ailus, Pavan Bobba,
James Cowgill, Ma Ke, Jacopo Mondi, Daniel Scally, linux-media,
linux-rockchip, linux-arm-kernel, linux-trace-kernel, kernel
In-Reply-To: <4c216c195d237e71188dfbdd54bbc1e234eff434.camel@collabora.com>
Hi Nicolas,
On 4/28/26 15:52, Nicolas Dufresne wrote:
> Le jeudi 12 février 2026 à 11:23 -0500, Detlev Casanova a écrit :
>> Hi !
>>
>> This patchset aims to improve codec event tracing in v4l2.
>>
>> The traces added in visl by Daniel Almeida are moved to the global trace
>> events and slightly reworked to be printed in a more consistent format.
>>
>> To each trace are also added a tgid and fd fields, helping userspace track
>> different decoding sessions (contexts) based on the given file descriptor
>> used by the given process id.
>>
>> Also for better tracking, stream on and stream off events are added as
>> well as HW run and HW done events to track decoder core usage.
>>
>> Finally, add a show_fdinfo callback on video device files, allowing drivers
>> to expose usage information.
>> Currently only used for frame buffer memory usage.
>>
>> The main focus is to be able to generate perfetto traces to show VPU usage,
>> a perfetto producer using this can be found at [1].
>>
>> [1]:
>> https://gitlab.collabora.com/detlev/hantro-perf/-/tree/hantro-improved-info
> I would recommend renaming this repo, as its not really evolving toward
> something truly hantro specific anymore, same goes for the code below appart
> from the run/done traces and the fdinfo that has to be opted in.
Yes, hantro was the first focus, but I've also been integrating rkvdec
into this.
>
> There is effectively a bit of an overlap with the v4l2-tracer, but there is also
> advantages having it in the kernel, as you source all the event, for the entire
> system, in one place, so I'm fine with that. Considering the possible large
> overhead of the full trace, I'd like to see the ability to filter what we want
> to trace, with some level of granularity.
Tracing is by default disable and and has a really small performance
footprint when disabled.
On the userspace side, each trace events can be enabled separately and
even filtered (although I'm not sure filtering is good for performance,
has it has to match each event with a regex).
> Maybe we only need decode_params for
> specific use case to be debugged, and don't care about large scaling list/matrix
> ? I would also like to see some Documentation on the tracing, so that its usage
> is not only explained in a tool.
Agreed.
> To Hans, there is nice tools idea in there, the perfetto producer is simply C++,
> and the v4l2top utility is Rust. Would you see these as tools v4l2-utils ? For
> the rust part, we can either leave the build independent, and cargo would be
> used to build and run, or we can implement a meson wrapper around cargo. But I
> don't believe its a good idea to use native rustc support in meson for that one
> because of the large number of third party crate needed.
>
> I like the direction, hope the feedback suite you well.
>
> Nicolas
>
>> Detlev Casanova (11):
>> media: Move visl traces to v4l2-core
>> media: Reformat v4l2-requests trace event printk
>> media: Add tgid and fd fields in v4l2_fh struct
>> media: Add tgid and fd to the v4l2-requests trace fields
>> media: Add missing types to v4l2_ctrl_ptr
>> media: Trace the stateless controls when set in v4l2-ctrls-core.c
>> media: Add stream on/off traces and run them in the ioctl
>> media: Add HW run/done trace events
>> media: hantro: Add v4l2_hw run/done traces
>> media: v4l2: Add callback for show_fdinfo
>> media: hantro: Add fdinfo callback
>>
>> drivers/media/platform/verisilicon/hantro.h | 2 +
>> .../media/platform/verisilicon/hantro_drv.c | 25 +
>> .../media/platform/verisilicon/hantro_v4l2.c | 10 +-
>> .../verisilicon/rockchip_vpu981_regs.h | 1 +
>> .../platform/verisilicon/rockchip_vpu_hw.c | 4 +
>> drivers/media/test-drivers/visl/Makefile | 2 +-
>> drivers/media/test-drivers/visl/visl-dec.c | 76 -
>> .../media/test-drivers/visl/visl-trace-av1.h | 314 ---
>> .../media/test-drivers/visl/visl-trace-fwht.h | 66 -
>> .../media/test-drivers/visl/visl-trace-h264.h | 349 ----
>> .../media/test-drivers/visl/visl-trace-hevc.h | 464 -----
>> .../test-drivers/visl/visl-trace-mpeg2.h | 99 -
>> .../test-drivers/visl/visl-trace-points.c | 11 -
>> .../media/test-drivers/visl/visl-trace-vp8.h | 156 --
>> .../media/test-drivers/visl/visl-trace-vp9.h | 292 ---
>> drivers/media/v4l2-core/v4l2-ctrls-api.c | 10 +
>> drivers/media/v4l2-core/v4l2-ctrls-core.c | 114 +
>> drivers/media/v4l2-core/v4l2-dev.c | 10 +
>> drivers/media/v4l2-core/v4l2-fh.c | 1 +
>> drivers/media/v4l2-core/v4l2-ioctl.c | 37 +-
>> drivers/media/v4l2-core/v4l2-trace.c | 48 +
>> include/media/v4l2-ctrls.h | 19 +
>> include/media/v4l2-dev.h | 1 +
>> include/media/v4l2-fh.h | 4 +
>> include/trace/events/v4l2.h | 58 +
>> include/trace/events/v4l2_requests.h | 1836 +++++++++++++++++
>> 26 files changed, 2178 insertions(+), 1831 deletions(-)
>> delete mode 100644 drivers/media/test-drivers/visl/visl-trace-av1.h
>> delete mode 100644 drivers/media/test-drivers/visl/visl-trace-fwht.h
>> delete mode 100644 drivers/media/test-drivers/visl/visl-trace-h264.h
>> delete mode 100644 drivers/media/test-drivers/visl/visl-trace-hevc.h
>> delete mode 100644 drivers/media/test-drivers/visl/visl-trace-mpeg2.h
>> delete mode 100644 drivers/media/test-drivers/visl/visl-trace-points.c
>> delete mode 100644 drivers/media/test-drivers/visl/visl-trace-vp8.h
>> delete mode 100644 drivers/media/test-drivers/visl/visl-trace-vp9.h
>> create mode 100644 include/trace/events/v4l2_requests.h
^ permalink raw reply
* Re: [PATCH v6] tracing: Bound synthetic-field strings with seq_buf
From: Steven Rostedt @ 2026-05-01 17:26 UTC (permalink / raw)
To: Tom Zanussi
Cc: Pengpeng Hou, Masami Hiramatsu, Mathieu Desnoyers,
linux-trace-kernel, linux-kernel
In-Reply-To: <097ea2386bfea803f7c33d915f6990b6338e7821.camel@kernel.org>
On Fri, 01 May 2026 12:10:39 -0500
Tom Zanussi <zanussi@kernel.org> wrote:
> > + /* Terminate synthetic_name with a NUL. */
> > + seq_buf_str(&s);
> > +
>
> This doesn't hurt, but is it really needed? I think seq_buf_printf()
> already null-terminates.
Technically it does, but it's not guaranteed to do so. That is, only
seq_buf_str() defines terminating the string in the API.
-- Steve
^ permalink raw reply
* Re: [PATCH v6] tracing: Bound synthetic-field strings with seq_buf
From: Tom Zanussi @ 2026-05-01 17:10 UTC (permalink / raw)
To: Pengpeng Hou, Steven Rostedt, Masami Hiramatsu
Cc: Mathieu Desnoyers, Tom Zanussi, linux-trace-kernel, linux-kernel
In-Reply-To: <20260430043350.57928-1-pengpeng@iscas.ac.cn>
Hi Pengpeng,
Looks fine to me, just a couple minor nits below..
On Thu, 2026-04-30 at 12:33 +0800, Pengpeng Hou wrote:
> The synthetic field helpers build a prefixed synthetic variable name and
> a generated hist command in fixed MAX_FILTER_STR_VAL buffers. The
> current code appends those strings with raw strcat(), so long key lists,
> field names, or saved filters can run past the end of the staging
> buffers.
>
> Build both strings with seq_buf and propagate -E2BIG if either the
> synthetic variable name or the generated command exceeds
> MAX_FILTER_STR_VAL. This keeps the existing tracing-side limit while
> using the helper intended for bounded command construction.
>
> Fixes: 02205a6752f2 ("tracing: Add support for 'field variables'")
> Signed-off-by: Pengpeng Hou <pengpeng@iscas.ac.cn>
> ---
> Changes since v5: https://lore.kernel.org/all/20260424070104.1-tracing-synth-v5-pengpeng@iscas.ac.cn/
> - start a new thread for the new patch revision
> - use a lore link for the previous version in the changelog
> - simplify the synthetic-name construction with seq_buf_printf()
> - keep saved_filter as a normal local variable and avoid an anonymous block
>
> kernel/trace/trace_events_hist.c | 41 ++++++++++++++++++++++----------
> 1 file changed, 29 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
> index 0dbbf6cca9bc..aa8e7f043ac0 100644
> --- a/kernel/trace/trace_events_hist.c
> +++ b/kernel/trace/trace_events_hist.c
> @@ -8,6 +8,7 @@
> #include <linux/module.h>
> #include <linux/kallsyms.h>
> #include <linux/security.h>
> +#include <linux/seq_buf.h>
> #include <linux/mutex.h>
> #include <linux/slab.h>
> #include <linux/stacktrace.h>
> @@ -2968,14 +2969,23 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
> char *system, char *event_name, char *field_name)
> {
> struct hist_field *event_var;
> + struct seq_buf s;
> char *synthetic_name;
Can you move this down a line, to maintain the reverse Christmas tree
declarations?
>
> synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
> if (!synthetic_name)
> return ERR_PTR(-ENOMEM);
>
> - strcpy(synthetic_name, "synthetic_");
> - strcat(synthetic_name, field_name);
> + seq_buf_init(&s, synthetic_name, MAX_FILTER_STR_VAL);
> + seq_buf_printf(&s, "synthetic_%s", field_name);
> +
> + /* Terminate synthetic_name with a NUL. */
> + seq_buf_str(&s);
> +
This doesn't hurt, but is it really needed? I think seq_buf_printf()
already null-terminates.
> + if (seq_buf_has_overflowed(&s)) {
> + kfree(synthetic_name);
> + return ERR_PTR(-E2BIG);
> + }
>
> event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);
>
> @@ -3020,6 +3030,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
> struct trace_event_file *file;
> struct hist_field *key_field;
> struct hist_field *event_var;
> + struct seq_buf s;
Same here.
> char *saved_filter;
> char *cmd;
> int ret;
> @@ -3065,28 +3076,34 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data,
> return ERR_PTR(-ENOMEM);
> }
>
> + seq_buf_init(&s, cmd, MAX_FILTER_STR_VAL);
> +
> /* Use the same keys as the compatible histogram */
> - strcat(cmd, "keys=");
> + seq_buf_puts(&s, "keys=");
>
> for_each_hist_key_field(i, hist_data) {
> key_field = hist_data->fields[i];
> if (!first)
> - strcat(cmd, ",");
> - strcat(cmd, key_field->field->name);
> + seq_buf_putc(&s, ',');
> + seq_buf_puts(&s, key_field->field->name);
> first = false;
> }
>
> /* Create the synthetic field variable specification */
> - strcat(cmd, ":synthetic_");
> - strcat(cmd, field_name);
> - strcat(cmd, "=");
> - strcat(cmd, field_name);
> + seq_buf_printf(&s, ":synthetic_%s=%s", field_name, field_name);
>
> /* Use the same filter as the compatible histogram */
> saved_filter = find_trigger_filter(hist_data, file);
> - if (saved_filter) {
> - strcat(cmd, " if ");
> - strcat(cmd, saved_filter);
> + if (saved_filter)
> + seq_buf_printf(&s, " if %s", saved_filter);
> +
> + /* Terminate cmd with a NUL. */
> + seq_buf_str(&s);
> +
And here.
> + if (seq_buf_has_overflowed(&s)) {
> + kfree(cmd);
> + kfree(var_hist);
> + return ERR_PTR(-E2BIG);
> }
>
> var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
Anyway,
Reviewed-by: Tom Zanussi <zanussi@kernel.org>
Thanks,
Tom
^ permalink raw reply
* [PATCH v2] trace_printk: replace ___STR with compound literal
From: Qian-Yu Lin @ 2026-05-01 16:28 UTC (permalink / raw)
To: rostedt, mhiramat
Cc: mathieu.desnoyers, david.laight.linux, linux-kernel,
linux-trace-kernel, Qian-Yu Lin
In-Reply-To: <20260429165707.7020-1-tiffany019230@gmail.com>
The macro trace_printk() uses a hardcoded identifier ___STR
within a statement expression, which can lead to variable name
shadowing if a caller happens to use the same name in its scope.
Replace the hardcoded identifier with a compound literal, which
eliminates the local variable entirely and removes any shadowing
risk without requiring an extra include or helper macro.
Since trace_printk() is never nested, __UNIQUE_ID() provides no
practical benefit here. The compound literal approach is simpler
and has no compile time overhead, unlike the v1 approach which
added #include <linux/compiler.h>.
Signed-off-by: Qian-Yu Lin <tiffany019230@gmail.com>
---
include/linux/trace_printk.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
index 2670ec7f4262..7a4f6711834f 100644
--- a/include/linux/trace_printk.h
+++ b/include/linux/trace_printk.h
@@ -86,8 +86,7 @@ do { \
#define trace_printk(fmt, ...) \
do { \
- char _______STR[] = __stringify((__VA_ARGS__)); \
- if (sizeof(_______STR) > 3) \
+ if (sizeof((char[]){__stringify((__VA_ARGS__))}) > 3) \
do_trace_printk(fmt, ##__VA_ARGS__); \
else \
trace_puts(fmt); \
--
2.43.0
^ permalink raw reply related
* Re: [PATCH] trace_printk: replace _______STR with __UNIQUE_ID(STR)
From: Steven Rostedt @ 2026-05-01 16:21 UTC (permalink / raw)
To: Qian-Yu Lin
Cc: mhiramat, david.laight.linux, linux-kernel, linux-trace-kernel
In-Reply-To: <afTSLgFwpYW1py36@nova>
On Sat, 2 May 2026 00:17:50 +0800
Qian-Yu Lin <tiffany019230@gmail.com> wrote:
> Yes, I verified it with the preprocessor output. I created a minimal
> test file:
>
> // kernel/trace/test_trace_printk.c
> #include <linux/trace_printk.h>
>
> void test(void) {
> trace_printk("no args\n");
> trace_printk("with arg %d\n", 42);
> }
>
> Then ran make kernel/trace/test_trace_printk.i.
>
> The no-args case has sizeof((char[]){"()"}) which is 3, so 3 > 3 is
> false and it falls through to trace_bputs/trace_puts.
>
> The args case has sizeof((char[]){"(42)"}) which is 5, so 5 > 3 is
> true and it goes to trace_bprintk/trace_printk with the argument
> 42 correctly passed through.
Thanks. When I test, I do the same but just compile down to an object file
and run objdump and look at which functions were called.
-- Steve
^ permalink raw reply
* Re: [PATCH] trace_printk: replace _______STR with __UNIQUE_ID(STR)
From: Qian-Yu Lin @ 2026-05-01 16:17 UTC (permalink / raw)
To: Steven Rostedt
Cc: mhiramat, david.laight.linux, linux-kernel, linux-trace-kernel
In-Reply-To: <20260501111939.0536140a@gandalf.local.home>
On Fri, May 01, 2026 at 11:19:39AM -0400, Steven Rostedt wrote:
> On Fri, 1 May 2026 22:40:17 +0800
> Qian-Yu Lin <tiffany019230@gmail.com> wrote:
>
> > I propose using a compound literal in v2, which eliminates the local
> > variable entirely and requires no extra include:
> >
> > #define trace_printk(fmt, ...) \
> > do { \
> > if (sizeof((char[]) \
> > {__stringify((__VA_ARGS__))}) > 3) \
> > do_trace_printk(fmt, ##__VA_ARGS__); \
> > else \
> > trace_puts(fmt); \
> > } while (0)
> >
> > This fully eliminates the shadowing risk without any compile overhead.
>
> Have you tested to make sure a string with no arguments still produces the
> trace_puts() and one that has arguments calls do_trace_printk()?
>
> I'm fine with that one if it still works.
>
> -- Steve
Yes, I verified it with the preprocessor output. I created a minimal
test file:
// kernel/trace/test_trace_printk.c
#include <linux/trace_printk.h>
void test(void) {
trace_printk("no args\n");
trace_printk("with arg %d\n", 42);
}
Then ran make kernel/trace/test_trace_printk.i.
The no-args case has sizeof((char[]){"()"}) which is 3, so 3 > 3 is
false and it falls through to trace_bputs/trace_puts.
The args case has sizeof((char[]){"(42)"}) which is 5, so 5 > 3 is
true and it goes to trace_bprintk/trace_printk with the argument
42 correctly passed through.
Qian-Yu
^ permalink raw reply
* Re: [PATCH] trace_printk: replace _______STR with __UNIQUE_ID(STR)
From: Steven Rostedt @ 2026-05-01 15:19 UTC (permalink / raw)
To: Qian-Yu Lin
Cc: mhiramat, david.laight.linux, linux-kernel, linux-trace-kernel
In-Reply-To: <afS7UUAfzgiSzEHv@nova>
On Fri, 1 May 2026 22:40:17 +0800
Qian-Yu Lin <tiffany019230@gmail.com> wrote:
> I propose using a compound literal in v2, which eliminates the local
> variable entirely and requires no extra include:
>
> #define trace_printk(fmt, ...) \
> do { \
> if (sizeof((char[]) \
> {__stringify((__VA_ARGS__))}) > 3) \
> do_trace_printk(fmt, ##__VA_ARGS__); \
> else \
> trace_puts(fmt); \
> } while (0)
>
> This fully eliminates the shadowing risk without any compile overhead.
Have you tested to make sure a string with no arguments still produces the
trace_puts() and one that has arguments calls do_trace_printk()?
I'm fine with that one if it still works.
-- Steve
^ permalink raw reply
* Re: [PATCH] trace_printk: replace _______STR with __UNIQUE_ID(STR)
From: Qian-Yu Lin @ 2026-05-01 14:51 UTC (permalink / raw)
To: David Laight; +Cc: rostedt, mhiramat, linux-kernel, linux-trace-kernel
In-Reply-To: <20260429224728.339a632c@pumpkin>
On Wed, Apr 29, 2026 at 10:47:28PM +0100, David Laight wrote:
> On Wed, 29 Apr 2026 13:42:26 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
>
> > On Thu, 30 Apr 2026 00:57:07 +0800
> > Qian-Yu Lin <tiffany019230@gmail.com> wrote:
> >
> > > The macro trace_printk() uses a hardcoded identifier _______STR
> > > within a statement expression, which can lead to variable name
> > > shadowing if a caller happens to use the same name in its scope.
> >
> > Has this ever been a problem?
> >
> > >
> > > Following the pattern in commit 24ba53017e18 ("rcu: Replace ________p1
> > > and _________p1 with __UNIQUE_ID(rcu)") and commit 589a9785ee3a
> > > ("min/max: remove sparse warnings when they're nested"), replace the
> > > hardcoded identifier with __UNIQUE_ID(STR).
>
> min and max do get nested - so it is important that the 'local'
> variables have different names - otherwise you can get invalid expansions.
>
> No one is going to have: trace_printk(fmt1, trace_printk(ftm2, ...), ...)
> it just doesn't make sense.
>
> There is a slight problem the ____________________________STR might
> be used by an entirely different #define.
> Just using _trace_printk_str[] would make this pretty unlikely.
> It would also have the advantage of making the .i file a bit easier
> to read, all the UNIQUE names in min/max output make it really hard
> to see what the output actually means.
>
Thank you for the suggestion. However, _trace_printk_str still carries
a theoretical shadowing risk if a caller happens to use the same name.
> > >
> > > Since __UNIQUE_ID() must be expanded once to remain consistent across
> > > declaration and sizeof() within the statement expression, introduce a
> > > nested helper macro ___trace_printk.
> >
> > Hmm, so we are replacing one name with underscores with another name
> > with underscores?
> >
> > >
> > > Signed-off-by: Qian-Yu Lin <tiffany019230@gmail.com>
> > > ---
> > > include/linux/trace_printk.h | 10 +++++++---
> > > 1 file changed, 7 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
> > > index 2670ec7f4262..060eccb40838 100644
> > > --- a/include/linux/trace_printk.h
> > > +++ b/include/linux/trace_printk.h
> > > @@ -2,6 +2,7 @@
> > > #ifndef _LINUX_TRACE_PRINTK_H
> > > #define _LINUX_TRACE_PRINTK_H
> > >
> > > +#include <linux/compiler.h>
> >
> > People are already saying that trace_printk.h slows down the compile.
> > Does this add any overhead to the compile?
>
> A little - nothing is free.
>
> David
>
Indeed. I measured compile time of kernel/trace/ring_buffer_benchmark.o
after make clean on an x86_64 machine running Ubuntu 24.04 LTS:
- Original _______STR: 49.8s
- v1 with __UNIQUE_ID (compiler.h): 53.5s
- compound literal (no extra include): 33.2s
A compound literal eliminates the local variable entirely, removing
the shadowing risk without any extra include or compile overhead:
#define trace_printk(fmt, ...) \
do { \
if (sizeof((char[]) \
{__stringify((__VA_ARGS__))}) > 3) \
do_trace_printk(fmt, ##__VA_ARGS__); \
else \
trace_puts(fmt); \
} while (0)
Qian-Yu
> >
> > -- Steve
> >
> >
> > > #include <linux/compiler_attributes.h>
> > > #include <linux/instruction_pointer.h>
> > > #include <linux/stddef.h>
> > > @@ -84,15 +85,18 @@ do { \
> > > * let gcc optimize the rest.
> > > */
> > >
> > > -#define trace_printk(fmt, ...) \
> > > +#define ___trace_printk(fmt, str, ...) \
> > > do { \
> > > - char _______STR[] = __stringify((__VA_ARGS__)); \
> > > - if (sizeof(_______STR) > 3) \
> > > + char str[] = __stringify((__VA_ARGS__)); \
> > > + if (sizeof(str) > 3) \
> > > do_trace_printk(fmt, ##__VA_ARGS__); \
> > > else \
> > > trace_puts(fmt); \
> > > } while (0)
> > >
> > > +#define trace_printk(fmt, ...) \
> > > + ___trace_printk(fmt, __UNIQUE_ID(str), ##__VA_ARGS__)
> > > +
> > > #define do_trace_printk(fmt, args...) \
> > > do { \
> > > static const char *trace_printk_fmt __used \
> >
> >
>
^ permalink raw reply
* Re: [PATCH] trace_printk: replace _______STR with __UNIQUE_ID(STR)
From: Qian-Yu Lin @ 2026-05-01 14:40 UTC (permalink / raw)
To: Steven Rostedt
Cc: mhiramat, david.laight.linux, linux-kernel, linux-trace-kernel
In-Reply-To: <20260429134226.49e95e9d@robin>
On Wed, Apr 29, 2026 at 01:42:26PM -0400, Steven Rostedt wrote:
> On Thu, 30 Apr 2026 00:57:07 +0800
> Qian-Yu Lin <tiffany019230@gmail.com> wrote:
>
> > The macro trace_printk() uses a hardcoded identifier _______STR
> > within a statement expression, which can lead to variable name
> > shadowing if a caller happens to use the same name in its scope.
>
> Has this ever been a problem?
>
No, this has not been a practical problem. However, the long-underscore
name ___STR is an indication of a potential shadowing risk.
> >
> > Following the pattern in commit 24ba53017e18 ("rcu: Replace ________p1
> > and _________p1 with __UNIQUE_ID(rcu)") and commit 589a9785ee3a
> > ("min/max: remove sparse warnings when they're nested"), replace the
> > hardcoded identifier with __UNIQUE_ID(STR).
> >
> > Since __UNIQUE_ID() must be expanded once to remain consistent across
> > declaration and sizeof() within the statement expression, introduce a
> > nested helper macro ___trace_printk.
>
> Hmm, so we are replacing one name with underscores with another name
> with underscores?
>
> >
> > Signed-off-by: Qian-Yu Lin <tiffany019230@gmail.com>
> > ---
> > include/linux/trace_printk.h | 10 +++++++---
> > 1 file changed, 7 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h
> > index 2670ec7f4262..060eccb40838 100644
> > --- a/include/linux/trace_printk.h
> > +++ b/include/linux/trace_printk.h
> > @@ -2,6 +2,7 @@
> > #ifndef _LINUX_TRACE_PRINTK_H
> > #define _LINUX_TRACE_PRINTK_H
> >
> > +#include <linux/compiler.h>
>
> People are already saying that trace_printk.h slows down the compile.
> Does this add any overhead to the compile?
>
> -- Steve
Yes. I measured compile time of kernel/trace/ring_buffer_benchmark.o
after make clean on an x86_64 machine running Ubuntu 24.04 LTS:
- Original _______STR: 49.8s
- v1 with __UNIQUE_ID (compiler.h): 53.5s
- compound literal (no extra include): 33.2s
I propose using a compound literal in v2, which eliminates the local
variable entirely and requires no extra include:
#define trace_printk(fmt, ...) \
do { \
if (sizeof((char[]) \
{__stringify((__VA_ARGS__))}) > 3) \
do_trace_printk(fmt, ##__VA_ARGS__); \
else \
trace_puts(fmt); \
} while (0)
This fully eliminates the shadowing risk without any compile overhead.
Qian-Yu
>
>
> > #include <linux/compiler_attributes.h>
> > #include <linux/instruction_pointer.h>
> > #include <linux/stddef.h>
> > @@ -84,15 +85,18 @@ do { \
> > * let gcc optimize the rest.
> > */
> >
> > -#define trace_printk(fmt, ...) \
> > +#define ___trace_printk(fmt, str, ...) \
> > do { \
> > - char _______STR[] = __stringify((__VA_ARGS__)); \
> > - if (sizeof(_______STR) > 3) \
> > + char str[] = __stringify((__VA_ARGS__)); \
> > + if (sizeof(str) > 3) \
> > do_trace_printk(fmt, ##__VA_ARGS__); \
> > else \
> > trace_puts(fmt); \
> > } while (0)
> >
> > +#define trace_printk(fmt, ...) \
> > + ___trace_printk(fmt, __UNIQUE_ID(str), ##__VA_ARGS__)
> > +
> > #define do_trace_printk(fmt, args...) \
> > do { \
> > static const char *trace_printk_fmt __used \
>
^ permalink raw reply
* Re: [PATCH v5 0/7] locking: contended_release tracepoint instrumentation
From: Dmitry Ilvokhin @ 2026-05-01 13:32 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
Broadcom internal kernel review list, Thomas Gleixner,
Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
Masami Hiramatsu, Mathieu Desnoyers
Cc: linux-kernel, linux-mips, virtualization, linux-arch, linux-mm,
linux-trace-kernel, kernel-team, Paul E. McKenney
In-Reply-To: <cover.1776350944.git.d@ilvokhin.com>
I plan to rebase this series on top of Linus' tree to pick up Vineeth's
patch [1], and send an updated version next week.
In the meantime, I would appreciate feedback, especially on:
- tracepoint semantics across different lock types
- overhead concerns in hot paths (e.g. qspinlock)
As a follow-up, I am also working on an RFC to extend perf lock
contention to make use of the contended_release tracepoint, so feedback
in that context would also be helpful.
Feedback from locking and tracing maintainers would be particularly
appreciated before respinning.
[1]: https://lore.kernel.org/all/20260323160052.17528-1-vineeth@bitbyteword.org/
^ permalink raw reply
* Re: [PATCH v5 2/2] blk-mq: expose tag starvation counts via debugfs
From: kernel test robot @ 2026-05-01 4:24 UTC (permalink / raw)
To: Aaron Tomlin, axboe, rostedt, mhiramat, mathieu.desnoyers
Cc: oe-kbuild-all, bvanassche, johannes.thumshirn, kch, dlemoal,
ritesh.list, loberman, neelx, sean, mproche, chjohnst,
linux-block, linux-kernel, linux-trace-kernel
In-Reply-To: <20260427020142.358912-3-atomlin@atomlin.com>
Hi Aaron,
kernel test robot noticed the following build errors:
[auto build test ERROR on axboe/for-next]
[also build test ERROR on trace/for-next riteshharjani/for-next linus/master v7.1-rc1 next-20260430]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Aaron-Tomlin/blk-mq-add-tracepoint-block_rq_tag_wait/20260428-013259
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/r/20260427020142.358912-3-atomlin%40atomlin.com
patch subject: [PATCH v5 2/2] blk-mq: expose tag starvation counts via debugfs
config: i386-allnoconfig-bpf (https://download.01.org/0day-ci/archive/20260501/202605010658.UrX2ImE9-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260501/202605010658.UrX2ImE9-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605010658.UrX2ImE9-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from ./include/trace/define_trace.h:132,
from ./include/trace/events/block.h:726,
from block/blk-core.c:45:
./include/trace/events/block.h: In function 'do_trace_event_raw_event_block_rq_tag_wait':
>> ./include/trace/events/block.h:255:71: error: expected ':' before ';' token
255 | __entry->dev = q->disk ? disk_devt(q->disk);
| ^
./include/trace/trace_events.h:427:11: note: in definition of macro '__DECLARE_EVENT_CLASS'
427 | { assign; } \
| ^~~~~~
./include/trace/trace_events.h:435:23: note: in expansion of macro 'PARAMS'
435 | PARAMS(assign), PARAMS(print)) \
| ^~~~~~
./include/trace/trace_events.h:40:9: note: in expansion of macro 'DECLARE_EVENT_CLASS'
40 | DECLARE_EVENT_CLASS(name, \
| ^~~~~~~~~~~~~~~~~~~
./include/trace/trace_events.h:44:30: note: in expansion of macro 'PARAMS'
44 | PARAMS(assign), \
| ^~~~~~
./include/trace/events/block.h:241:1: note: in expansion of macro 'TRACE_EVENT'
241 | TRACE_EVENT(block_rq_tag_wait,
| ^~~~~~~~~~~
./include/trace/events/block.h:254:9: note: in expansion of macro 'TP_fast_assign'
254 | TP_fast_assign(
| ^~~~~~~~~~~~~~
In file included from ./include/trace/define_trace.h:133:
./include/trace/events/block.h: In function 'do_perf_trace_block_rq_tag_wait':
>> ./include/trace/events/block.h:255:71: error: expected ':' before ';' token
255 | __entry->dev = q->disk ? disk_devt(q->disk);
| ^
./include/trace/perf.h:51:11: note: in definition of macro '__DECLARE_EVENT_CLASS'
51 | { assign; } \
| ^~~~~~
./include/trace/perf.h:67:23: note: in expansion of macro 'PARAMS'
67 | PARAMS(assign), PARAMS(print)) \
| ^~~~~~
./include/trace/trace_events.h:40:9: note: in expansion of macro 'DECLARE_EVENT_CLASS'
40 | DECLARE_EVENT_CLASS(name, \
| ^~~~~~~~~~~~~~~~~~~
./include/trace/trace_events.h:44:30: note: in expansion of macro 'PARAMS'
44 | PARAMS(assign), \
| ^~~~~~
./include/trace/events/block.h:241:1: note: in expansion of macro 'TRACE_EVENT'
241 | TRACE_EVENT(block_rq_tag_wait,
| ^~~~~~~~~~~
./include/trace/events/block.h:254:9: note: in expansion of macro 'TP_fast_assign'
254 | TP_fast_assign(
| ^~~~~~~~~~~~~~
vim +255 ./include/trace/events/block.h
242
243 TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx, bool is_sched_tag),
244
245 TP_ARGS(q, hctx, is_sched_tag),
246
247 TP_STRUCT__entry(
248 __field( dev_t, dev )
249 __field( u32, hctx_id )
250 __field( u32, nr_tags )
251 __field( bool, is_sched_tag )
252 ),
253
254 TP_fast_assign(
> 255 __entry->dev = q->disk ? disk_devt(q->disk);
256 __entry->hctx_id = hctx->queue_num;
257 __entry->is_sched_tag = is_sched_tag;
258
259 if (is_sched_tag)
260 __entry->nr_tags = hctx->sched_tags->nr_tags;
261 else
262 __entry->nr_tags = hctx->tags->nr_tags;
263 ),
264
265 TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
266 MAJOR(__entry->dev), MINOR(__entry->dev),
267 __entry->hctx_id,
268 __entry->is_sched_tag ? "scheduler" : "hardware",
269 __entry->nr_tags)
270 );
271
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* Re: [PATCH RFC v5 00/53] guest_memfd: In-place conversion support
From: Ackerley Tng @ 2026-04-30 23:51 UTC (permalink / raw)
To: Michael Roth
Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
ira.weiny, jmattson, jthoughton, oupton, pankaj.gupta, qperret,
rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
aneesh.kumar, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
Jonathan Corbet, Shuah Khan, Shuah Khan, Vishal Annapurve,
Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
Baoquan He, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
Youngjun Park, Qi Zheng, Shakeel Butt, Kiryl Shutsemau,
Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
linux-coco
In-Reply-To: <x7n77snnvvukofo3slopl74c5tmlb2t2un5qy5fq6eb3d2xt7e@6a6tixg5mdfc>
Michael Roth <michael.roth@amd.com> writes:
>
> [...snip...]
>
> I made a super-long-winded reply to that thread, but to summarize:
>
> PRESERVE flag has different enumeration/behavior/enforcement for pre-launch
> vs. post-launch, and similar considerations might come into play for
> other flags, so to make it easier to enumerate what flags are available
> for pre-launch/post-launch, maybe we could have 2 capabilities instead
> of 1:
>
> KVM_CAP_MEMORY_ATTRIBUTES2_PRE_LAUNCH_FLAGS
> KVM_CAP_MEMORY_ATTRIBUTES2_FLAGS
>
> where SNP/TDX would only advertise PRESERVE for PRE_LAUNCH, and pKVM I
> guess would enumerate it for both (or maybe just POST_LAUNCH?)
>
> That lets us keep the flags definitions more straightforward but still
> allows userspace to easily enumerate what exactly should be available at
> pre vs. post launch time, and give us some flexibility to detail
> variations in behavior between the 2 phases without documenting
> edge-cases in terms of VM types.
>
Oops Michael I only read this after the meeting today.
Sean, today at guest_memfd biweekly we also discussed this topic. I
brought up this topic because IMO the interface is starting to get
a little awkward, I'm struggling to put the awkwardness into words.
Here are some awkward points:
For PRESERVE, even though it is defined (now) as that what the host
writes will be readable in the guest, it only works for both to-private
and to-shared conversions for KVM_X86_SW_PROTECTED_VMs and pKVM. That's
because guest_memfd doesn't actually invoke encryption during the
conversion. For TDX and SNP, the encryption can only be done before the
VM is finalized, through vendor-specific ioctls that go through
kvm_gmem_populate() to load memory into the guest.
For ZERO, it is defined in api.rst that ZERO is not supported for
to-private conversions, and the rationale there was that when ZEROing,
guest_memfd/KVM can zero, but it's really the contract between the guest
and the vendor trusted firmware whether the guest sees zeros later.
Another awkward point is that ZERO was meant to enable an optimization
for TDX since the firmware zeroes memory, but it actually only zeroes
memory when the page is unmapped from Secure EPTs. guest_memfd (for now)
doesn't track whether the page was unmapped from Secure EPTs as part of
the conversion, so guest_memfd can't assume it was mapped before the
conversion request. To uphold the ZERO contract with userspace,
guest_memfd applies zeroing for TDX anyway.
Summarizing from guest_memfd biweekly today:
David suggested enumerating the combinations, something like
`SHARED_ZERO` and friends (since to-private and ZERO is not supported)
and Michael then brought up the other axis of pre/post launch. IIRC
there might be another axis since pKVM would need to determine
dynamically if a to_shared conversion can be permitted for the range
being converted, based on whether the guest had requested a to_shared
conversion.
I think this might just result in too many flags, and could paint us
into a corner if more options get supported later.
I spent even more time thinking about this today. I get that we want a
consistent contract to userspace, can we scope the contract differently?
What if we scope as "what KVM guarantees the content will look like
after guest_memfd updates attributes"? This is a smaller contract, since
it doesn't promise anything about what the guest sees. Running this
through a few examples:
+ Pre-finalize, SNP, to-private, PRESERVE: guest_memfd guarantees that
after setting memory attributes, the contents of the pages will not
change. The contents are then ready for populate. What populate does
to the memory is another contract between SNP and the guest that is
out of scope of guest_memfd's contract.
+ Post-finalize, SNP, to-private, PRESERVE: guest_memfd guarantees that
after setting memory attributes, the contents of the pages will not
change. SNP's contract with the guest does not, though. After the page
gets faulted in, the guest sees scrambled data. This may be a
meaningless operation now, but it leaves the door open so perhaps we
could have an SNP-specific ioctl in future where step 1 is to set
memory attributes within guest_memfd to private and step 2 is to
encrypt in place.
+ pKVM, to-private, PRESERVE: guest_memfd guarantees that after setting
memory attributes, the contents of the pages don't change. Separately,
pKVM doesn't do encryption, so the pKVM guest reads the same contents
the host wrote. The distinction here from the current state is that
guest_memfd didn't guarantee that the pKVM guest will see the same
content the host wrote since that's a separate contract between the
pKVM guest and pKVM.
+ Post-finalize, TDX, to-shared, ZERO: guest_memfd guarantees that
contents of the pages will be zeroed in the process of updating
guest_memfd attributes. Host userspace reads zeros after faulting it
in, which is because guest_memfd did zero the pages after conversion
to shared. A future optimization is possible, where guest_memfd only
zeroes the pages that were unmapped from Secure EPTs, since (this
version of) TDX zeros memory when unmapping from Secure EPTs.
+ Post-finalize, TDX, to-shared, PRESERVE: -EOPNOTSUPP. guest_memfd is
unable to guarantee that the process of setting memory attributes will
not change memory contents. The process of setting memory attributes
requires unmapping from Secure EPTs, which will zero the memory. (In
future, if we want to relax this, we could permit this if nothing in
the requested range was mapped in Secure EPTs)
+ Post-finalize, SNP, to-shared, PRESERVE: guest_memfd guarantees that
after setting memory attributes, the contents of the pages will not
change. For SNP, unmapping doesn't change memory contents? The guest
reads garbage, and that's a separate contract between SNP and the
guest. In the guest_memfd contract, guest_memfd PRESERVEs the memory
contents in the process of setting memory attributes, and can fulfil
that.
+ Post-finalize, TDX, to-private, ZERO: guest_memfd zeroes the shared
memory before updating the attributes to be private, because it
promised to. If this memory gets faulted in to Secure EPTs, TDX
firmware zeros it again, because that's TDX's contract with the
guest. I can't see any benefit to userspace in using this combination,
but the guest_memfd contract and implementation are simple.
TLDR:
+ PRESERVE == guarantee that the process of setting memory attributes
doesn't change memory contents.
+ implementation == do nothing in most cases, except -EOPNOTSUPP for
to-shared on TDX, since unmapping is a required part of setting
memory attributes to private, and a TDX side effect of unmapping
is zeroing memory,
+ ZERO == guarantee that the process of setting memory attributes zeroes
memory contents.
+ implementation == memset(zero) in most cases. For TDX, a future
optimization exists, where memset() can be skipped for pages that
were mapped in Secure EPTs before conversion
+ UNSPECIFIED == no guarantees
+ implementation == guest_memfd does nothing explicitly about memory
contents. The implementation is pretty much the same as PRESERVE
except guest_memfd won't take into account vendor-specific side
effects of the process of conversion. Except for the test vehicle
KVM_X86_SW_PROTECTED_VMS, where memory is scrambled.
>>
>> [...snip...]
>>
>
> Looking at the example you have there:
>
> + Note: These content modes apply to the entire requested range, not
> + just the parts of the range that underwent conversion. For example, if
> + this was the initial state:
> +
> + * [0x0000, 0x1000): shared
> + * [0x1000, 0x2000): private
> + * [0x2000, 0x3000): shared
> + and range [0x0000, 0x3000) was set to shared, the content mode would
> + apply to all memory in [0x0000, 0x3000), not just the range that
> + underwent conversion [0x1000, 0x2000).
>
> Userspace would be aware of whether the range contains pages that were
> already set to private, so if it really wants to set the just the
> [0x1000, 0x2000) range to shared with appropriate content mode, it is
> fully able to do so by just issuing the ioctl for that specific range.
> If it attempts to issue it for the entire range, it only seems like it
> would defy normal expectations and cause confusion to skip ranges, and
> I'm not sure it gains us anything useful in exchange for that potential
> confusion.
>
Great that we're aligned here :) No complaints from guest_memfd biweekly
today as well :)
>>
>> [...snip...]
>>
^ permalink raw reply
* Re: [PATCH v2] tracepoint: add lockdep rcu_is_watching() check to trace_##name##_enabled()
From: Steven Rostedt @ 2026-04-30 20:01 UTC (permalink / raw)
To: David Carlier
Cc: linux-trace-kernel, linux-kernel, mhiramat, mathieu.desnoyers,
peterz, vineeth
In-Reply-To: <20260430144159.10985-1-devnexen@gmail.com>
On Thu, 30 Apr 2026 15:41:59 +0100
David Carlier <devnexen@gmail.com> wrote:
> trace_call__##name() was added by commit 677a3d82b640 ("tracepoint: Add
> trace_call__##name() API") for callers that already gate on
> trace_##name##_enabled(), letting them skip the redundant
> static_branch_unlikely() re-evaluation. The LOCKDEP rcu_is_watching()
> WARN_ONCE() that trace_##name() carries on the disabled path was not
> mirrored, so the cold-path coverage added by commit c2679254b9c9
> ("tracing: Make tracepoint lockdep check actually test something") is
> lost for trace_call__##name() callers when the tracepoint is disabled.
>
> When the tracepoint is enabled, the rcu_dereference inside
> __DO_TRACE_CALL() already trips under PROVE_RCU, so the warning is
> only needed on the !enabled path. The natural place is the gate that
> trace_call__##name() callers consult: trace_##name##_enabled().
>
> Fixes: 677a3d82b640 ("tracepoint: Add trace_call__##name() API")
Thanks, but I'm 86'ing the Fixes tag. The above commit has absolutely
nothing to do with this patch. This issue was there from the beginning of
having the trace_*_enabled() macro. This isn't a fix.
I'll modify the change log to be more accurate as well.
-- Steve
> Cc: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: David Carlier <devnexen@gmail.com>
> ---
> include/linux/tracepoint.h | 4 ++++
> 1 file changed, 4 insertions(+)
>
> diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
> index 578e520b6ee6..3b3ab19c2627 100644
> --- a/include/linux/tracepoint.h
> +++ b/include/linux/tracepoint.h
> @@ -293,6 +293,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
> static inline bool \
> trace_##name##_enabled(void) \
> { \
> + if (IS_ENABLED(CONFIG_LOCKDEP)) { \
> + WARN_ONCE(!rcu_is_watching(), \
> + "RCU not watching for tracepoint"); \
> + } \
> return static_branch_unlikely(&__tracepoint_##name.key);\
> }
>
^ permalink raw reply
* Re: [PATCH] mm/madvise: preserve uprobe breakpoints across MADV_DONTNEED
From: Daniel Walker (danielwa) @ 2026-04-30 19:25 UTC (permalink / raw)
To: Jann Horn
Cc: Oleg Nesterov, David Hildenbrand (Arm),
Darko Tominac -X (dtominac - GLOBALLOGIC INC at Cisco),
Masami Hiramatsu, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
James Clark, Andrew Morton, Liam R. Howlett, Lorenzo Stoakes,
Vlastimil Babka, xe-linux-external(mailer list),
linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
linux-perf-users@vger.kernel.org, linux-mm@kvack.org
In-Reply-To: <CAG48ez0mwkHz=4+PoLv_VmYrQiPqkSQqCSo=3R-ztrQ2zAHUwQ@mail.gmail.com>
On Thu, Apr 30, 2026 at 05:22:18PM +0200, Jann Horn wrote:
> On Wed, Apr 29, 2026 at 11:11 PM Daniel Walker (danielwa)
> <danielwa@cisco.com> wrote:
> > On Wed, Apr 29, 2026 at 05:24:25PM +0200, Oleg Nesterov wrote:
> > > On 04/29, David Hildenbrand (Arm) wrote:
> > > >
> > > > On 4/29/26 15:15, Darko Tominac wrote:
> > > >
> > > > > uprobe infrastructure does not re-instrument pages on individual page
> > > > > faults (uprobe_mmap() is only called during VMA creation, not on
> > > > > page-in), the breakpoints are silently lost once the discarded pages are
> > > > > re-read from the backing file. The probes stop firing with no error
> > > > > indication, and the only recovery is to unregister and re-register the
> > > > > affected uprobes.
> > > >
> > > > Right. Don't MADV_DONTNEED uprobes, just like you are not supposed to
> > > > MADV_DONTNEED debugger breakpoints/set data etc. :)
> > >
> > > Agreed, thanks.
> >
> >
> > Shouldn't there be some sort of compensation or notification for this, or is each person that
> > hits this suppose to just scratch their head and send a patch that's rejected?
>
> I guess we could add a pr_warn_once() that warns when
> madvise(MADV_DONTNEED) is called on a read+execute file mapping,
> and/or (as David said) add an explicit note in the madvise() manpage
> about how that can interfere with software breakpoints and uprobes?
It does feel like it's the debuggers problem. The application doesn't know it's
getting debugged. So the application does whatever it does. If GDB is debugging
an application it should assume there's a problematic madvise() call which will
hurt/stop the debugging from happening. It should endeavor to prevent that
from happening. There are options in userspace to prevent it from happening. I'm
sure madvise() is not the only thing GDB has to worry about w.r.t. screwing up
the debugging.
Noting it in the man page seems reasonable.
Daniel
^ permalink raw reply
* Re: [PATCH] mm/madvise: preserve uprobe breakpoints across MADV_DONTNEED
From: Jann Horn @ 2026-04-30 19:11 UTC (permalink / raw)
To: Oleg Nesterov
Cc: David Hildenbrand (Arm), Daniel Walker (danielwa),
Darko Tominac -X (dtominac - GLOBALLOGIC INC at Cisco),
Masami Hiramatsu, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
James Clark, Andrew Morton, Liam R. Howlett, Lorenzo Stoakes,
Vlastimil Babka, xe-linux-external(mailer list),
linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
linux-perf-users@vger.kernel.org, linux-mm@kvack.org
In-Reply-To: <afOjk9igx9nlc9cg@redhat.com>
On Thu, Apr 30, 2026 at 8:47 PM Oleg Nesterov <oleg@redhat.com> wrote:
> _In that case_. Instead of this patch (which, rightly or not, I personally dislike
> in any case), can't MADV_DONTNEED paths check MMF_HAS_UPROBES or vma_has_uprobes()
> to decide whether .even_cows should be true?
The manpage currently essentially guarantees that MADV_DONTNEED
discards CoW pages. I think it would be a bad idea to change that
documented behavior without a strong reason, especially if that means
we'd have to add more special cases to the UAPI contract.
^ permalink raw reply
* Re: [PATCH] mm/madvise: preserve uprobe breakpoints across MADV_DONTNEED
From: Oleg Nesterov @ 2026-04-30 18:46 UTC (permalink / raw)
To: David Hildenbrand (Arm)
Cc: Daniel Walker (danielwa),
Darko Tominac -X (dtominac - GLOBALLOGIC INC at Cisco),
Masami Hiramatsu, Peter Zijlstra, Ingo Molnar,
Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
James Clark, Andrew Morton, Liam R. Howlett, Lorenzo Stoakes,
Vlastimil Babka, Jann Horn, xe-linux-external(mailer list),
linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org,
linux-perf-users@vger.kernel.org, linux-mm@kvack.org
In-Reply-To: <2d815203-a81e-40b4-92ba-31832c84c20b@kernel.org>
David, et al, I am sorry for the unnecessary spam but I can't resist...
On 04/30, David Hildenbrand (Arm) wrote:
>
> On 4/30/26 11:16, Oleg Nesterov wrote:
> >
> > But may be MADV_DONTNEED should not set zap_details.even_cows == true
> > by default ?
>
> It should. If you do mmap(MAP_PRIVATE, fd), and modify some pages, MADV_DONTNEED
> must zap these pages.
[... snip the correct explanation ...]
Ah. Sorry for the confusion, I should not have said "by default". Of course
you are right, and I understand this. At least I hope ;)
Let me try to explain what I tried to say. First of all, I still agree with
your "It shouldn't do that on a MAP_PRIVATE file-backed VMA". I also agree
with Jann who suggests to make the manpage more clear about uprobes/bps.
But, lets suppose that this "problem" is raised again, and we come to conclusion
that it should be fixed/mitigated. ("we" actually meaning you and other mm experts ;)
_In that case_. Instead of this patch (which, rightly or not, I personally dislike
in any case), can't MADV_DONTNEED paths check MMF_HAS_UPROBES or vma_has_uprobes()
to decide whether .even_cows should be true?
At least when madvise_behavior.mm != current->mm ?
Oleg.
^ permalink raw reply
* Re: [RFC][PATCH] unwind: Add stacktrace_setup system call
From: Steven Rostedt @ 2026-04-30 17:38 UTC (permalink / raw)
To: Indu Bhagat
Cc: LKML, Linux Trace Kernel, Masami Hiramatsu, Mathieu Desnoyers,
Jens Remus, Josh Poimboeuf, Peter Zijlstra, Ingo Molnar,
Jiri Olsa, Arnaldo Carvalho de Melo, Namhyung Kim,
Thomas Gleixner, Andrii Nakryiko, Jose E. Marchesi, Beau Belgrave,
Linus Torvalds, Andrew Morton, Florian Weimer, Kees Cook,
Carlos O'Donell, Sam James, Dylan Hatch, Borislav Petkov,
Dave Hansen, David Hildenbrand, H. Peter Anvin, Liam R. Howlett,
Lorenzo Stoakes, Michal Hocko, Mike Rapoport, Suren Baghdasaryan,
Vlastimil Babka, Heiko Carstens, Vasily Gorbik
In-Reply-To: <20260429114355.6c712e6a@gandalf.local.home>
I forgot to update Indu's email.
Indu, here's the patch for adding a system call to allow the dynamic loader
to add sframes for the dynamic libraries it adds.
The lore link to see the original email and thread:
https://lore.kernel.org/all/20260429114355.6c712e6a@gandalf.local.home/
-- Steve
On Wed, 29 Apr 2026 11:43:55 -0400
Steven Rostedt <rostedt@kernel.org> wrote:
> From: Steven Rostedt <rostedt@goodmis.org>
>
> [
> This is an RFC that adds a system call for dynamic linkers to use to
> tell the kernel where the sframe sections are when it loads dynamic
> libraries.
>
> It is built on top of Jens's sframe implementation for v3:
>
> https://lore.kernel.org/linux-trace-kernel/20260127150554.2760964-1-jremus@linux.ibm.com/
>
> I have a repo with that code that this applies on top of here:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git sframe/core
>
>
> The name of the system call is "stacktrace_setup", but I'm not attached
> to this name. If anyone can think of a better name I'm happy to take
> suggestions.
>
> This patch is just to get the conversation going and the final result
> may be much different. I tested this with the attached program which is a
> major hack. I built glibc with sframe v3 support and I used readelf to
> find the sframe size and location of glibc.
>
> readelf -e /work/usr/lib/libc.so.6 | grep sframe
> [19] .sframe GNU_SFRAME 00000000001d3fc0 001d3fc0
>
> Then I wrote a program that takes the above location and size of the
> .sframe section in libc as parameters, scans /proc/self/maps to find
> where it loaded libc and then calls this new system call with a pointer
> to the location of the sframe along with its size, as well as where the
> libc text is located.
>
> It then spins for 2 seconds, calls the system call again to remove the
> sframe section it loaded, and spins for another 2 seconds.
>
> I ran perf record --call-graph fp,defer on the program and looked for
> the do_spin() function.
>
> With sframe loaded:
>
> sframe-test 1350 1396.333593: 202366 cpu/cycles/P:
> 7fdf0ec38a44 [unknown] ([vdso])
> 5621a6b97243 get_time+0x19 (/work/c/sframe-test)
> 5621a6b9727f do_spin+0x1f (/work/c/sframe-test)
> 5621a6b975cd main+0xd4 (/work/c/sframe-test)
> 7fdf0ea26bda __libc_start_call_main+0x6a (/work/usr/lib/libc.so.6)
> 7fdf0ea26d05 __libc_start_main@@GLIBC_2.34+0x85 (/work/usr/lib/libc.so.6)
> 5621a6b97131 _start+0x21 (/work/c/sframe-test)
>
> After it unloads the sframe:
>
> sframe-test 1350 1400.332902: 657582 cpu/cycles/P:
> 7fdf0ec38a5e [unknown] ([vdso])
> 5621a6b97243 get_time+0x19 (/work/c/sframe-test)
> 5621a6b9727f do_spin+0x1f (/work/c/sframe-test)
> 5621a6b97602 main+0x109 (/work/c/sframe-test)
> 7fdf0ea26bda __libc_start_call_main+0x6a (/work/usr/lib/libc.so.6)
>
> As you can see, with the sframe loaded, it was able to walk further up
> the libc library.
>
> Again, this is just an RFC, but I would like to get agreement on the
> system call so that we can then update the dynamic linker to do this
> instead of using my hack ;-)
> ]
>
> Add a system call that can be used by dynamic linkers to tell the kernel
> where the sframe section is in memory for libraries it loads.
>
> The system call stacktrace_setup takes 5 parameters:
>
> op - the type of operation to perform
> addr_start - The virtual address of the sframe section
> addr_length - The length of the sframe section
> text_start - the text section the sframe represents
> test_length - the length of the section
>
> The current op values are:
>
> STACKTRACE_REGISTER_SFRAME - This registers the sframe
> STACKTRACE_UNREGISTER_SFRAME - This removes the sframe
>
> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
> ---
> arch/alpha/kernel/syscalls/syscall.tbl | 1 +
> arch/arm/tools/syscall.tbl | 1 +
> arch/arm64/tools/syscall_32.tbl | 1 +
> arch/m68k/kernel/syscalls/syscall.tbl | 1 +
> arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_n32.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_n64.tbl | 1 +
> arch/mips/kernel/syscalls/syscall_o32.tbl | 1 +
> arch/parisc/kernel/syscalls/syscall.tbl | 1 +
> arch/powerpc/kernel/syscalls/syscall.tbl | 1 +
> arch/s390/kernel/syscalls/syscall.tbl | 1 +
> arch/sh/kernel/syscalls/syscall.tbl | 1 +
> arch/sparc/kernel/syscalls/syscall.tbl | 1 +
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> arch/xtensa/kernel/syscalls/syscall.tbl | 1 +
> include/linux/syscalls.h | 1 +
> include/uapi/asm-generic/unistd.h | 5 ++-
> include/uapi/linux/stacktrace.h | 10 ++++++
> kernel/sys_ni.c | 2 ++
> kernel/unwind/sframe.c | 37 +++++++++++++++++++++
> scripts/syscall.tbl | 1 +
> 22 files changed, 71 insertions(+), 1 deletion(-)
> create mode 100644 include/uapi/linux/stacktrace.h
>
> diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
> index f31b7afffc34..8c320029a156 100644
> --- a/arch/alpha/kernel/syscalls/syscall.tbl
> +++ b/arch/alpha/kernel/syscalls/syscall.tbl
> @@ -511,3 +511,4 @@
> 579 common file_setattr sys_file_setattr
> 580 common listns sys_listns
> 581 common rseq_slice_yield sys_rseq_slice_yield
> +582 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
> index 94351e22bfcf..60f9a33b2dc5 100644
> --- a/arch/arm/tools/syscall.tbl
> +++ b/arch/arm/tools/syscall.tbl
> @@ -486,3 +486,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
> index 62d93d88e0fe..a0bd04a23006 100644
> --- a/arch/arm64/tools/syscall_32.tbl
> +++ b/arch/arm64/tools/syscall_32.tbl
> @@ -483,3 +483,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
> index 248934257101..266ec877300a 100644
> --- a/arch/m68k/kernel/syscalls/syscall.tbl
> +++ b/arch/m68k/kernel/syscalls/syscall.tbl
> @@ -471,3 +471,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
> index 223d26303627..916294849393 100644
> --- a/arch/microblaze/kernel/syscalls/syscall.tbl
> +++ b/arch/microblaze/kernel/syscalls/syscall.tbl
> @@ -477,3 +477,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
> index 7430714e2b8f..20fec148901e 100644
> --- a/arch/mips/kernel/syscalls/syscall_n32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
> @@ -410,3 +410,4 @@
> 469 n32 file_setattr sys_file_setattr
> 470 n32 listns sys_listns
> 471 n32 rseq_slice_yield sys_rseq_slice_yield
> +472 n32 stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
> index 630aab9e5425..2743bbcab143 100644
> --- a/arch/mips/kernel/syscalls/syscall_n64.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
> @@ -386,3 +386,4 @@
> 469 n64 file_setattr sys_file_setattr
> 470 n64 listns sys_listns
> 471 n64 rseq_slice_yield sys_rseq_slice_yield
> +472 n64 stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
> index 128653112284..187eadc4a42e 100644
> --- a/arch/mips/kernel/syscalls/syscall_o32.tbl
> +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
> @@ -459,3 +459,4 @@
> 469 o32 file_setattr sys_file_setattr
> 470 o32 listns sys_listns
> 471 o32 rseq_slice_yield sys_rseq_slice_yield
> +472 o32 stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
> index c6331dad9461..9442a92ef0aa 100644
> --- a/arch/parisc/kernel/syscalls/syscall.tbl
> +++ b/arch/parisc/kernel/syscalls/syscall.tbl
> @@ -470,3 +470,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
> index 4fcc7c58a105..005441233932 100644
> --- a/arch/powerpc/kernel/syscalls/syscall.tbl
> +++ b/arch/powerpc/kernel/syscalls/syscall.tbl
> @@ -562,3 +562,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 nospu rseq_slice_yield sys_rseq_slice_yield
> +472 nospu stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
> index 09a7ef04d979..bc9894b25584 100644
> --- a/arch/s390/kernel/syscalls/syscall.tbl
> +++ b/arch/s390/kernel/syscalls/syscall.tbl
> @@ -398,3 +398,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
> index 70b315cbe710..5766251b4d2d 100644
> --- a/arch/sh/kernel/syscalls/syscall.tbl
> +++ b/arch/sh/kernel/syscalls/syscall.tbl
> @@ -475,3 +475,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
> index 7e71bf7fcd14..20e7f3b856e4 100644
> --- a/arch/sparc/kernel/syscalls/syscall.tbl
> +++ b/arch/sparc/kernel/syscalls/syscall.tbl
> @@ -517,3 +517,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index f832ebd2d79b..652ede93b724 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -477,3 +477,4 @@
> 469 i386 file_setattr sys_file_setattr
> 470 i386 listns sys_listns
> 471 i386 rseq_slice_yield sys_rseq_slice_yield
> +472 i386 stacktrace_setup sys_stacktrace_setup
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 524155d655da..5da918e912a6 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -396,6 +396,7 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
>
> #
> # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
> index a9bca4e484de..34f0de06baee 100644
> --- a/arch/xtensa/kernel/syscalls/syscall.tbl
> +++ b/arch/xtensa/kernel/syscalls/syscall.tbl
> @@ -442,3 +442,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index f5639d5ac331..fdbea39c1b38 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -999,6 +999,7 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *
> asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx,
> u32 size, u32 flags);
> asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags);
> +asmlinkage long sys_stacktrace_setup(void);
>
> /*
> * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index a627acc8fb5f..d3f57d8454d7 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -863,8 +863,11 @@ __SYSCALL(__NR_listns, sys_listns)
> #define __NR_rseq_slice_yield 471
> __SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
>
> +#define __NR_stacktrace_setup 472
> +__SYSCALL(__NR_stacktrace_setup, sys_stacktrace_setup)
> +
> #undef __NR_syscalls
> -#define __NR_syscalls 472
> +#define __NR_syscalls 473
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/stacktrace.h b/include/uapi/linux/stacktrace.h
> new file mode 100644
> index 000000000000..60b581f55995
> --- /dev/null
> +++ b/include/uapi/linux/stacktrace.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_STACKTRACE_H
> +#define _UAPI_LINUX_STACKTRACE_H
> +
> +enum stacktrace_setup_types {
> + STACKTRACE_REGISTER_SFRAME = 1,
> + STACKTRACE_UNREGISTER_SFRAME = 2,
> +};
> +
> +#endif /* _UAPI_LINUX_STACKTRACE_H */
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index add3032da16f..76998b0f811a 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -394,3 +394,5 @@ COND_SYSCALL(rseq_slice_yield);
>
> COND_SYSCALL(uretprobe);
> COND_SYSCALL(uprobe);
> +
> +COND_SYSCALL(stacktrace_setup);
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> index f24997e84e05..a842038fb03b 100644
> --- a/kernel/unwind/sframe.c
> +++ b/kernel/unwind/sframe.c
> @@ -12,8 +12,10 @@
> #include <linux/mm.h>
> #include <linux/string_helpers.h>
> #include <linux/sframe.h>
> +#include <linux/syscalls.h>
> #include <asm/unwind_user_sframe.h>
> #include <linux/unwind_user_types.h>
> +#include <uapi/linux/stacktrace.h>
>
> #include "sframe.h"
> #include "sframe_debug.h"
> @@ -838,3 +840,38 @@ void sframe_free_mm(struct mm_struct *mm)
>
> mtree_destroy(&mm->sframe_mt);
> }
> +
> +/**
> + * sys_stacktrace_setup - register an address for user space stacktrace walking.
> + * @op: Type of operation to perform
> + * @addr_start: The virtual address of the stacktrace information
> + * @addr_length: The length of the stacktrace information
> + * @text_start: The virtual address of the text that @addr_start represents
> + * @text_length: The length of teh text
> + *
> + * This system call is used by dynamic library utilities to inform the kernel
> + * of meta data that it loaded that can be used by the kernel to know how
> + * to stack walk the given text locations.
> + *
> + * Currently only sframes are supported, but in the future, this may be used
> + * to tell the kernel about JIT code which will most likely have a different
> + * format.
> + *
> + * The type command may be extended and parameters may be used for other
> + * purposes.
> + *
> + * Return: 0 if successful, otherwise a negative error.
> + */
> +SYSCALL_DEFINE5(stacktrace_setup, int, op, unsigned long, addr_start,
> + unsigned long, addr_length, unsigned long, text_start,
> + unsigned long, text_length)
> +{
> + switch (op) {
> + case STACKTRACE_REGISTER_SFRAME:
> + return sframe_add_section(addr_start, addr_start + addr_length,
> + text_start, text_start+text_length);
> + case STACKTRACE_UNREGISTER_SFRAME:
> + return sframe_remove_section(addr_start);
> + }
> + return -EINVAL;
> +}
> diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
> index 7a42b32b6577..54a99cffeec4 100644
> --- a/scripts/syscall.tbl
> +++ b/scripts/syscall.tbl
> @@ -412,3 +412,4 @@
> 469 common file_setattr sys_file_setattr
> 470 common listns sys_listns
> 471 common rseq_slice_yield sys_rseq_slice_yield
> +472 common stacktrace_setup sys_stacktrace_setup
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox