Linux Trace Kernel
 help / color / mirror / Atom feed
* Re: [PATCH v6 6/7] locking: Factor out __queued_read_unlock()/__queued_write_unlock()
From: Steven Rostedt @ 2026-05-13 15:41 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel, linux-mips, virtualization,
	linux-arch, linux-mm, linux-trace-kernel, kernel-team,
	Paul E. McKenney
In-Reply-To: <8e88613c73f0603c4440ba3a62eb604a5dddc57b.1777999826.git.d@ilvokhin.com>

On Tue,  5 May 2026 17:09:35 +0000
Dmitry Ilvokhin <d@ilvokhin.com> wrote:

> This is a preparatory refactoring for the next commit, which adds

Same thing about using "next commit" in change logs.

-- Steve

> contended_release tracepoint instrumentation and needs to call the
> unlock from both traced and non-traced paths.
> 
> No functional change.
> 
> Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
> Acked-by: Paul E. McKenney <paulmck@kernel.org>
> ---
>  include/asm-generic/qrwlock.h | 20 +++++++++++++++-----
>  1 file changed, 15 insertions(+), 5 deletions(-)

^ permalink raw reply

* Re: [PATCH v6 7/7] locking: Add contended_release tracepoint to qrwlock
From: Steven Rostedt @ 2026-05-13 15:43 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Peter Zijlstra, Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Masami Hiramatsu,
	Mathieu Desnoyers, linux-kernel, linux-mips, virtualization,
	linux-arch, linux-mm, linux-trace-kernel, kernel-team
In-Reply-To: <b67fda8e847fff72da05eff7f799019f8d17ce21.1777999826.git.d@ilvokhin.com>

On Tue,  5 May 2026 17:09:36 +0000
Dmitry Ilvokhin <d@ilvokhin.com> wrote:

> Extend the contended_release tracepoint to queued rwlocks, using the
> same out-of-line traced unlock approach as queued spinlocks.
> 
> Signed-off-by: Dmitry Ilvokhin <d@ilvokhin.com>
> ---
>  include/asm-generic/qrwlock.h | 22 ++++++++++++++++++++++
>  kernel/locking/qrwlock.c      | 16 ++++++++++++++++
>  2 files changed, 38 insertions(+)
> 
> diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
> index 4b627bafba8b..274c19006125 100644
> --- a/include/asm-generic/qrwlock.h
> +++ b/include/asm-generic/qrwlock.h
> @@ -14,6 +14,7 @@
>  #define __ASM_GENERIC_QRWLOCK_H
>  
>  #include <linux/atomic.h>
> +#include <linux/tracepoint-defs.h>
>  #include <asm/barrier.h>
>  #include <asm/processor.h>
>  
> @@ -35,6 +36,10 @@
>   */
>  extern void queued_read_lock_slowpath(struct qrwlock *lock);
>  extern void queued_write_lock_slowpath(struct qrwlock *lock);
> +extern void queued_read_unlock_traced(struct qrwlock *lock);
> +extern void queued_write_unlock_traced(struct qrwlock *lock);
> +
> +DECLARE_TRACEPOINT(contended_release);
>  
>  /**
>   * queued_read_trylock - try to acquire read lock of a queued rwlock
> @@ -115,6 +120,17 @@ static __always_inline void __queued_read_unlock(struct qrwlock *lock)
>   */
>  static inline void queued_read_unlock(struct qrwlock *lock)
>  {
> +	/*
> +	 * Trace and unlock are combined in the traced unlock variant so
> +	 * the compiler does not need to preserve the lock pointer across
> +	 * the function call, avoiding callee-saved register save/restore
> +	 * on the hot path.
> +	 */
> +	if (tracepoint_enabled(contended_release)) {
> +		queued_read_unlock_traced(lock);

Same issue here about duplicating the code.
> +		return;
> +	}
> +
>  	__queued_read_unlock(lock);
>  }
>  
> @@ -129,6 +145,12 @@ static __always_inline void __queued_write_unlock(struct qrwlock *lock)
>   */
>  static inline void queued_write_unlock(struct qrwlock *lock)
>  {
> +	/* See comment in queued_read_unlock(). */
> +	if (tracepoint_enabled(contended_release)) {
> +		queued_write_unlock_traced(lock);

And here.

> +		return;
> +	}
> +
>  	__queued_write_unlock(lock);
>  }
>  
> diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
> index d2ef312a8611..5ae4b0372719 100644
> --- a/kernel/locking/qrwlock.c
> +++ b/kernel/locking/qrwlock.c
> @@ -90,3 +90,19 @@ void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock)
>  	trace_contention_end(lock, 0);
>  }
>  EXPORT_SYMBOL(queued_write_lock_slowpath);
> +
> +void __lockfunc queued_read_unlock_traced(struct qrwlock *lock)
> +{
> +	if (queued_rwlock_is_contended(lock))
> +		trace_call__contended_release(lock);

Just have this trace and not actually do any locking.


> +	__queued_read_unlock(lock);
> +}
> +EXPORT_SYMBOL(queued_read_unlock_traced);
> +
> +void __lockfunc queued_write_unlock_traced(struct qrwlock *lock)
> +{
> +	if (queued_rwlock_is_contended(lock))
> +		trace_call__contended_release(lock);

Ditto.

-- Steve

> +	__queued_write_unlock(lock);
> +}
> +EXPORT_SYMBOL(queued_write_unlock_traced);


^ permalink raw reply

* [PATCH] perf/ftrace: Fix WARNING in __unregister_ftrace_function
From: Rik van Riel @ 2026-05-13 16:16 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, kernel-team

perf_ftrace_function_unregister() unconditionally calls
unregister_ftrace_function() without checking whether the ftrace_ops
was ever successfully registered. This triggers a WARN_ON in
__unregister_ftrace_function() when the ops doesn't have
FTRACE_OPS_FL_ENABLED set.

This can happen during perf_event_alloc() error cleanup when
perf_trace_destroy() is called via __free_event() on an event whose
ftrace_ops registration failed or was already torn down by
perf_try_init_event()'s err_destroy path.

The call path is:
  perf_event_alloc() error cleanup
    -> __free_event()
      -> event->destroy() [tp_perf_event_destroy]
        -> perf_trace_destroy()
          -> perf_trace_event_close()
            -> TRACE_REG_PERF_CLOSE
              -> perf_ftrace_function_unregister()
                -> unregister_ftrace_function()
                  -> __unregister_ftrace_function()
                    -> WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))

Fix this by checking FTRACE_OPS_FL_ENABLED before attempting to
unregister. If the ops is not enabled, just free the filter and
return success.

Assisted-by: Claude:claude-opus-4.7 syzkaller
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 kernel/trace/trace_event_perf.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a6bb7577e8c5..b9e33ae24867 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -497,7 +497,14 @@ static int perf_ftrace_function_register(struct perf_event *event)
 static int perf_ftrace_function_unregister(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
-	int ret = unregister_ftrace_function(ops);
+	int ret;
+
+	if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) {
+		ftrace_free_filter(ops);
+		return 0;
+	}
+
+	ret = unregister_ftrace_function(ops);
 	ftrace_free_filter(ops);
 	return ret;
 }
-- 
2.53.0-Meta


^ permalink raw reply related

* Re: [PATCH] perf/ftrace: Fix WARNING in __unregister_ftrace_function
From: Steven Rostedt @ 2026-05-13 16:33 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, kernel-team
In-Reply-To: <20260513121607.14eab1f6@fangorn>

On Wed, 13 May 2026 12:16:07 -0400
Rik van Riel <riel@surriel.com> wrote:

> diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
> index a6bb7577e8c5..b9e33ae24867 100644
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -497,7 +497,14 @@ static int perf_ftrace_function_register(struct perf_event *event)
>  static int perf_ftrace_function_unregister(struct perf_event *event)
>  {
>  	struct ftrace_ops *ops = &event->ftrace_ops;
> -	int ret = unregister_ftrace_function(ops);
> +	int ret;
> +
> +	if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) {
> +		ftrace_free_filter(ops);
> +		return 0;
> +	}
> +
> +	ret = unregister_ftrace_function(ops);
>  	ftrace_free_filter(ops);
>  	return ret;
>  }
> -- 

Instead of duplicating code, what about doing:

static int perf_ftrace_function_unregister(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;
	int ret = 0;

	if (ops->flags & FTRACE_OPS_FL_ENABLED)
		ret = unregister_ftrace_function(ops);

	ftrace_free_filter(ops);
	return ret;
}


?

-- Steve

^ permalink raw reply

* Re: [RFC PATCH v3] bpf: introduce TAINT_UNSAFE_BPF for mutating helpers
From: Alexei Starovoitov @ 2026-05-13 16:35 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Aaron Tomlin, Jonathan Corbet, Song Liu, KP Singh, Matt Bobrowski,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Eduard,
	Kumar Kartikeya Dwivedi, Masami Hiramatsu, Shuah Khan, Jiri Olsa,
	Martin KaFai Lau, Yonghong Song, Mathieu Desnoyers, Randy Dunlap,
	neelx, sean, chjohnst, steve, mproche, nick.lange,
	open list:DOCUMENTATION, LKML, bpf, linux-trace-kernel
In-Reply-To: <20260513112307.53e77312@gandalf.local.home>

On Wed, May 13, 2026 at 8:23 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 13 May 2026 08:16:07 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > It's impossible to track all modifications.
> > See what sched-ext is doing.
> > What does it modify? Everything.
>
> What about just having a list of what BPF programs are loaded, what they
> may be attached to, and what kfuncs they are calling?

Ohh. These have been available forever.
Just bpftool prog, bpftool link, bpftool prog dump xlated

^ permalink raw reply

* Re: [RFC PATCH] trace: Introduce a new filter_pred "caller"
From: Steven Rostedt @ 2026-05-13 16:40 UTC (permalink / raw)
  To: Masami Hiramatsu (Google)
  Cc: Chen Jun, mathieu.desnoyers, linux-kernel, linux-trace-kernel
In-Reply-To: <20260512084750.c17a93d0ccdacddfd52d3d40@kernel.org>

On Tue, 12 May 2026 08:47:50 +0900
Masami Hiramatsu (Google) <mhiramat@kernel.org> wrote:

> On Fri, 8 May 2026 20:26:23 +0800
> Chen Jun <chenjun102@huawei.com> wrote:
> 
> > Low-level functions have many call paths, and sometimes
> > we only care about the calls on a specific call path.
> > Add a new filter to filter based on the call stack.
> > 
> > Usage:
> > 1. echo 'caller=="$function_name"' > events/../filter  
> 
> Thanks for interesting idea :)
> 
> BTW, we already have "stacktrace". Since this actually checks
> stacktrace, not caller, so I think we should reuse it.
> Also, I think OP_GLOB is more suitable for this case.
> (and more useful)

Actually, it's not a stack trace, it's a function that is called from other
functions. But since "caller" sounds like a direct called function (stack
trace of the first instance), I think perhaps it should be "called_within" or
something similar. :-/

Also, OP_GLOB can't work because it only works for a single function. At
the time of parsing, it finds the function (and should probably error out
if there's more than one function with a given name). It then records the
start and end address of the function so it only needs to find if one of
the entries in the stack trace is between the start and end of the function.

I don't think this is possible with GLOB. We don't want to do a search of
the functions when the event is triggered.

-- Steve

^ permalink raw reply

* Re: [RFC PATCH v3] bpf: introduce TAINT_UNSAFE_BPF for mutating helpers
From: Steven Rostedt @ 2026-05-13 16:41 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Aaron Tomlin, Jonathan Corbet, Song Liu, KP Singh, Matt Bobrowski,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Eduard,
	Kumar Kartikeya Dwivedi, Masami Hiramatsu, Shuah Khan, Jiri Olsa,
	Martin KaFai Lau, Yonghong Song, Mathieu Desnoyers, Randy Dunlap,
	neelx, sean, chjohnst, steve, mproche, nick.lange,
	open list:DOCUMENTATION, LKML, bpf, linux-trace-kernel
In-Reply-To: <CAADnVQLw+_NaOVeaKabuf085wNo_-6MAv8w0EDO3fBz3KCQT5g@mail.gmail.com>

On Wed, 13 May 2026 09:35:29 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Wed, May 13, 2026 at 8:23 AM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> > On Wed, 13 May 2026 08:16:07 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >  
> > > It's impossible to track all modifications.
> > > See what sched-ext is doing.
> > > What does it modify? Everything.  
> >
> > What about just having a list of what BPF programs are loaded, what they
> > may be attached to, and what kfuncs they are calling?  
> 
> Ohh. These have been available forever.
> Just bpftool prog, bpftool link, bpftool prog dump xlated

Ah thanks. That is useful.

-- Steve

^ permalink raw reply

* Re: [PATCH v2] perf/ftrace: Fix WARNING in __unregister_ftrace_function
From: Rik van Riel @ 2026-05-13 17:24 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, kernel-team
In-Reply-To: <20260513123344.05b6bcfe@gandalf.local.home>

On Wed, 13 May 2026 12:33:44 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> 
> Instead of duplicating code, what about doing:

That is much nicer. Thank you!

---8<---

From 9de86227b917c49315b7b67aac3a83afae8d792d Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@meta.com>
Date: Sat, 25 Apr 2026 03:33:54 -0700
Subject: [PATCH] perf/ftrace: Fix WARNING in __unregister_ftrace_function

perf_ftrace_function_unregister() unconditionally calls
unregister_ftrace_function() without checking whether the ftrace_ops
was ever successfully registered. This triggers a WARN_ON in
__unregister_ftrace_function() when the ops doesn't have
FTRACE_OPS_FL_ENABLED set.

This can happen during perf_event_alloc() error cleanup when
perf_trace_destroy() is called via __free_event() on an event whose
ftrace_ops registration failed or was already torn down by
perf_try_init_event()'s err_destroy path.

The call path is:
  perf_event_alloc() error cleanup
    -> __free_event()
      -> event->destroy() [tp_perf_event_destroy]
        -> perf_trace_destroy()
          -> perf_trace_event_close()
            -> TRACE_REG_PERF_CLOSE
              -> perf_ftrace_function_unregister()
                -> unregister_ftrace_function()
                  -> __unregister_ftrace_function()
                    -> WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))

Fix this by checking FTRACE_OPS_FL_ENABLED before attempting to
unregister. If the ops is not enabled, just free the filter and
return success.

Assisted-by: Claude:claude-opus-4.7 syzkaller
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 kernel/trace/trace_event_perf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a6bb7577e8c5..58e1b427b576 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -497,7 +497,11 @@ static int perf_ftrace_function_register(struct perf_event *event)
 static int perf_ftrace_function_unregister(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
-	int ret = unregister_ftrace_function(ops);
+	int ret = 0;
+
+	if (ops->flags & FTRACE_OPS_FL_ENABLED)
+		ret = unregister_ftrace_function(ops);
+
 	ftrace_free_filter(ops);
 	return ret;
 }
-- 
2.52.0



^ permalink raw reply related

* Re: [RFC v7 6/7] ext4: fast commit: add lock_updates tracepoint
From: Steven Rostedt @ 2026-05-13 17:57 UTC (permalink / raw)
  To: Li Chen
  Cc: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-kernel, linux-trace-kernel
In-Reply-To: <20260511084304.1559557-7-me@linux.beauty>

On Mon, 11 May 2026 16:43:01 +0800
Li Chen <me@linux.beauty> wrote:

> @@ -1346,8 +1383,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  	}
>  	ext4_fc_unlock(sb, alloc_ctx);
>  
> -	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
> +	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
> +				      &snap_inodes, &snap_ranges, &snap_err);
>  	jbd2_journal_unlock_updates(journal);
> +	if (trace_ext4_fc_lock_updates_enabled()) {
> +		locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
> +		trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns,
> +					   snap_inodes, snap_ranges, ret,
> +					   snap_err);

Please change this to:

		trace_call__ext4_fc_lock_updates(...)

As the "trace_ext4_fc_lock_updates_enabled()" already has the static
branch. No need to do it twice anymore. 7.1 introduced the
"trace_call__foo()" that will do a direct call to the tracepoints
registered, without the need for another static branch.

-- Steve


> +	}

^ permalink raw reply

* Re: [PATCH v2] perf/ftrace: Fix WARNING in __unregister_ftrace_function
From: Steven Rostedt @ 2026-05-13 18:11 UTC (permalink / raw)
  To: Rik van Riel
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, kernel-team
In-Reply-To: <20260513132445.24d8d9f6@fangorn>

On Wed, 13 May 2026 13:24:45 -0400
Rik van Riel <riel@surriel.com> wrote:

> From 9de86227b917c49315b7b67aac3a83afae8d792d Mon Sep 17 00:00:00 2001
> From: Rik van Riel <riel@meta.com>
> Date: Sat, 25 Apr 2026 03:33:54 -0700
> Subject: [PATCH] perf/ftrace: Fix WARNING in __unregister_ftrace_function
> 

Can you resend this as a normal patch so that it can be picked up by patchwork.

Otherwise it will be ignored.

Thanks,

-- Steve

^ permalink raw reply

* Re: [RFC PATCH v2 18/28] mm/damon: trace probe_hits
From: Steven Rostedt @ 2026-05-13 18:07 UTC (permalink / raw)
  To: SeongJae Park
  Cc: Andrew Morton, Masami Hiramatsu, Mathieu Desnoyers, damon,
	linux-kernel, linux-mm, linux-trace-kernel
In-Reply-To: <20260512143645.113201-19-sj@kernel.org>

On Tue, 12 May 2026 07:36:33 -0700
SeongJae Park <sj@kernel.org> wrote:

> Introduce a new tracepoint for exposing the per-region per-probe
> positive sample count via tracefs.
> 
> Signed-off-by: SeongJae Park <sj@kernel.org>
> ---
>  include/trace/events/damon.h | 36 ++++++++++++++++++++++++++++++++++++
>  mm/damon/core.c              |  7 +++++++
>  2 files changed, 43 insertions(+)
> 
> diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
> index 7e25f4469b81b..d7b94c7640217 100644
> --- a/include/trace/events/damon.h
> +++ b/include/trace/events/damon.h
> @@ -130,6 +130,42 @@ TRACE_EVENT(damon_monitor_intervals_tune,
>  	TP_printk("sample_us=%lu", __entry->sample_us)
>  );
>  
> +TRACE_EVENT(damon_aggregated_v2,
> +
> +	TP_PROTO(unsigned int target_id, struct damon_region *r,
> +		unsigned int nr_regions, unsigned int nr_probes),
> +
> +	TP_ARGS(target_id, r, nr_regions, nr_probes),
> +
> +	TP_STRUCT__entry(
> +		__field(unsigned long, target_id)
> +		__field(unsigned long, start)
> +		__field(unsigned long, end)
> +		__field(unsigned int, nr_regions)
> +		__field(unsigned int, nr_accesses)
> +		__field(unsigned int, age)
> +		__dynamic_array(unsigned char, probe_hits, nr_probes)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->target_id = target_id;
> +		__entry->start = r->ar.start;
> +		__entry->end = r->ar.end;
> +		__entry->nr_regions = nr_regions;
> +		__entry->nr_accesses = r->nr_accesses;
> +		__entry->age = r->age;
> +		memcpy(__get_dynamic_array(probe_hits), r->probe_hits,
> +			sizeof(*r->probe_hits) * nr_probes);
> +	),
> +
> +	TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u probe_hits=%s",
> +			__entry->target_id, __entry->nr_regions,
> +			__entry->start, __entry->end,
> +			__entry->nr_accesses, __entry->age,
> +			__print_hex(__get_dynamic_array(probe_hits),
> +				__get_dynamic_array_len(probe_hits)))
> +);
> +
>  TRACE_EVENT(damon_aggregated,
>  
>  	TP_PROTO(unsigned int target_id, struct damon_region *r,
> diff --git a/mm/damon/core.c b/mm/damon/core.c
> index fe6c789f2cecb..14b15c9876516 100644
> --- a/mm/damon/core.c
> +++ b/mm/damon/core.c
> @@ -1905,6 +1905,11 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
>  {
>  	struct damon_target *t;
>  	unsigned int ti = 0;	/* target's index */
> +	unsigned int nr_probes = 0;
> +	struct damon_probe *probe;
> +
> +	damon_for_each_probe(probe, c)
> +		nr_probes++;

Is the above logic needed when the tracepoint isn't enabled? If not, then you could add:

	if (trace_damon_aggregated_v2_enabled()) {
		damon_for_each_probe(probe, c)
			nr_probes++;
	}

And change the tracepoint to be a conditional tracepoint:

TRACE_EVENT_CONDITION(damon_aggregated_v2,

	TP_PROTO(..),

	TP_ARGS(..),

	TP_CONDITION(nr_probes > 0),

	[..]

And then the tracepoint is only triggered if nr_probes is greater than zero
(to handle races between the tracepoint being enabled in between the above
check and where it triggers).

-- Steve

>  
>  	damon_for_each_target(t, c) {
>  		struct damon_region *r;
> @@ -1913,6 +1918,8 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
>  			int i;
>  
>  			trace_damon_aggregated(ti, r, damon_nr_regions(t));
> +			trace_damon_aggregated_v2(ti, r, damon_nr_regions(t),
> +					nr_probes);
>  			damon_warn_fix_nr_accesses_corruption(r);
>  			r->last_nr_accesses = r->nr_accesses;
>  			r->nr_accesses = 0;


^ permalink raw reply

* [PATCH v2] tracing: Allow perf to read synthetic events
From: Steven Rostedt @ 2026-05-13 19:00 UTC (permalink / raw)
  To: LKML, Linux Trace Kernel
  Cc: Masami Hiramatsu, Mathieu Desnoyers, Arnaldo Carvalho de Melo,
	Jiri Olsa, Namhyung Kim, Peter Zijlstra, Ian Rogers

From: Steven Rostedt <rostedt@goodmis.org>

Currently, perf can not enable synthetic events. When it does, it either
causes a warning in the kernel or errors with "no such device".

Add the necessary code to allow perf to also attach to synthetic events.

Reported-by: Ian Rogers <irogers@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes since v1: https://patch.msgid.link/20251217113920.50b56246@gandalf.local.home

- Forward ported to v7.1-rc2

 kernel/trace/trace_events_synth.c | 121 +++++++++++++++++++++++-------
 1 file changed, 94 insertions(+), 27 deletions(-)

diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 39ac4eba0702..e6871230bde9 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -499,28 +499,19 @@ static unsigned int trace_stack(struct synth_trace_event *entry,
 	return len;
 }
 
-static void trace_event_raw_event_synth(void *__data,
-					u64 *var_ref_vals,
-					unsigned int *var_ref_idx)
+static __always_inline int get_field_size(struct synth_event *event,
+					  u64 *var_ref_vals,
+					  unsigned int *var_ref_idx)
 {
-	unsigned int i, n_u64, val_idx, len, data_size = 0;
-	struct trace_event_file *trace_file = __data;
-	struct synth_trace_event *entry;
-	struct trace_event_buffer fbuffer;
-	struct trace_buffer *buffer;
-	struct synth_event *event;
-	int fields_size = 0;
-
-	event = trace_file->event_call->data;
-
-	if (trace_trigger_soft_disabled(trace_file))
-		return;
+	int fields_size;
 
 	fields_size = event->n_u64 * sizeof(u64);
 
-	for (i = 0; i < event->n_dynamic_fields; i++) {
+	for (int i = 0; i < event->n_dynamic_fields; i++) {
 		unsigned int field_pos = event->dynamic_fields[i]->field_pos;
 		char *str_val;
+		int val_idx;
+		int len;
 
 		val_idx = var_ref_idx[field_pos];
 		str_val = (char *)(long)var_ref_vals[val_idx];
@@ -535,18 +526,18 @@ static void trace_event_raw_event_synth(void *__data,
 
 		fields_size += len;
 	}
+	return fields_size;
+}
 
-	/*
-	 * Avoid ring buffer recursion detection, as this event
-	 * is being performed within another event.
-	 */
-	buffer = trace_file->tr->array_buffer.buffer;
-	guard(ring_buffer_nest)(buffer);
-
-	entry = trace_event_buffer_reserve(&fbuffer, trace_file,
-					   sizeof(*entry) + fields_size);
-	if (!entry)
-		return;
+static __always_inline void write_synth_entry(struct synth_event *event,
+					      struct synth_trace_event *entry,
+					      u64 *var_ref_vals,
+					      unsigned int *var_ref_idx)
+{
+	int data_size = 0;
+	int i, n_u64;
+	int val_idx;
+	int len;
 
 	for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
 		val_idx = var_ref_idx[i];
@@ -587,10 +578,83 @@ static void trace_event_raw_event_synth(void *__data,
 			n_u64++;
 		}
 	}
+}
+
+static void trace_event_raw_event_synth(void *__data,
+					u64 *var_ref_vals,
+					unsigned int *var_ref_idx)
+{
+	struct trace_event_file *trace_file = __data;
+	struct synth_trace_event *entry;
+	struct trace_event_buffer fbuffer;
+	struct trace_buffer *buffer;
+	struct synth_event *event;
+	int fields_size;
+
+	event = trace_file->event_call->data;
+
+	if (trace_trigger_soft_disabled(trace_file))
+		return;
+
+	fields_size = get_field_size(event, var_ref_vals, var_ref_idx);
+
+	/*
+	 * Avoid ring buffer recursion detection, as this event
+	 * is being performed within another event.
+	 */
+	buffer = trace_file->tr->array_buffer.buffer;
+	guard(ring_buffer_nest)(buffer);
+
+	entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+					   sizeof(*entry) + fields_size);
+	if (!entry)
+		return;
+
+	write_synth_entry(event, entry, var_ref_vals, var_ref_idx);
 
 	trace_event_buffer_commit(&fbuffer);
 }
 
+#ifdef CONFIG_PERF_EVENTS
+static void perf_event_raw_event_synth(void *__data,
+				       u64 *var_ref_vals,
+				       unsigned int *var_ref_idx)
+{
+	struct trace_event_call *call = __data;
+	struct synth_trace_event *entry;
+	struct hlist_head *perf_head;
+	struct synth_event *event;
+	struct pt_regs *regs;
+	int fields_size;
+	size_t size;
+	int context;
+
+	event = call->data;
+
+	perf_head = this_cpu_ptr(call->perf_events);
+
+	if (!perf_head || hlist_empty(perf_head))
+		return;
+
+	fields_size = get_field_size(event, var_ref_vals, var_ref_idx);
+
+	size = ALIGN(sizeof(*entry) + fields_size, 8);
+
+	entry = perf_trace_buf_alloc(size, &regs, &context);
+
+	if (unlikely(!entry))
+		return;
+
+	write_synth_entry(event, entry, var_ref_vals, var_ref_idx);
+
+	perf_fetch_caller_regs(regs);
+
+	perf_trace_buf_submit(entry, size, context,
+			      call->event.type, 1, regs,
+			      perf_head, NULL);
+}
+#endif
+
 static void free_synth_event_print_fmt(struct trace_event_call *call)
 {
 	if (call) {
@@ -917,6 +981,9 @@ static int register_synth_event(struct synth_event *event)
 	call->flags = TRACE_EVENT_FL_TRACEPOINT;
 	call->class->reg = synth_event_reg;
 	call->class->probe = trace_event_raw_event_synth;
+#ifdef CONFIG_PERF_EVENTS
+	call->class->perf_probe = perf_event_raw_event_synth;
+#endif
 	call->data = event;
 	call->tp = event->tp;
 
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v6 0/7] locking: contended_release tracepoint instrumentation
From: Peter Zijlstra @ 2026-05-13 19:26 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-kernel, linux-mips,
	virtualization, linux-arch, linux-mm, linux-trace-kernel,
	kernel-team
In-Reply-To: <cover.1777999826.git.d@ilvokhin.com>

On Tue, May 05, 2026 at 05:09:29PM +0000, Dmitry Ilvokhin wrote:

> This series adds a contended_release tracepoint that fires on the
> holder side when a lock with waiters is released. This provides:
> 
> - Hold time estimation: when the holder's own acquisition was
>   contended, its contention_end (acquisition) and contended_release
>   can be correlated to measure how long the lock was held under
>   contention.
> 
> - The holder's stack at release time, which may differ from what perf lock
>   contention --lock-owner captures if the holder does significant work between
>   the waiter's arrival and the unlock.
> 
> Note: for reader/writer locks, the tracepoint fires for every reader
> releasing while a writer is waiting, not only for the last reader.

And for qspinlock.

I am really not sure this is worth the overhead for qspinlock / rwlock.

^ permalink raw reply

* Re: [PATCH v6 5/7] locking: Add contended_release tracepoint to qspinlock
From: Peter Zijlstra @ 2026-05-13 19:33 UTC (permalink / raw)
  To: Dmitry Ilvokhin
  Cc: Ingo Molnar, Will Deacon, Boqun Feng, Waiman Long,
	Thomas Bogendoerfer, Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Arnd Bergmann,
	Dennis Zhou, Tejun Heo, Christoph Lameter, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-kernel, linux-mips,
	virtualization, linux-arch, linux-mm, linux-trace-kernel,
	kernel-team, Paul E. McKenney
In-Reply-To: <5d7ea75ffe74a785e6b234ada9f23c6373d4b4c1.1777999826.git.d@ilvokhin.com>

On Tue, May 05, 2026 at 05:09:34PM +0000, Dmitry Ilvokhin wrote:
> Use the arch-overridable queued_spin_release(), introduced in the
> previous commit, to ensure the tracepoint works correctly across all
> architectures, including those with custom unlock implementations (e.g.
> x86 paravirt).
> 
> When the tracepoint is disabled, the only addition to the hot path is a
> single NOP instruction (the static branch). When enabled, the contention
> check, trace call, and unlock are combined in an out-of-line function to
> minimize hot path impact, avoiding the compiler needing to preserve the
> lock pointer in a callee-saved register across the trace call.
> 
> Binary size impact (x86_64, defconfig):
>   uninlined unlock (common case): +680 bytes  (+0.00%)
>   inlined unlock (worst case):    +83659 bytes (+0.21%)
> 
> The inlined unlock case could not be achieved through Kconfig options on
> x86_64 as PREEMPT_BUILD unconditionally selects UNINLINE_SPIN_UNLOCK on
> x86_64. The UNINLINE_SPIN_UNLOCK guards were manually inverted to force
> inline the unlock path and estimate the worst case binary size increase.
> 
> In practice, configurations with UNINLINE_SPIN_UNLOCK=n have already
> opted against binary size optimization, so the inlined worst case is
> unlikely to be a concern.

This is not quite accurate. You add the (5byte) NOP for the static
branch, but then you also add another 5 bytes for the CALL and at least
another 2 bytes (possibly 5) for a JMP back into the previous stream.
That is 12-15 bytes added to what was a single MOV instruction.

That is quite ludicrous.

I disagree that UNINLINE_SPIN_UNLOCK=n opts against binary size. For x86
the unlock is smaller than a function call.


I really don't see how this is worth it.

^ permalink raw reply

* Re: [PATCH v7 4/6] mm/memory-failure: short-circuit PG_reserved before get_hwpoison_page()
From: David Hildenbrand (Arm) @ 2026-05-13 19:49 UTC (permalink / raw)
  To: Breno Leitao, Miaohe Lin, Andrew Morton, Lorenzo Stoakes,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Shuah Khan, Naoya Horiguchi, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Liam R. Howlett
  Cc: linux-mm, linux-kernel, linux-doc, linux-kselftest,
	linux-trace-kernel, kernel-team, Lance Yang
In-Reply-To: <20260513-ecc_panic-v7-4-be2e578e61da@debian.org>

On 5/13/26 17:39, Breno Leitao wrote:
> The previous patch already classifies PG_reserved pages as
> MF_MSG_KERNEL through the long path: get_hwpoison_page() calls
> __get_hwpoison_page() which fails HWPoisonHandlable(), get_any_page()
> exhausts its shake_page() retry budget, and the resulting
> -ENOTRECOVERABLE is mapped to MF_MSG_KERNEL by the switch.  The
> outcome is correct but the work in between is wasted: shake_page()
> cannot turn a reserved page into a handlable one.

If really required, can we just move the check right there, into get_any_page() etc?

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH v7 1/6] mm/memory-failure: drop dead error_states[] entry for reserved pages
From: David Hildenbrand (Arm) @ 2026-05-13 20:10 UTC (permalink / raw)
  To: Breno Leitao, Miaohe Lin, Andrew Morton, Lorenzo Stoakes,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Shuah Khan, Naoya Horiguchi, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Liam R. Howlett
  Cc: linux-mm, linux-kernel, linux-doc, linux-kselftest,
	linux-trace-kernel, kernel-team
In-Reply-To: <20260513-ecc_panic-v7-1-be2e578e61da@debian.org>

On 5/13/26 17:39, Breno Leitao wrote:
> The first entry of error_states[],
> 
> 	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
> 
> is unreachable.  identify_page_state() has two callers, and neither
> one can dispatch a PG_reserved page to me_kernel():
> 
>   * memory_failure() reaches identify_page_state() only after
>     get_hwpoison_page() returned 1.  get_any_page() reaches that
>     return only via __get_hwpoison_page(), which gates the refcount
>     on HWPoisonHandlable().  HWPoisonHandlable() rejects PG_reserved
>     pages, so they fail with -EBUSY/-EIO long before
>     identify_page_state() runs.

You should clarify why they are rejected. There is no explicit check for
PG_reserved in there!

> 
>   * try_memory_failure_hugetlb() reaches identify_page_state() on
>     the MF_HUGETLB_IN_USED branch, but the page is necessarily a
>     hugetlb folio there.  The first table entry that matches a
>     hugetlb folio is { head, head, MF_MSG_HUGE, me_huge_page }, so
>     they dispatch to me_huge_page() before the (now-removed)
>     reserved entry would have matched, regardless of whether
>     PG_reserved happens to be set on the head page.

See hugetlb_folio_init_vmemmap(): we always clear PG_reserved for hugetlb folios
allocated from memblock.

> 
> me_kernel() never executes and the entry exists only to be matched
> against by code that cannot see it.
> 
> Drop the entry, the me_kernel() helper, and the now-unused
> "reserved" macro.  Leave the MF_MSG_KERNEL enum value in place: it
> remains part of the tracepoint and pr_err() string tables, and
> follow-on work to classify unrecoverable kernel pages can reuse it
> without churning the user-visible enum.
> 
> No functional change.
> 
> Suggested-by: David Hildenbrand <david@kernel.org>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  mm/memory-failure.c | 14 --------------
>  1 file changed, 14 deletions(-)
> 
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 866c4428ac7ef..49bcfbd04d213 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -992,17 +992,6 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
>  	return false;
>  }
>  
> -/*
> - * Error hit kernel page.
> - * Do nothing, try to be lucky and not touch this instead. For a few cases we
> - * could be more sophisticated.
> - */
> -static int me_kernel(struct page_state *ps, struct page *p)
> -{
> -	unlock_page(p);
> -	return MF_IGNORED;
> -}
> -
>  /*
>   * Page in unknown state. Do nothing.
>   * This is a catch-all in case we fail to make sense of the page state.
> @@ -1211,10 +1200,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
>  #define mlock		(1UL << PG_mlocked)
>  #define lru		(1UL << PG_lru)
>  #define head		(1UL << PG_head)
> -#define reserved	(1UL << PG_reserved)
>  
>  static struct page_state error_states[] = {
> -	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
>  	/*
>  	 * free pages are specially detected outside this table:
>  	 * PG_buddy pages only make a small fraction of all free pages.
> @@ -1246,7 +1233,6 @@ static struct page_state error_states[] = {
>  #undef mlock
>  #undef lru
>  #undef head
> -#undef reserved
>  
>  static void update_per_node_mf_stats(unsigned long pfn,
>  				     enum mf_result result)
> 

Yes, I think this should work.

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* [PATCH v2] perf/ftrace: Fix WARNING in __unregister_ftrace_function
From: Rik van Riel @ 2026-05-13 20:19 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Masami Hiramatsu, Mathieu Desnoyers, linux-kernel,
	linux-trace-kernel, kernel-team

perf_ftrace_function_unregister() unconditionally calls
unregister_ftrace_function() without checking whether the ftrace_ops
was ever successfully registered. This triggers a WARN_ON in
__unregister_ftrace_function() when the ops doesn't have
FTRACE_OPS_FL_ENABLED set.

This can happen during perf_event_alloc() error cleanup when
perf_trace_destroy() is called via __free_event() on an event whose
ftrace_ops registration failed or was already torn down by
perf_try_init_event()'s err_destroy path.

The call path is:
  perf_event_alloc() error cleanup
    -> __free_event()
      -> event->destroy() [tp_perf_event_destroy]
        -> perf_trace_destroy()
          -> perf_trace_event_close()
            -> TRACE_REG_PERF_CLOSE
              -> perf_ftrace_function_unregister()
                -> unregister_ftrace_function()
                  -> __unregister_ftrace_function()
                    -> WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))

Fix this by checking FTRACE_OPS_FL_ENABLED before attempting to
unregister. If the ops is not enabled, just free the filter and
return success.

Assisted-by: Claude:claude-opus-4.7 syzkaller
Signed-off-by: Rik van Riel <riel@surriel.com>
---
 kernel/trace/trace_event_perf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index a6bb7577e8c5..58e1b427b576 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -497,7 +497,11 @@ static int perf_ftrace_function_register(struct perf_event *event)
 static int perf_ftrace_function_unregister(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
-	int ret = unregister_ftrace_function(ops);
+	int ret = 0;
+
+	if (ops->flags & FTRACE_OPS_FL_ENABLED)
+		ret = unregister_ftrace_function(ops);
+
 	ftrace_free_filter(ops);
 	return ret;
 }
-- 
2.52.0



^ permalink raw reply related

* Re: [RFC PATCH v2 18/28] mm/damon: trace probe_hits
From: SeongJae Park @ 2026-05-14  0:06 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: SeongJae Park, Andrew Morton, Masami Hiramatsu, Mathieu Desnoyers,
	damon, linux-kernel, linux-mm, linux-trace-kernel
In-Reply-To: <20260513140732.2320c563@gandalf.local.home>

On Wed, 13 May 2026 14:07:32 -0400 Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 12 May 2026 07:36:33 -0700
> SeongJae Park <sj@kernel.org> wrote:
> 
> > Introduce a new tracepoint for exposing the per-region per-probe
> > positive sample count via tracefs.
> > 
> > Signed-off-by: SeongJae Park <sj@kernel.org>
> > ---
> >  include/trace/events/damon.h | 36 ++++++++++++++++++++++++++++++++++++
> >  mm/damon/core.c              |  7 +++++++
> >  2 files changed, 43 insertions(+)
> > 
> > diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h
> > index 7e25f4469b81b..d7b94c7640217 100644
> > --- a/include/trace/events/damon.h
> > +++ b/include/trace/events/damon.h
> > @@ -130,6 +130,42 @@ TRACE_EVENT(damon_monitor_intervals_tune,
> >  	TP_printk("sample_us=%lu", __entry->sample_us)
> >  );
> >  
> > +TRACE_EVENT(damon_aggregated_v2,
> > +
> > +	TP_PROTO(unsigned int target_id, struct damon_region *r,
> > +		unsigned int nr_regions, unsigned int nr_probes),
> > +
> > +	TP_ARGS(target_id, r, nr_regions, nr_probes),
> > +
> > +	TP_STRUCT__entry(
> > +		__field(unsigned long, target_id)
> > +		__field(unsigned long, start)
> > +		__field(unsigned long, end)
> > +		__field(unsigned int, nr_regions)
> > +		__field(unsigned int, nr_accesses)
> > +		__field(unsigned int, age)
> > +		__dynamic_array(unsigned char, probe_hits, nr_probes)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->target_id = target_id;
> > +		__entry->start = r->ar.start;
> > +		__entry->end = r->ar.end;
> > +		__entry->nr_regions = nr_regions;
> > +		__entry->nr_accesses = r->nr_accesses;
> > +		__entry->age = r->age;
> > +		memcpy(__get_dynamic_array(probe_hits), r->probe_hits,
> > +			sizeof(*r->probe_hits) * nr_probes);
> > +	),
> > +
> > +	TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u probe_hits=%s",
> > +			__entry->target_id, __entry->nr_regions,
> > +			__entry->start, __entry->end,
> > +			__entry->nr_accesses, __entry->age,
> > +			__print_hex(__get_dynamic_array(probe_hits),
> > +				__get_dynamic_array_len(probe_hits)))
> > +);
> > +
> >  TRACE_EVENT(damon_aggregated,
> >  
> >  	TP_PROTO(unsigned int target_id, struct damon_region *r,
> > diff --git a/mm/damon/core.c b/mm/damon/core.c
> > index fe6c789f2cecb..14b15c9876516 100644
> > --- a/mm/damon/core.c
> > +++ b/mm/damon/core.c
> > @@ -1905,6 +1905,11 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
> >  {
> >  	struct damon_target *t;
> >  	unsigned int ti = 0;	/* target's index */
> > +	unsigned int nr_probes = 0;
> > +	struct damon_probe *probe;
> > +
> > +	damon_for_each_probe(probe, c)
> > +		nr_probes++;
> 
> Is the above logic needed when the tracepoint isn't enabled? If not, then you could add:
> 
> 	if (trace_damon_aggregated_v2_enabled()) {
> 		damon_for_each_probe(probe, c)
> 			nr_probes++;
> 	}
> 
> And change the tracepoint to be a conditional tracepoint:
> 
> TRACE_EVENT_CONDITION(damon_aggregated_v2,
> 
> 	TP_PROTO(..),
> 
> 	TP_ARGS(..),
> 
> 	TP_CONDITION(nr_probes > 0),
> 
> 	[..]
> 
> And then the tracepoint is only triggered if nr_probes is greater than zero
> (to handle races between the tracepoint being enabled in between the above
> check and where it triggers).

It is not needed when the tracepoint isn't enabled.  I will follow your
suggestion in the next revision.  Thank you for the nice suggestion, Steven!

Btw, if you don't mind, may I ask your opinion about the name having '_v2'
suffix?  I chose that as an RFC phase temporal name that doesn't break the
compatibility, planning to give it a better name later.  But I start feeling
just extending the original one might be another option because tracepoints are
not strict stable ABI to my understanding, and the change of the TP_prink
format should be simple enough (append the probe_hits= part) that the user
space could reasonably deal with.


Thanks,
SJ

[...]

^ permalink raw reply

* Re: [RFC PATCH v2 18/28] mm/damon: trace probe_hits
From: Steven Rostedt @ 2026-05-14  0:32 UTC (permalink / raw)
  To: SeongJae Park
  Cc: Andrew Morton, Masami Hiramatsu, Mathieu Desnoyers, damon,
	linux-kernel, linux-mm, linux-trace-kernel
In-Reply-To: <20260514000611.147809-1-sj@kernel.org>

On Wed, 13 May 2026 17:06:10 -0700
SeongJae Park <sj@kernel.org> wrote:

> Btw, if you don't mind, may I ask your opinion about the name having '_v2'
> suffix?  I chose that as an RFC phase temporal name that doesn't break the
> compatibility, planning to give it a better name later.  But I start feeling
> just extending the original one might be another option because tracepoints are
> not strict stable ABI to my understanding, and the change of the TP_prink
> format should be simple enough (append the probe_hits= part) that the user
> space could reasonably deal with.

It's only a stable ABI if some useful userspace tooling depends on it.
Otherwise, feel free to change.

Nothing really should be parsing the TP_printk() format part as it is
really inefficient to do so. That's why I created libtraceevent and
libtracefs to do the parsing of the raw data for you.

-- Steve

^ permalink raw reply

* Re: [RFC PATCH v2 18/28] mm/damon: trace probe_hits
From: SeongJae Park @ 2026-05-14  2:08 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: SeongJae Park, Andrew Morton, Masami Hiramatsu, Mathieu Desnoyers,
	damon, linux-kernel, linux-mm, linux-trace-kernel
In-Reply-To: <20260513203237.3b1b3286@gandalf.local.home>

On Wed, 13 May 2026 20:32:37 -0400 Steven Rostedt <rostedt@goodmis.org> wrote:

> On Wed, 13 May 2026 17:06:10 -0700
> SeongJae Park <sj@kernel.org> wrote:
> 
> > Btw, if you don't mind, may I ask your opinion about the name having '_v2'
> > suffix?  I chose that as an RFC phase temporal name that doesn't break the
> > compatibility, planning to give it a better name later.  But I start feeling
> > just extending the original one might be another option because tracepoints are
> > not strict stable ABI to my understanding, and the change of the TP_prink
> > format should be simple enough (append the probe_hits= part) that the user
> > space could reasonably deal with.
> 
> It's only a stable ABI if some useful userspace tooling depends on it.
> Otherwise, feel free to change.

Makes perfect sense, thank you Steven!

> 
> Nothing really should be parsing the TP_printk() format part as it is
> really inefficient to do so. That's why I created libtraceevent and
> libtracefs to do the parsing of the raw data for you.

I will try to make DAMON user-space tool directly uses those.  At the moment,
it is lazily parsing trace-cmd or perf outputs.


Thanks,
SJ

[...]

^ permalink raw reply

* Re: [PATCH mm-unstable v17 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: Wei Yang @ 2026-05-14  3:10 UTC (permalink / raw)
  To: Lance Yang
  Cc: npache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
	aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, david,
	dev.jain, gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, liam, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260512074202.10253-1-lance.yang@linux.dev>

On Tue, May 12, 2026 at 03:42:02PM +0800, Lance Yang wrote:
>
>On Mon, May 11, 2026 at 12:58:04PM -0600, Nico Pache wrote:
>>generalize the order of the __collapse_huge_page_* and collapse_max_*
>>functions to support future mTHP collapse.
>>
>>The current mechanism for determining collapse with the
>>khugepaged_max_ptes_none value is not designed with mTHP in mind. This
>>raises a key design issue: if we support user defined max_pte_none values
>>(even those scaled by order), a collapse of a lower order can introduces
>>an feedback loop, or "creep", when max_ptes_none is set to a value greater
>>than HPAGE_PMD_NR / 2. [1]
>>
>>With this configuration, a successful collapse to order N will populate
>>enough pages to satisfy the collapse condition on order N+1 on the next
>>scan. This leads to unnecessary work and memory churn.
>>
>>To fix this issue introduce a helper function that will limit mTHP
>>collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
>>This effectively supports two modes: [2]
>>
>>- max_ptes_none=0: never collapses if it encounters an empty PTE or a PTE
>>  that maps the shared zeropage. Consequently, no memory bloat.
>>- max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
>>  available mTHP order.
>>
>>This removes the possiblilty of "creep", while not modifying any uAPI
>>expectations. A warning will be emitted if any non-supported
>>max_ptes_none value is configured with mTHP enabled.
>>
>>mTHP collapse will not honor the khugepaged_max_ptes_shared or
>>khugepaged_max_ptes_swap parameters, and will fail if it encounters a
>>shared or swapped entry.
>>
>>No functional changes in this patch; however it defines future behavior
>>for mTHP collapse.
>>
>>[1] - https://lore.kernel.org/all/e46ab3ab-a3d7-4fb7-9970-d0704bd5d05a@arm.com
>>[2] - https://lore.kernel.org/all/37375ace-5601-4d6c-9dac-d1c8268698e9@redhat.com
>>
>>Co-developed-by: Dev Jain <dev.jain@arm.com>
>>Signed-off-by: Dev Jain <dev.jain@arm.com>
>>Signed-off-by: Nico Pache <npache@redhat.com>
>>---
>> include/trace/events/huge_memory.h |   3 +-
>> mm/khugepaged.c                    | 117 ++++++++++++++++++++---------
>> 2 files changed, 85 insertions(+), 35 deletions(-)
>>
>>diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
>>index bcdc57eea270..443e0bd13fdb 100644
>>--- a/include/trace/events/huge_memory.h
>>+++ b/include/trace/events/huge_memory.h
>>@@ -39,7 +39,8 @@
>> 	EM( SCAN_STORE_FAILED,		"store_failed")			\
>> 	EM( SCAN_COPY_MC,		"copy_poisoned_page")		\
>> 	EM( SCAN_PAGE_FILLED,		"page_filled")			\
>>-	EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
>>+	EM(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")	\
>>+	EMe(SCAN_INVALID_PTES_NONE,	"invalid_ptes_none")
>> 
>> #undef EM
>> #undef EMe
>>diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>index f68853b3caa7..27465161fa6d 100644
>>--- a/mm/khugepaged.c
>>+++ b/mm/khugepaged.c
>>@@ -61,6 +61,7 @@ enum scan_result {
>> 	SCAN_COPY_MC,
>> 	SCAN_PAGE_FILLED,
>> 	SCAN_PAGE_DIRTY_OR_WRITEBACK,
>>+	SCAN_INVALID_PTES_NONE,
>> };
>> 
>> #define CREATE_TRACE_POINTS
>>@@ -353,37 +354,60 @@ static bool pte_none_or_zero(pte_t pte)
>>  * PTEs for the given collapse operation.
>>  * @cc: The collapse control struct
>>  * @vma: The vma to check for userfaultfd
>>+ * @order: The folio order being collapsed to
>>  *
>>  * Return: Maximum number of none-page or zero-page PTEs allowed for the
>>  * collapse operation.
>>  */
>>-static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
>>-		struct vm_area_struct *vma)
>>+static int collapse_max_ptes_none(struct collapse_control *cc,
>>+		struct vm_area_struct *vma, unsigned int order)
>> {
>>+	unsigned int max_ptes_none = khugepaged_max_ptes_none;
>> 	// If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
>
>One thing I still want to call out: kernel code usually uses C-style
>comments :)
>
>> 	if (vma && userfaultfd_armed(vma))
>> 		return 0;
>> 	// for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
>> 	if (!cc->is_khugepaged)
>> 		return HPAGE_PMD_NR;
>>-	// For all other cases repect the user defined maximum.
>>-	return khugepaged_max_ptes_none;
>>+	// for PMD collapse, respect the user defined maximum.
>>+	if (is_pmd_order(order))
>>+		return max_ptes_none;
>>+	/* Zero/non-present collapse disabled. */
>>+	if (!max_ptes_none)
>>+		return 0;
>>+	// for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
>>+	// scale the maximum number of PTEs to the order of the collapse.
>>+	if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
>>+		return (1 << order) - 1;
>>+
>>+	// We currently only support max_ptes_none values of 0 or KHUGEPAGED_MAX_PTES_LIMIT.
>>+	// Emit a warning and return -EINVAL.
>>+	pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
>>+		      KHUGEPAGED_MAX_PTES_LIMIT);
>
>Maybe fallback to 0 instead, as David suggested earlier?
>

It looks reasonable to fallback to 0.

But as the updated Document says in patch 14:

  For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) are supported. Any other
  value will emit a warning and no mTHP collapse will be attempted.

This is why it does like this now.

    mthp_collapse()
        max_ptes_none = collapse_max_ptes_none();
        if (max_ptes_none < 0)
            return collapsed;

>max_ptes_none is mostly legacy PMD THP behavior. mTHP is new, and any
>intermediate value in (0, KHUGEPAGED_MAX_PTES_LIMIT) would implicitly
>disable it :(
>

So it depends on what we want to do here :-)

For me, I would vote for fallback to 0.

>Treating those values as 0 feels like the least surprising behavior,
>IMHO. It also gives mTHP a cleaner staring point, rather than carry over
>all the old PMD knob semantics :)
>
>Otherwise, LGTM!
>Reviewed-by: Lance Yang <lance.yang@linux.dev>
>
>>+	return -EINVAL;

-- 
Wei Yang
Help you, Help me

^ permalink raw reply

* [RFC PATCH 0/3] trace: stack trace deduplication for ftrace ring buffer
From: Li Pengfei @ 2026-05-14  3:49 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: rostedt, mhiramat, linux-kernel, cmllamas, zhangbo56, lipengfei28

From: Pengfei Li <lipengfei28@xiaomi.com>

Hi Steven, all,

This series adds stack trace deduplication to ftrace, reducing ring
buffer usage by ~80% when stacktrace is enabled.

Problem:
When the stacktrace option is enabled, each trace event stores a full
kernel stack (typically 10-20 frames x 8 bytes = 80-160 bytes). On
production devices with 4-8MB trace buffers, this fills the buffer in
seconds, limiting the usefulness of boot-time tracing and always-on
performance monitoring.

Solution:
A lock-free hash map (modeled after tracing_map.c as suggested by
Steven [1]) that deduplicates stack traces. The ring buffer stores
only a 4-byte stack_id; full stacks are exported via tracefs.

Design (following tracing_map.c pattern):
- Lock-free insert via cmpxchg (NMI/IRQ/any context safe)
- Pre-allocated element pool (zero allocation on hot path)
- Linear probing with 2x over-provisioned table
- Per-trace_array instance support

We adopted the same lock-free algorithm as tracing_map but with a
purpose-built data structure, because tracing_map's API is designed
for histogram aggregation with fixed-size keys and sum/var fields,
while our use case requires variable-length stack traces with
reference counting.

Test results (ARM64, Qualcomm SM8850, kernel 6.12):
- kmem_cache_alloc events, 1 second capture:
  774 unique stacks, 8264 hits, 0 drops, 100% hit rate
  Ring buffer savings: 795KB -> 176KB (78% reduction)
- Function tracer, 3 seconds:
  3632 unique stacks, 25466 hits, 0 drops
  Ring buffer savings: 2.5MB -> 653KB (74% reduction)

Note: An earlier prototype using rhashtable crashed in IRQ context
(BUG at rhashtable.h:912), which led us to adopt the tracing_map
cmpxchg-based approach.

Usage:
  echo 1 > /sys/kernel/debug/tracing/options/stackmap
  echo 1 > /sys/kernel/debug/tracing/options/stacktrace
  # trace output: <stack_id 42>
  # resolve:      cat /sys/kernel/debug/tracing/stack_map

[1] https://lore.kernel.org/all/20260513085145.30dd23e0@fedora/

Pengfei Li (3):
  trace: add lock-free stackmap for stack trace deduplication
  trace: integrate stackmap into ftrace stack recording path
  trace: add documentation, selftest and tooling for stackmap

 Documentation/trace/ftrace-stackmap.rst       | 111 ++++
 kernel/trace/Kconfig                          |  21 +
 kernel/trace/Makefile                         |   1 +
 kernel/trace/trace.c                          |  46 ++
 kernel/trace/trace.h                          |  16 +
 kernel/trace/trace_entries.h                  |  15 +
 kernel/trace/trace_output.c                   |  23 +
 kernel/trace/trace_stackmap.c                 | 569 ++++++++++++++++++
 kernel/trace/trace_stackmap.h                 |  54 ++
 .../ftrace/test.d/ftrace/stackmap-basic.tc    |  74 +++
 tools/tracing/stackmap_dump.py                | 120 ++++
 11 files changed, 1050 insertions(+)
 create mode 100644 Documentation/trace/ftrace-stackmap.rst
 create mode 100644 kernel/trace/trace_stackmap.c
 create mode 100644 kernel/trace/trace_stackmap.h
 create mode 100755 tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc
 create mode 100755 tools/tracing/stackmap_dump.py

-- 
2.34.1


^ permalink raw reply

* [RFC PATCH 1/3] trace: add lock-free stackmap for stack trace deduplication
From: Li Pengfei @ 2026-05-14  3:49 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: rostedt, mhiramat, linux-kernel, cmllamas, zhangbo56, lipengfei28
In-Reply-To: <20260514034916.2162517-1-lipengfei28@xiaomi.com>

From: Pengfei Li <lipengfei28@xiaomi.com>

Add a lock-free hash map (ftrace_stackmap) that deduplicates kernel
stack traces for the ftrace ring buffer. Instead of storing full
stack traces (80-160 bytes each) in the ring buffer for every event,
ftrace can store a 4-byte stack_id when the stackmap option is enabled.

The implementation is modeled after tracing_map.c (used by hist
triggers), using the same lock-free design based on Dr. Cliff Click's
non-blocking hash table algorithm:

- Lock-free insert via cmpxchg (safe in NMI/IRQ/any context)
- Pre-allocated element pool (zero allocation on hot path)
- Linear probing with 2x over-provisioned table
- Per-trace_array instance support

The stackmap is exported via three tracefs nodes:
- stack_map: text export with symbol resolution
- stack_map_stat: statistics (entries, hits, drops, hit_rate)
- stack_map_bin: binary export for efficient userspace consumption

Kernel command line parameter:
- ftrace_stackmap.bits=N: set map capacity (2^N unique stacks)

Test results on ARM64 (SM8850, Android 16, kernel 6.12):
- 774 unique stacks from kmem_cache_alloc in 1 second
- 100% hit rate, 0 drops
- 92% hit rate under heavy load (all kmem events)

Signed-off-by: Pengfei Li <lipengfei28@xiaomi.com>
---
 kernel/trace/Kconfig          |  21 ++
 kernel/trace/Makefile         |   1 +
 kernel/trace/trace_stackmap.c | 569 ++++++++++++++++++++++++++++++++++
 kernel/trace/trace_stackmap.h |  54 ++++
 4 files changed, 645 insertions(+)
 create mode 100644 kernel/trace/trace_stackmap.c
 create mode 100644 kernel/trace/trace_stackmap.h

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e130da35808f..2a63fd2c9a96 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -412,6 +412,27 @@ config STACK_TRACER
 
 	  Say N if unsure.
 
+config FTRACE_STACKMAP
+	bool "Ftrace stack map deduplication"
+	depends on TRACING
+	depends on STACKTRACE
+	select KALLSYMS
+	help
+	  This enables a global stack trace hash table for ftrace, inspired
+	  by eBPF's BPF_MAP_TYPE_STACK_TRACE. When enabled, ftrace can store
+	  only a stack_id in the ring buffer instead of the full stack trace,
+	  significantly reducing trace buffer usage when the same call stacks
+	  appear repeatedly.
+
+	  The deduplicated stacks are exported via:
+	    /sys/kernel/debug/tracing/stack_map
+
+	  Writing to this file resets the stack map. Reading shows all unique
+	  stacks with their stack_id and reference count.
+
+	  Say Y if you want to reduce ftrace buffer usage for stack traces.
+	  Say N if unsure.
+
 config TRACE_PREEMPT_TOGGLE
 	bool
 	help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1decdce8cbef..f1b6175099cc 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -85,6 +85,7 @@ obj-$(CONFIG_HWLAT_TRACER) += trace_hwlat.o
 obj-$(CONFIG_OSNOISE_TRACER) += trace_osnoise.o
 obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
+obj-$(CONFIG_FTRACE_STACKMAP) += trace_stackmap.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
diff --git a/kernel/trace/trace_stackmap.c b/kernel/trace/trace_stackmap.c
new file mode 100644
index 000000000000..c402e7e7f902
--- /dev/null
+++ b/kernel/trace/trace_stackmap.c
@@ -0,0 +1,569 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ftrace Stack Map - Lock-free stack trace deduplication for ftrace
+ *
+ * Modeled after tracing_map.c (used by hist triggers), this provides
+ * a lock-free hash map optimized for the ftrace hot path. The design
+ * is based on Dr. Cliff Click's non-blocking hash table algorithm.
+ *
+ * Key properties:
+ * - Lock-free insert via cmpxchg (safe in NMI/IRQ/any context)
+ * - Pre-allocated element pool (zero allocation on hot path)
+ * - Linear probing with 2x over-provisioned table
+ * - Per-trace_array instance support
+ *
+ * The 32-bit jhash of the stack IPs is used as the hash table key.
+ * On hash collision (different stacks, same 32-bit hash), linear
+ * probing finds the next slot. Full stack comparison (memcmp) is
+ * used to confirm matches.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/vmalloc.h>
+#include <linux/atomic.h>
+#include <linux/random.h>
+
+#include "trace.h"
+#include "trace_stackmap.h"
+
+/*
+ * Each pre-allocated element holds one unique stack trace.
+ * Fixed size: MAX_DEPTH entries regardless of actual depth.
+ */
+struct stackmap_elt {
+	u32		nr;		/* actual number of IPs */
+	atomic_t	ref_count;
+	unsigned long	ips[FTRACE_STACKMAP_MAX_DEPTH];
+};
+
+/*
+ * Hash table entry: a 32-bit key (jhash of stack) + pointer to elt.
+ * key == 0 means the slot is free.
+ */
+struct stackmap_entry {
+	u32			key;	/* 0 = free, non-zero = jhash */
+	struct stackmap_elt	*val;	/* NULL until fully published */
+};
+
+struct ftrace_stackmap {
+	unsigned int		map_bits;
+	unsigned int		map_size;	/* 1 << (map_bits + 1) */
+	unsigned int		max_elts;	/* 1 << map_bits */
+	atomic_t		next_elt;	/* index into elts pool */
+	struct stackmap_entry	*entries;	/* hash table */
+	struct stackmap_elt	**elts;		/* pre-allocated pool */
+	atomic_t		resetting;
+	atomic64_t		hits;
+	atomic64_t		drops;
+};
+
+static u32 stackmap_hash_seed;
+
+static unsigned int stackmap_map_bits = 14;	/* 16384 elts, 32768 slots */
+static int __init stackmap_bits_setup(char *str)
+{
+	unsigned long val;
+
+	if (kstrtoul(str, 0, &val))
+		return -EINVAL;
+	val = clamp_val(val, 10, 20);	/* 1K - 1M elts */
+	stackmap_map_bits = val;
+	return 0;
+}
+early_param("ftrace_stackmap.bits", stackmap_bits_setup);
+
+/* --- Element pool --- */
+
+static struct stackmap_elt *stackmap_get_elt(struct ftrace_stackmap *smap)
+{
+	int idx;
+
+	idx = atomic_fetch_add_unless(&smap->next_elt, 1, smap->max_elts);
+	if (idx < smap->max_elts)
+		return smap->elts[idx];
+	return NULL;
+}
+
+static int stackmap_alloc_elts(struct ftrace_stackmap *smap)
+{
+	unsigned int i;
+
+	smap->elts = vzalloc(sizeof(*smap->elts) * smap->max_elts);
+	if (!smap->elts)
+		return -ENOMEM;
+
+	for (i = 0; i < smap->max_elts; i++) {
+		smap->elts[i] = kzalloc(sizeof(struct stackmap_elt), GFP_KERNEL);
+		if (!smap->elts[i])
+			goto fail;
+	}
+	return 0;
+fail:
+	while (i--)
+		kfree(smap->elts[i]);
+	vfree(smap->elts);
+	smap->elts = NULL;
+	return -ENOMEM;
+}
+
+static void stackmap_free_elts(struct ftrace_stackmap *smap)
+{
+	unsigned int i;
+
+	if (!smap->elts)
+		return;
+	for (i = 0; i < smap->max_elts; i++)
+		kfree(smap->elts[i]);
+	vfree(smap->elts);
+	smap->elts = NULL;
+}
+
+/* --- Create / Destroy / Reset --- */
+
+struct ftrace_stackmap *ftrace_stackmap_create(void)
+{
+	struct ftrace_stackmap *smap;
+	static bool seed_initialized;
+	int err;
+
+	smap = kzalloc(sizeof(*smap), GFP_KERNEL);
+	if (!smap)
+		return ERR_PTR(-ENOMEM);
+
+	smap->map_bits = stackmap_map_bits;
+	smap->max_elts = 1 << smap->map_bits;
+	smap->map_size = smap->max_elts * 2;	/* 2x over-provision */
+
+	smap->entries = vzalloc(sizeof(*smap->entries) * smap->map_size);
+	if (!smap->entries) {
+		kfree(smap);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = stackmap_alloc_elts(smap);
+	if (err) {
+		vfree(smap->entries);
+		kfree(smap);
+		return ERR_PTR(err);
+	}
+
+	atomic_set(&smap->next_elt, 0);
+	atomic_set(&smap->resetting, 0);
+	atomic64_set(&smap->hits, 0);
+	atomic64_set(&smap->drops, 0);
+
+	if (!seed_initialized) {
+		stackmap_hash_seed = get_random_u32();
+		seed_initialized = true;
+	}
+
+	return smap;
+}
+
+void ftrace_stackmap_destroy(struct ftrace_stackmap *smap)
+{
+	if (!smap || IS_ERR(smap))
+		return;
+	stackmap_free_elts(smap);
+	vfree(smap->entries);
+	kfree(smap);
+}
+
+void ftrace_stackmap_reset(struct ftrace_stackmap *smap)
+{
+	unsigned int i;
+
+	if (!smap)
+		return;
+
+	/*
+	 * Reset protocol:
+	 *
+	 * 1. Set resetting=1 so get_id() returns -EINVAL immediately.
+	 *    get_id() callers in NMI/IRQ context will see this and bail
+	 *    out before touching entries or elts.
+	 *
+	 * 2. smp_mb() ensures the resetting store is visible to all CPUs
+	 *    before we start clearing entries.  Any get_id() that already
+	 *    passed the resetting check will complete its cmpxchg and
+	 *    WRITE_ONCE(entry->val) before we memset, because:
+	 *    - the cmpxchg claims the slot atomically
+	 *    - WRITE_ONCE(entry->val) happens before we clear entries
+	 *    We accept that a handful of in-flight inserts may write into
+	 *    entries that we are about to clear; those entries will simply
+	 *    be wiped by the memset below, which is safe.
+	 *
+	 * 3. Clear entries table, then reset elt pool.
+	 *
+	 * 4. Clear resetting=0 with another smp_mb() so new get_id()
+	 *    calls see a fully reset map.
+	 */
+	atomic_set(&smap->resetting, 1);
+	smp_mb();
+
+	/* Clear hash table */
+	memset(smap->entries, 0, sizeof(*smap->entries) * smap->map_size);
+
+	/* Reset elt pool */
+	for (i = 0; i < smap->max_elts; i++)
+		memset(smap->elts[i], 0, sizeof(struct stackmap_elt));
+
+	atomic_set(&smap->next_elt, 0);
+	atomic64_set(&smap->hits, 0);
+	atomic64_set(&smap->drops, 0);
+
+	smp_mb();
+	atomic_set(&smap->resetting, 0);
+}
+
+/* --- Core: get_id (lock-free, NMI-safe) --- */
+
+int ftrace_stackmap_get_id(struct ftrace_stackmap *smap,
+			   unsigned long *ips, unsigned int nr_entries)
+{
+	u32 key_hash, idx, test_key, trace_len;
+	struct stackmap_entry *entry;
+	struct stackmap_elt *val;
+	int dup_try = 0;
+
+	if (!smap || !nr_entries || atomic_read(&smap->resetting))
+		return -EINVAL;
+	if (nr_entries > FTRACE_STACKMAP_MAX_DEPTH)
+		nr_entries = FTRACE_STACKMAP_MAX_DEPTH;
+
+	trace_len = nr_entries * sizeof(unsigned long);
+	/*
+	 * jhash2() requires the length in u32 units and the data to be
+	 * u32-aligned. On 64-bit kernels sizeof(unsigned long)==8, so
+	 * trace_len is always a multiple of 8 (hence of 4). Use jhash2
+	 * directly; the cast to u32* is safe because ips[] is naturally
+	 * aligned to sizeof(unsigned long) >= 4.
+	 */
+	key_hash = jhash2((const u32 *)ips, trace_len / sizeof(u32),
+			  stackmap_hash_seed);
+	if (key_hash == 0)
+		key_hash = 1;	/* 0 means free slot */
+
+	idx = key_hash >> (32 - (smap->map_bits + 1));
+
+	while (1) {
+		idx &= (smap->map_size - 1);
+		entry = &smap->entries[idx];
+		test_key = entry->key;
+
+		if (test_key && test_key == key_hash) {
+			val = READ_ONCE(entry->val);
+			if (val && val->nr == nr_entries &&
+			    memcmp(val->ips, ips, trace_len) == 0) {
+				atomic_inc(&val->ref_count);
+				atomic64_inc(&smap->hits);
+				return (int)idx;
+			} else if (unlikely(!val)) {
+				/* Another CPU is mid-insert; retry */
+				dup_try++;
+				if (dup_try > smap->map_size) {
+					atomic64_inc(&smap->drops);
+					break;
+				}
+				continue;
+			}
+		}
+
+		if (!test_key) {
+			/* Free slot: try to claim it */
+			if (!cmpxchg(&entry->key, 0, key_hash)) {
+				struct stackmap_elt *elt;
+
+				elt = stackmap_get_elt(smap);
+				if (!elt) {
+					/*
+					 * Pool exhausted. We claimed this slot with
+					 * cmpxchg but cannot fill it. Leave key set
+					 * so the slot stays "claimed but empty" —
+					 * future lookups will skip it (val == NULL
+					 * triggers the mid-insert retry path which
+					 * will eventually drop). This is safer than
+					 * writing key=0 without cmpxchg, which could
+					 * race with another CPU's cmpxchg on the same
+					 * slot.
+					 */
+					atomic64_inc(&smap->drops);
+					break;
+				}
+
+				elt->nr = nr_entries;
+				atomic_set(&elt->ref_count, 1);
+				memcpy(elt->ips, ips, trace_len);
+
+				/* Ensure elt is fully visible before publish */
+				smp_wmb();
+				WRITE_ONCE(entry->val, elt);
+				atomic64_inc(&smap->hits);
+				return (int)idx;
+			} else {
+				/* cmpxchg failed; someone else claimed it */
+				dup_try++;
+				continue;
+			}
+		}
+
+		idx++;
+		dup_try++;
+		if (dup_try > smap->map_size) {
+			atomic64_inc(&smap->drops);
+			break;
+		}
+	}
+
+	return -ENOSPC;
+}
+
+/* --- Text export: /sys/kernel/debug/tracing/stack_map --- */
+
+struct stackmap_seq_private {
+	struct ftrace_stackmap	*smap;
+};
+
+static void *stackmap_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct stackmap_seq_private *priv = m->private;
+	struct ftrace_stackmap *smap = priv->smap;
+	u32 i;
+
+	if (!smap)
+		return NULL;
+	for (i = *pos; i < smap->map_size; i++) {
+		if (smap->entries[i].key && smap->entries[i].val) {
+			*pos = i;
+			return &smap->entries[i];
+		}
+	}
+	return NULL;
+}
+
+static void *stackmap_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct stackmap_seq_private *priv = m->private;
+	struct ftrace_stackmap *smap = priv->smap;
+	u32 i;
+
+	for (i = *pos + 1; i < smap->map_size; i++) {
+		if (smap->entries[i].key && smap->entries[i].val) {
+			*pos = i;
+			return &smap->entries[i];
+		}
+	}
+	*pos = i;
+	return NULL;
+}
+
+static void stackmap_seq_stop(struct seq_file *m, void *v) { }
+
+static int stackmap_seq_show(struct seq_file *m, void *v)
+{
+	struct stackmap_entry *entry = v;
+	struct stackmap_elt *elt = entry->val;
+	struct stackmap_seq_private *priv = m->private;
+	u32 idx = entry - priv->smap->entries;
+	u32 i;
+
+	if (!elt)
+		return 0;
+
+	seq_printf(m, "stack_id %u [ref %u, depth %u]\n",
+		   idx, atomic_read(&elt->ref_count), elt->nr);
+	for (i = 0; i < elt->nr; i++)
+		seq_printf(m, "  [%u] %pS\n", i, (void *)elt->ips[i]);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static const struct seq_operations stackmap_seq_ops = {
+	.start	= stackmap_seq_start,
+	.next	= stackmap_seq_next,
+	.stop	= stackmap_seq_stop,
+	.show	= stackmap_seq_show,
+};
+
+static int stackmap_open(struct inode *inode, struct file *file)
+{
+	struct stackmap_seq_private *priv;
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open_private(file, &stackmap_seq_ops,
+			       sizeof(struct stackmap_seq_private));
+	if (ret)
+		return ret;
+	m = file->private_data;
+	priv = m->private;
+	priv->smap = inode->i_private;
+	return 0;
+}
+
+static ssize_t stackmap_write(struct file *file, const char __user *ubuf,
+			      size_t count, loff_t *ppos)
+{
+	struct seq_file *m = file->private_data;
+	struct stackmap_seq_private *priv = m->private;
+	char buf[8];
+	size_t n = min(count, sizeof(buf) - 1);
+
+	if (copy_from_user(buf, ubuf, n))
+		return -EFAULT;
+	buf[n] = '\0';
+	if (n == 0 || (buf[0] != '0' && strncmp(buf, "reset", 5) != 0))
+		return -EINVAL;
+
+	ftrace_stackmap_reset(priv->smap);
+	return count;
+}
+
+const struct file_operations ftrace_stackmap_fops = {
+	.open		= stackmap_open,
+	.read		= seq_read,
+	.write		= stackmap_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release_private,
+};
+
+/* --- Stats --- */
+
+static int stackmap_stat_show(struct seq_file *m, void *v)
+{
+	struct ftrace_stackmap *smap = m->private;
+	u32 entries;
+	u64 hits, drops;
+
+	if (!smap) {
+		seq_puts(m, "stackmap not initialized\n");
+		return 0;
+	}
+
+	entries = atomic_read(&smap->next_elt);
+	hits = atomic64_read(&smap->hits);
+	drops = atomic64_read(&smap->drops);
+
+	seq_printf(m, "entries:    %u / %u\n", entries, smap->max_elts);
+	seq_printf(m, "table_size: %u\n", smap->map_size);
+	seq_printf(m, "hits:       %llu\n", hits);
+	seq_printf(m, "drops:      %llu\n", drops);
+	if (hits + drops > 0)
+		seq_printf(m, "hit_rate:   %llu%%\n",
+			   hits * 100 / (hits + drops));
+	return 0;
+}
+
+static int stackmap_stat_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, stackmap_stat_show, inode->i_private);
+}
+
+const struct file_operations ftrace_stackmap_stat_fops = {
+	.open		= stackmap_stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+/* --- Binary export --- */
+
+struct stackmap_bin_snapshot {
+	size_t	size;
+	char	data[];
+};
+
+static int stackmap_bin_open(struct inode *inode, struct file *file)
+{
+	struct ftrace_stackmap *smap = inode->i_private;
+	struct stackmap_bin_snapshot *snap;
+	struct ftrace_stackmap_bin_header *hdr;
+	size_t alloc_size, off;
+	u32 i, nr_stacks;
+
+	if (!smap)
+		return -ENODEV;
+
+	/*
+	 * Allocate based on actual entry count, not max_elts worst case.
+	 * Each entry needs a header struct plus up to MAX_DEPTH u64 IPs.
+	 * Add 1 to nr_entries to avoid zero-size alloc on empty map.
+	 */
+	{
+		u32 nr_entries = atomic_read(&smap->next_elt);
+
+		alloc_size = sizeof(*hdr) + (nr_entries + 1) *
+			     (sizeof(struct ftrace_stackmap_bin_entry) +
+			      FTRACE_STACKMAP_MAX_DEPTH * sizeof(u64));
+	}
+
+	snap = vmalloc(sizeof(*snap) + alloc_size);
+	if (!snap)
+		return -ENOMEM;
+
+	hdr = (struct ftrace_stackmap_bin_header *)snap->data;
+	hdr->magic = FTRACE_STACKMAP_BIN_MAGIC;
+	hdr->version = FTRACE_STACKMAP_BIN_VERSION;
+	hdr->reserved = 0;
+	off = sizeof(*hdr);
+	nr_stacks = 0;
+
+	for (i = 0; i < smap->map_size; i++) {
+		struct stackmap_entry *entry = &smap->entries[i];
+		struct stackmap_elt *elt;
+		struct ftrace_stackmap_bin_entry *e;
+		u64 *ips_out;
+		u32 k;
+
+		if (!entry->key)
+			continue;
+		elt = READ_ONCE(entry->val);
+		if (!elt)
+			continue;
+
+		e = (struct ftrace_stackmap_bin_entry *)(snap->data + off);
+		e->stack_id = i;
+		e->nr = elt->nr;
+		e->ref_count = atomic_read(&elt->ref_count);
+		e->reserved = 0;
+		off += sizeof(*e);
+
+		ips_out = (u64 *)(snap->data + off);
+		for (k = 0; k < elt->nr; k++)
+			ips_out[k] = (u64)elt->ips[k];
+		off += elt->nr * sizeof(u64);
+		nr_stacks++;
+	}
+
+	hdr->nr_stacks = nr_stacks;
+	snap->size = off;
+	file->private_data = snap;
+	return 0;
+}
+
+static ssize_t stackmap_bin_read(struct file *file, char __user *ubuf,
+				 size_t count, loff_t *ppos)
+{
+	struct stackmap_bin_snapshot *snap = file->private_data;
+
+	if (!snap)
+		return -EINVAL;
+	return simple_read_from_buffer(ubuf, count, ppos, snap->data, snap->size);
+}
+
+static int stackmap_bin_release(struct inode *inode, struct file *file)
+{
+	vfree(file->private_data);
+	return 0;
+}
+
+const struct file_operations ftrace_stackmap_bin_fops = {
+	.open		= stackmap_bin_open,
+	.read		= stackmap_bin_read,
+	.llseek		= default_llseek,
+	.release	= stackmap_bin_release,
+};
diff --git a/kernel/trace/trace_stackmap.h b/kernel/trace/trace_stackmap.h
new file mode 100644
index 000000000000..74ad649a79f7
--- /dev/null
+++ b/kernel/trace/trace_stackmap.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TRACE_STACKMAP_H
+#define _TRACE_STACKMAP_H
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+
+#define FTRACE_STACKMAP_MAX_DEPTH	64
+
+/* Binary export format */
+#define FTRACE_STACKMAP_BIN_MAGIC	0x464D5342	/* 'FSMB' */
+#define FTRACE_STACKMAP_BIN_VERSION	2
+
+struct ftrace_stackmap_bin_header {
+	u32 magic;
+	u32 version;
+	u32 nr_stacks;
+	u32 reserved;
+};
+
+struct ftrace_stackmap_bin_entry {
+	u32 stack_id;
+	u32 nr;
+	u32 ref_count;
+	u32 reserved;
+	/* followed by u64 ips[nr] */
+};
+
+#ifdef CONFIG_FTRACE_STACKMAP
+
+struct ftrace_stackmap;
+
+struct ftrace_stackmap *ftrace_stackmap_create(void);
+void ftrace_stackmap_destroy(struct ftrace_stackmap *smap);
+int ftrace_stackmap_get_id(struct ftrace_stackmap *smap,
+			   unsigned long *ips, unsigned int nr_entries);
+void ftrace_stackmap_reset(struct ftrace_stackmap *smap);
+
+extern const struct file_operations ftrace_stackmap_fops;
+extern const struct file_operations ftrace_stackmap_stat_fops;
+extern const struct file_operations ftrace_stackmap_bin_fops;
+
+#else
+
+struct ftrace_stackmap;
+static inline struct ftrace_stackmap *ftrace_stackmap_create(void) { return NULL; }
+static inline void ftrace_stackmap_destroy(struct ftrace_stackmap *s) { }
+static inline int ftrace_stackmap_get_id(struct ftrace_stackmap *s,
+					 unsigned long *ips, unsigned int n)
+{ return -ENOSYS; }
+static inline void ftrace_stackmap_reset(struct ftrace_stackmap *s) { }
+
+#endif
+#endif /* _TRACE_STACKMAP_H */
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH 2/3] trace: integrate stackmap into ftrace stack recording path
From: Li Pengfei @ 2026-05-14  3:49 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: rostedt, mhiramat, linux-kernel, cmllamas, zhangbo56, lipengfei28
In-Reply-To: <20260514034916.2162517-1-lipengfei28@xiaomi.com>

From: Pengfei Li <lipengfei28@xiaomi.com>

Add TRACE_STACK_ID event type and integrate ftrace_stackmap into
__ftrace_trace_stack(). When the 'stackmap' trace option is enabled,
the stack recording path stores a 4-byte stack_id in the ring buffer
instead of the full stack trace.

Changes:
- New TRACE_STACK_ID in trace_type enum
- New stack_id_entry in trace_entries.h (just 'int stack_id')
- New TRACE_ITER_STACKMAP trace option flag
- Modified __ftrace_trace_stack() to call ftrace_stackmap_get_id()
  when stackmap option is active
- Added stack_id print handler in trace_output.c
- Added stackmap field to struct trace_array (per-instance support)

The stack_id event is committed unconditionally (no filter check)
since it is a synthetic side-event tied to the parent event which
was already subject to filtering.

Fallback behavior: if stackmap returns an error (pool exhausted or
resetting), the full stack trace is recorded as before.

Usage:
  echo 1 > /sys/kernel/debug/tracing/options/stackmap
  echo 1 > /sys/kernel/debug/tracing/options/stacktrace

Signed-off-by: Pengfei Li <lipengfei28@xiaomi.com>
---
 kernel/trace/trace.c         | 46 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h         | 16 +++++++++++++
 kernel/trace/trace_entries.h | 15 ++++++++++++
 kernel/trace/trace_output.c  | 23 ++++++++++++++++++
 4 files changed, 100 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6eb4d3097a4d..c72cb8491217 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -57,6 +57,7 @@
 
 #include "trace.h"
 #include "trace_output.h"
+#include "trace_stackmap.h"
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 /*
@@ -2184,6 +2185,37 @@ void __ftrace_trace_stack(struct trace_array *tr,
 	}
 #endif
 
+#ifdef CONFIG_FTRACE_STACKMAP
+	/*
+	 * If stackmap dedup is enabled, try to store only the stack_id
+	 * in the ring buffer instead of the full stack trace.
+	 */
+	if (tr->trace_flags & TRACE_ITER_STACKMAP) {
+		struct stack_id_entry *sid_entry;
+		int sid;
+
+		sid = ftrace_stackmap_get_id(tr->stackmap, fstack->calls, nr_entries);
+		if (sid >= 0) {
+			event = __trace_buffer_lock_reserve(buffer,
+					TRACE_STACK_ID,
+					sizeof(*sid_entry), trace_ctx);
+			if (!event)
+				goto out;
+			sid_entry = ring_buffer_event_data(event);
+			sid_entry->stack_id = sid;
+			/*
+			 * stack_id is a synthetic side-event attached to a
+			 * primary trace event that was already subject to
+			 * filtering. No per-event filter is defined for
+			 * TRACE_STACK_ID, so commit unconditionally.
+			 */
+			__buffer_unlock_commit(buffer, event);
+			goto out;
+		}
+		/* Fall through to full stack on stackmap failure */
+	}
+#endif
+
 	event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,
 				    struct_size(entry, caller, nr_entries),
 				    trace_ctx);
@@ -9222,6 +9254,20 @@ static __init void tracer_init_tracefs_work_func(struct work_struct *work)
 			NULL, &tracing_dyn_info_fops);
 #endif
 
+#ifdef CONFIG_FTRACE_STACKMAP
+	global_trace.stackmap = ftrace_stackmap_create();
+	if (!IS_ERR(global_trace.stackmap)) {
+		trace_create_file("stack_map", TRACE_MODE_WRITE, NULL,
+				global_trace.stackmap, &ftrace_stackmap_fops);
+		trace_create_file("stack_map_stat", TRACE_MODE_READ, NULL,
+				global_trace.stackmap, &ftrace_stackmap_stat_fops);
+		trace_create_file("stack_map_bin", TRACE_MODE_READ, NULL,
+				global_trace.stackmap, &ftrace_stackmap_bin_fops);
+	} else {
+		pr_warn("ftrace stackmap init failed, dedup disabled\n");
+		global_trace.stackmap = NULL;
+	}
+#endif
 	create_trace_instances(NULL);
 
 	update_tracer_options();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 80fe152af1dd..74f421a89347 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -57,6 +57,7 @@ enum trace_type {
 	TRACE_TIMERLAT,
 	TRACE_RAW_DATA,
 	TRACE_FUNC_REPEATS,
+	TRACE_STACK_ID,
 
 	__TRACE_LAST_TYPE,
 };
@@ -453,6 +454,9 @@ struct trace_array {
 	struct cond_snapshot	*cond_snapshot;
 #endif
 	struct trace_func_repeats	__percpu *last_func_repeats;
+#ifdef CONFIG_FTRACE_STACKMAP
+	struct ftrace_stackmap		*stackmap;
+#endif
 	/*
 	 * On boot up, the ring buffer is set to the minimum size, so that
 	 * we do not waste memory on systems that are not using tracing.
@@ -579,6 +583,8 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct func_repeats_entry,		\
 			  TRACE_FUNC_REPEATS);				\
+		IF_ASSIGN(var, ent, struct stack_id_entry,		\
+			  TRACE_STACK_ID);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -1449,7 +1455,16 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 # define STACK_FLAGS
 #endif
 
+#ifdef CONFIG_FTRACE_STACKMAP
+# define STACKMAP_FLAGS				\
+			C(STACKMAP,		"stackmap"),
+#else
+# define STACKMAP_FLAGS
+# define TRACE_ITER_STACKMAP		0UL
+#endif
+
 #ifdef CONFIG_FUNCTION_PROFILER
+
 # define PROFILER_FLAGS					\
 		C(PROF_TEXT_OFFSET,	"prof-text-offset"),
 # ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -1506,6 +1521,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		FUNCTION_FLAGS					\
 		FGRAPH_FLAGS					\
 		STACK_FLAGS					\
+		STACKMAP_FLAGS					\
 		BRANCH_FLAGS					\
 		PROFILER_FLAGS					\
 		FPROFILE_FLAGS
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 54417468fdeb..89ed14b7e5fd 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -250,6 +250,21 @@ FTRACE_ENTRY(user_stack, userstack_entry,
 		 (void *)__entry->caller[6], (void *)__entry->caller[7])
 );
 
+/*
+ * Stack ID entry - stores only a stack_id referencing the stackmap.
+ * Used when CONFIG_FTRACE_STACKMAP is enabled to deduplicate stacks.
+ */
+FTRACE_ENTRY(stack_id, stack_id_entry,
+
+	TRACE_STACK_ID,
+
+	F_STRUCT(
+		__field(	int,		stack_id	)
+	),
+
+	F_printk("<stack_id %d>", __entry->stack_id)
+);
+
 /*
  * trace_printk entry:
  */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a5ad76175d10..68678ea88159 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1517,6 +1517,28 @@ static struct trace_event trace_user_stack_event = {
 	.funcs		= &trace_user_stack_funcs,
 };
 
+/* TRACE_STACK_ID */
+static enum print_line_t trace_stack_id_print(struct trace_iterator *iter,
+					      int flags, struct trace_event *event)
+{
+	struct stack_id_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+	trace_seq_printf(s, "<stack_id %d>\n", field->stack_id);
+
+	return trace_handle_return(s);
+}
+
+static struct trace_event_functions trace_stack_id_funcs = {
+	.trace		= trace_stack_id_print,
+};
+
+static struct trace_event trace_stack_id_event = {
+	.type		= TRACE_STACK_ID,
+	.funcs		= &trace_stack_id_funcs,
+};
+
 /* TRACE_HWLAT */
 static enum print_line_t
 trace_hwlat_print(struct trace_iterator *iter, int flags,
@@ -1908,6 +1930,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_wake_event,
 	&trace_stack_event,
 	&trace_user_stack_event,
+	&trace_stack_id_event,
 	&trace_bputs_event,
 	&trace_bprint_event,
 	&trace_print_event,
-- 
2.34.1


^ permalink raw reply related

* [RFC PATCH 3/3] trace: add documentation, selftest and tooling for stackmap
From: Li Pengfei @ 2026-05-14  3:49 UTC (permalink / raw)
  To: linux-trace-kernel
  Cc: rostedt, mhiramat, linux-kernel, cmllamas, zhangbo56, lipengfei28
In-Reply-To: <20260514034916.2162517-1-lipengfei28@xiaomi.com>

From: Pengfei Li <lipengfei28@xiaomi.com>

Add supporting files for the ftrace stackmap feature:

Documentation/trace/ftrace-stackmap.rst:
  Comprehensive documentation covering design, usage, tracefs
  interface, binary format, and performance characteristics.

tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc:
  Basic functional selftest that verifies:
  - stackmap tracefs nodes exist
  - enabling stackmap + stacktrace produces stack_id events
  - stack_map_stat shows non-zero hits
  - reset clears entries

tools/tracing/stackmap_dump.py:
  Python script to parse the binary stack_map_bin export.
  Supports offline symbol resolution via addr2line, JSON output,
  and top-N filtering by ref_count.

Signed-off-by: Pengfei Li <lipengfei28@xiaomi.com>
---
 Documentation/trace/ftrace-stackmap.rst       | 111 ++++++++++++++++
 .../ftrace/test.d/ftrace/stackmap-basic.tc    |  74 +++++++++++
 tools/tracing/stackmap_dump.py                | 120 ++++++++++++++++++
 3 files changed, 305 insertions(+)
 create mode 100644 Documentation/trace/ftrace-stackmap.rst
 create mode 100755 tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc
 create mode 100755 tools/tracing/stackmap_dump.py

diff --git a/Documentation/trace/ftrace-stackmap.rst b/Documentation/trace/ftrace-stackmap.rst
new file mode 100644
index 000000000000..8f6410d4258c
--- /dev/null
+++ b/Documentation/trace/ftrace-stackmap.rst
@@ -0,0 +1,111 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+Ftrace Stack Map
+======================
+
+:Author: Pengfei Li <lipengfei28@xiaomi.com>
+
+Overview
+========
+
+The ftrace stack map provides stack trace deduplication for the ftrace
+ring buffer. When enabled, instead of storing full kernel stack traces
+(typically 80-160 bytes each) in the ring buffer for every event, ftrace
+stores only a 4-byte ``stack_id``. The full stacks are maintained in a
+separate hash table and exported via tracefs for userspace to resolve.
+
+This is inspired by eBPF's ``BPF_MAP_TYPE_STACK_TRACE`` but integrated
+into ftrace's infrastructure, requiring no userspace daemon.
+
+Configuration
+=============
+
+Enable ``CONFIG_FTRACE_STACKMAP=y`` in the kernel config.
+
+Kernel command line parameters:
+
+- ``ftrace_stackmap.bits=N`` - Set map capacity to 2^N unique stacks (default: 14, range: 10-20)
+
+Usage
+=====
+
+Enable stack deduplication::
+
+    echo 1 > /sys/kernel/debug/tracing/options/stackmap
+    echo 1 > /sys/kernel/debug/tracing/options/stacktrace
+    echo function > /sys/kernel/debug/tracing/current_tracer
+
+The trace output will show ``<stack_id N>`` instead of full stack traces::
+
+    sh-1234 [006] d.h.. 123.456789: <stack_id 42>
+
+To view the actual stacks::
+
+    cat /sys/kernel/debug/tracing/stack_map
+
+Output format::
+
+    stack_id 42 [ref 1337, depth 8]
+      [0] schedule+0x48/0xc0
+      [1] schedule_timeout+0x1c/0x30
+      ...
+
+To view statistics::
+
+    cat /sys/kernel/debug/tracing/stack_map_stat
+
+Output::
+
+    entries:    2500
+    table_size: 5000
+    hits:       148923
+    drops:      0
+    hit_rate:   98%
+
+To reset the stack map::
+
+    echo 0 > /sys/kernel/debug/tracing/stack_map
+
+Tracefs Nodes
+=============
+
+``stack_map``
+    Text export of all deduplicated stacks with symbol resolution.
+    Writing ``0`` or ``reset`` clears all entries.
+
+``stack_map_stat``
+    Statistics: entry count, hits, drops, and hit rate.
+
+``stack_map_bin``
+    Binary export for efficient userspace consumption. Format:
+
+    - Header (16 bytes): magic(u32) + version(u32) + nr_stacks(u32) + reserved(u32)
+    - Per stack: stack_id(u32) + nr(u32) + ref_count(u32) + reserved(u32) + ips(u64 × nr)
+
+    Magic: ``0x464D5342`` ('FSMB'), Version: 2
+
+Design
+======
+
+The stack map is modeled after ``tracing_map.c`` (used by hist triggers),
+using a lock-free design based on Dr. Cliff Click's non-blocking hash table
+algorithm:
+
+- **Lookup/Insert**: Lock-free via ``cmpxchg``, safe in NMI/IRQ/any context
+- **Memory**: Pre-allocated element pool, zero allocation on the hot path
+  (no GFP_ATOMIC failures under memory pressure)
+- **Collision**: Linear probing with a 2x over-provisioned table
+- **Per-instance**: Each trace_array has its own stackmap, supporting
+  multiple ftrace instances
+- **Hash**: 32-bit jhash of stack IPs; full ``memcmp`` confirms matches
+
+Performance
+===========
+
+Typical results on ARM64 Android device (function tracer, 2 seconds):
+
+- Unique stacks: ~3000
+- Hit rate: 84-98% (depends on workload diversity)
+- Ring buffer savings: ~80% for stack data
+- Overhead per event: ~50ns (one jhash + hash table lookup)
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc b/tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc
new file mode 100755
index 000000000000..3b0a7f60769f
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/stackmap-basic.tc
@@ -0,0 +1,74 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - stackmap basic functionality
+# requires: stack_map options/stackmap
+
+# Test that ftrace stackmap deduplication works:
+# 1. Enable stackmap + stacktrace options
+# 2. Run function tracer briefly
+# 3. Verify stack_map has entries
+# 4. Verify stack_map_stat shows hits
+# 5. Verify trace contains <stack_id> events
+# 6. Verify reset works
+
+fail() {
+    echo "FAIL: $1"
+    exit_fail
+}
+
+disable_tracing
+clear_trace
+
+# Verify stackmap files exist
+test -f stack_map || fail "stack_map file missing"
+test -f stack_map_stat || fail "stack_map_stat file missing"
+test -f stack_map_bin || fail "stack_map_bin file missing"
+
+# Enable stackmap dedup
+echo 1 > options/stackmap
+echo 1 > options/stacktrace
+
+# Run function tracer briefly
+echo function > current_tracer
+enable_tracing
+sleep 1
+disable_tracing
+echo nop > current_tracer
+echo 0 > options/stackmap
+
+# Check stack_map_stat has entries
+entries=$(cat stack_map_stat | grep "^entries:" | awk '{print $2}')
+if [ "$entries" -eq 0 ]; then
+    fail "stackmap has zero entries after tracing"
+fi
+
+# Check hits > 0
+hits=$(cat stack_map_stat | grep "^hits:" | awk '{print $2}')
+if [ "$hits" -eq 0 ]; then
+    fail "stackmap has zero hits"
+fi
+
+# Check drops == 0 (pool should be large enough for 1s trace)
+drops=$(cat stack_map_stat | grep "^drops:" | awk '{print $2}')
+
+# Check stack_map text output is parseable
+first_id=$(cat stack_map | grep "^stack_id" | head -1 | awk '{print $2}')
+if [ -z "$first_id" ]; then
+    fail "stack_map output has no stack_id entries"
+fi
+
+# Check trace has stack_id events
+count=$(cat trace | grep -c "stack_id" || true)
+if [ "$count" -eq 0 ]; then
+    fail "trace has no <stack_id> events"
+fi
+
+# Test reset
+echo 0 > stack_map
+entries_after=$(cat stack_map_stat | grep "^entries:" | awk '{print $2}')
+if [ "$entries_after" -ne 0 ]; then
+    fail "stackmap reset did not clear entries"
+fi
+
+echo "stackmap basic test passed: $entries unique stacks, $hits hits, $drops drops"
+exit 0
diff --git a/tools/tracing/stackmap_dump.py b/tools/tracing/stackmap_dump.py
new file mode 100755
index 000000000000..91ce80c681ea
--- /dev/null
+++ b/tools/tracing/stackmap_dump.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+"""
+stackmap_dump.py - Parse and display ftrace stack_map_bin binary export.
+
+Usage:
+    # Pull from device and parse
+    adb pull /sys/kernel/debug/tracing/stack_map_bin /tmp/stack_map.bin
+    python3 stackmap_dump.py /tmp/stack_map.bin
+
+    # With vmlinux for offline symbol resolution
+    python3 stackmap_dump.py /tmp/stack_map.bin --vmlinux vmlinux
+
+    # JSON output for tooling
+    python3 stackmap_dump.py /tmp/stack_map.bin --json
+"""
+
+import struct
+import sys
+import argparse
+import json
+import subprocess
+
+MAGIC = 0x464D5342  # 'FSMB'
+HEADER_FMT = '<IIII'  # magic, version, nr_stacks, reserved
+ENTRY_FMT = '<IIII'   # stack_id, nr, ref_count, reserved
+HEADER_SIZE = struct.calcsize(HEADER_FMT)
+ENTRY_SIZE = struct.calcsize(ENTRY_FMT)
+
+
+def addr2line(vmlinux, addr):
+    """Resolve address to symbol using addr2line."""
+    try:
+        result = subprocess.run(
+            ['addr2line', '-f', '-e', vmlinux, hex(addr)],
+            capture_output=True, text=True, timeout=5
+        )
+        lines = result.stdout.strip().split('\n')
+        if len(lines) >= 1 and lines[0] != '??':
+            return lines[0]
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        pass
+    return None
+
+
+def parse_stackmap_bin(data):
+    """Parse binary stackmap data, yield (stack_id, ref_count, [ips])."""
+    if len(data) < HEADER_SIZE:
+        raise ValueError("File too small for header")
+
+    magic, version, nr_stacks, _ = struct.unpack_from(HEADER_FMT, data, 0)
+    if magic != MAGIC:
+        raise ValueError(f"Bad magic: 0x{magic:08x}, expected 0x{MAGIC:08x}")
+    if version not in (1, 2):
+        raise ValueError(f"Unsupported version: {version}")
+
+    offset = HEADER_SIZE
+    for _ in range(nr_stacks):
+        if offset + ENTRY_SIZE > len(data):
+            break
+        stack_id, nr, ref_count, _ = struct.unpack_from(ENTRY_FMT, data, offset)
+        offset += ENTRY_SIZE
+
+        ips_size = nr * 8
+        if offset + ips_size > len(data):
+            break
+        ips = struct.unpack_from(f'<{nr}Q', data, offset)
+        offset += ips_size
+
+        yield stack_id, ref_count, list(ips)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Parse ftrace stack_map_bin')
+    parser.add_argument('file', help='Path to stack_map_bin file')
+    parser.add_argument('--vmlinux', help='Path to vmlinux for symbol resolution')
+    parser.add_argument('--json', action='store_true', help='JSON output')
+    parser.add_argument('--top', type=int, default=0,
+                        help='Show only top N stacks by ref_count')
+    args = parser.parse_args()
+
+    with open(args.file, 'rb') as f:
+        data = f.read()
+
+    stacks = list(parse_stackmap_bin(data))
+
+    if args.top > 0:
+        stacks.sort(key=lambda x: x[1], reverse=True)
+        stacks = stacks[:args.top]
+
+    if args.json:
+        output = []
+        for stack_id, ref_count, ips in stacks:
+            entry = {
+                'stack_id': stack_id,
+                'ref_count': ref_count,
+                'ips': [f'0x{ip:x}' for ip in ips]
+            }
+            if args.vmlinux:
+                entry['symbols'] = [addr2line(args.vmlinux, ip) or f'0x{ip:x}'
+                                    for ip in ips]
+            output.append(entry)
+        print(json.dumps(output, indent=2))
+    else:
+        for stack_id, ref_count, ips in stacks:
+            print(f"stack_id {stack_id} [ref {ref_count}, depth {len(ips)}]")
+            for i, ip in enumerate(ips):
+                sym = ''
+                if args.vmlinux:
+                    resolved = addr2line(args.vmlinux, ip)
+                    if resolved:
+                        sym = f' {resolved}'
+                print(f"  [{i}] 0x{ip:x}{sym}")
+            print()
+
+    print(f"Total: {len(stacks)} unique stacks", file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
-- 
2.34.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox