From: Steven Rostedt <rostedt@kernel.org>
To: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>,
Mark Rutland <mark.rutland@arm.com>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
Andrew Morton <akpm@linux-foundation.org>,
Peter Zijlstra <peterz@infradead.org>,
Thomas Gleixner <tglx@linutronix.de>,
Ian Rogers <irogers@google.com>,
Namhyung Kim <namhyung@kernel.org>,
Arnaldo Carvalho de Melo <acme@kernel.org>,
Jiri Olsa <jolsa@kernel.org>,
Douglas Raillard <douglas.raillard@arm.com>
Subject: [POC][RFC][PATCH 2/3] ftrace: Add perf counters to function tracing
Date: Mon, 17 Nov 2025 19:29:52 -0500 [thread overview]
Message-ID: <20251118003531.805883570@kernel.org> (raw)
In-Reply-To: 20251118002950.680329246@kernel.org
From: Steven Rostedt <rostedt@goodmis.org>
Add option to trigger perf events to function tracing.
Two new options are added: func-cpu-cycles and func-cache-misses
# cd /sys/kernel/tracing
# echo 1 > options/func-cache-misses
# echo function > current_tracer
# cat trace
[..]
sshd-session-1014 [005] ..... 327.836708: __x64_sys_read <-do_syscall_64
sshd-session-1014 [005] ..... 327.836708: cache_misses: 741719054
sshd-session-1014 [005] ..... 327.836712: ksys_read <-do_syscall_64
sshd-session-1014 [005] ..... 327.836713: cache_misses: 741720271
sshd-session-1014 [005] ..... 327.836716: fdget_pos <-ksys_read
sshd-session-1014 [005] ..... 327.836717: cache_misses: 741721483
sshd-session-1014 [005] ..... 327.836720: vfs_read <-ksys_read
sshd-session-1014 [005] ..... 327.836721: cache_misses: 741722726
sshd-session-1014 [005] ..... 327.836724: rw_verify_area <-vfs_read
sshd-session-1014 [005] ..... 327.836725: cache_misses: 741723940
sshd-session-1014 [005] ..... 327.836728: security_file_permission <-rw_verify_area
sshd-session-1014 [005] ..... 327.836729: cache_misses: 741725151
The option will cause the perf event to be triggered after every function
called.
If cpu_cycles is also enabled:
# echo 1 > options/func-cpu-cycles
# cat trace
[..]
sshd-session-1014 [005] b..1. 536.844538: preempt_count_sub <-_raw_spin_unlock
sshd-session-1014 [005] b..1. 536.844539: cpu_cycles: 1919425978 cache_misses: 3431216952
sshd-session-1014 [005] b.... 536.844545: validate_xmit_skb_list <-sch_direct_xmit
sshd-session-1014 [005] b.... 536.844545: cpu_cycles: 1919429935 cache_misses: 3431218535
sshd-session-1014 [005] b.... 536.844551: validate_xmit_skb.isra.0 <-validate_xmit_skb_list
sshd-session-1014 [005] b.... 536.844552: cpu_cycles: 1919433763 cache_misses: 3431220112
sshd-session-1014 [005] b.... 536.844557: netif_skb_features <-validate_xmit_skb.isra.0
sshd-session-1014 [005] b.... 536.844558: cpu_cycles: 1919437574 cache_misses: 3431221688
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/linux/trace_recursion.h | 5 +-
kernel/trace/trace.c | 58 ++++++++++++---
kernel/trace/trace.h | 6 ++
kernel/trace/trace_functions.c | 124 ++++++++++++++++++++++++++++++--
4 files changed, 178 insertions(+), 15 deletions(-)
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index ae04054a1be3..c42d86d81afa 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -132,9 +132,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
* will think a recursion occurred, and the event will be dropped.
* Let a single instance happen via the TRANSITION_BIT to
* not drop those events.
+ *
+ * When ip is zero, the caller is purposely trying causing
+ * recursion. Don't record it.
*/
bit = TRACE_CTX_TRANSITION + start;
- if (val & (1 << bit)) {
+ if ((val & (1 << bit)) && ip) {
do_ftrace_record_recursion(ip, pip);
return -1;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 64d966a3ec8b..42bf1c046de1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2915,21 +2915,18 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
#ifdef CONFIG_PERF_EVENTS
-static inline void record_perf_event(struct trace_array *tr,
- struct trace_buffer *buffer,
- unsigned int trace_ctx)
+static inline void trace_perf_event(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ int entries, u64 flags,
+ unsigned int trace_ctx)
{
struct ring_buffer_event *event;
struct perf_event_entry *entry;
- int entries = READ_ONCE(tr->perf_events);
struct trace_array_cpu *data;
u64 *value;
int size;
int cpu;
- if (!entries)
- return;
-
guard(preempt_notrace)();
cpu = smp_processor_id();
@@ -2949,12 +2946,12 @@ static inline void record_perf_event(struct trace_array *tr,
entry = ring_buffer_event_data(event);
value = entry->values;
- if (tr->trace_flags & TRACE_ITER(PERF_CYCLES)) {
+ if (flags & TRACE_ITER(PERF_CYCLES)) {
*value++ = TRACE_PERF_VALUE(PERF_TRACE_CYCLES);
entries--;
}
- if (entries && tr->trace_flags & TRACE_ITER(PERF_CACHE)) {
+ if (entries && flags & TRACE_ITER(PERF_CACHE)) {
*value++ = TRACE_PERF_VALUE(PERF_TRACE_CACHE);
entries--;
}
@@ -2968,6 +2965,49 @@ static inline void record_perf_event(struct trace_array *tr,
local_dec(&data->disabled);
}
+static inline void record_perf_event(struct trace_array *tr,
+ struct trace_buffer *buffer,
+ unsigned int trace_ctx)
+{
+ int entries = READ_ONCE(tr->perf_events);
+
+ if (!entries)
+ return;
+
+ trace_perf_event(tr, buffer, entries, tr->trace_flags, trace_ctx);
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+void ftrace_perf_events(struct trace_array *tr, int perf_events,
+ u64 perf_mask, unsigned int trace_ctx)
+{
+ struct trace_buffer *buffer;
+ int bit;
+
+ /*
+ * Prevent any ftrace recursion.
+ * The ftrace_test_recursion_trylock() allows one nested loop
+ * to handle the case where an interrupt comes in and traces
+ * before the preempt_count is updated to the new context.
+ * This one instance allows that function to still be traced.
+ *
+ * The trace_perf_cache_misses() will call functions that function
+ * tracing will want to trace. Prevent this one loop from happening
+ * by taking the the lock again. If an interrupt comes in now,
+ * it may still be dropped, but there's really nothing that can
+ * be done about that until all those locations get fixed.
+ */
+ bit = ftrace_test_recursion_trylock(0, 0);
+
+ buffer = tr->array_buffer.buffer;
+ trace_perf_event(tr, buffer, perf_events, perf_mask, trace_ctx);
+
+ /* bit < 0 means the trylock failed and does not need to be unlocked */
+ if (bit >= 0)
+ ftrace_test_recursion_unlock(bit);
+}
+#endif
+
static int handle_perf_event(struct trace_array *tr, u64 mask, int enabled)
{
int ret = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 094a156b0c70..bb764a2255c7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -365,6 +365,8 @@ struct trace_array {
int buffer_disabled;
int perf_events;
+ int ftrace_perf_events;
+ u64 ftrace_perf_mask;
struct trace_pid_list __rcu *filtered_pids;
struct trace_pid_list __rcu *filtered_no_pids;
@@ -1402,6 +1404,10 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
u64 do_trace_perf_event(int type);
int trace_perf_event_enable(int type);
void trace_perf_event_disable(int type);
+#ifdef CONFIG_FUNCTION_TRACER
+void ftrace_perf_events(struct trace_array *tr, int perf_events,
+ u64 perf_mask, unsigned int trace_ctx);
+#endif
#else
# define PERF_FLAGS
static inline u64 do_trace_perf_event(int type) { return 0; }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c12795c2fb39..97f46ac7ef21 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,8 +47,12 @@ enum {
TRACE_FUNC_OPT_NO_REPEATS = 0x2,
TRACE_FUNC_OPT_ARGS = 0x4,
- /* Update this to next highest bit. */
- TRACE_FUNC_OPT_HIGHEST_BIT = 0x8
+ /* Update this to next highest function bit. */
+ TRACE_FUNC_OPT_HIGHEST_BIT = 0x8,
+
+ /* These are just other options */
+ TRACE_FUNC_OPT_PERF_CYCLES = 0x10,
+ TRACE_FUNC_OPT_PERF_CACHE = 0x20,
};
#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1)
@@ -143,6 +147,105 @@ static bool handle_func_repeats(struct trace_array *tr, u32 flags_val)
return true;
}
+#ifdef CONFIG_PERF_EVENTS
+static inline void
+do_trace_function(struct trace_array *tr, unsigned long ip,
+ unsigned long parent_ip, unsigned int trace_ctx,
+ struct ftrace_regs *fregs)
+{
+ trace_function(tr, ip, parent_ip, trace_ctx, fregs);
+
+ if (likely(!tr->ftrace_perf_events))
+ return;
+
+ ftrace_perf_events(tr, tr->ftrace_perf_events, tr->ftrace_perf_mask, trace_ctx);
+}
+
+static bool handle_perf_event_flag(struct trace_array *tr, int bit, int set, int *err)
+{
+ u64 mask;
+ int type;
+
+ *err = 0;
+
+ switch (bit) {
+ case TRACE_FUNC_OPT_PERF_CYCLES:
+ mask = TRACE_ITER(PERF_CYCLES);
+ type = PERF_TRACE_CYCLES;
+ break;
+
+ case TRACE_FUNC_OPT_PERF_CACHE:
+ mask = TRACE_ITER(PERF_CACHE);
+ type = PERF_TRACE_CACHE;
+ break;
+
+ default:
+ return 0;
+ }
+
+ if (set)
+ *err = trace_perf_event_enable(type);
+ else
+ trace_perf_event_disable(type);
+
+ if (*err < 0)
+ return 1;
+
+ if (set) {
+ tr->ftrace_perf_events++;
+ tr->ftrace_perf_mask |= mask;
+ } else {
+ tr->ftrace_perf_mask &= ~mask;
+ tr->ftrace_perf_events--;
+ }
+ return 1;
+}
+
+static void ftrace_perf_enable(struct trace_array *tr, int bit)
+{
+ int err;
+
+ if (!(tr->current_trace_flags->val & bit))
+ return;
+
+ handle_perf_event_flag(tr, bit, 1, &err);
+ if (err < 0)
+ tr->current_trace_flags->val &= ~bit;
+}
+
+static void ftrace_perf_disable(struct trace_array *tr, int bit)
+{
+ int err;
+
+ /* Only disable if it was enabled */
+ if (!(tr->current_trace_flags->val & bit))
+ return;
+
+ handle_perf_event_flag(tr, bit, 0, &err);
+}
+
+static void ftrace_perf_init(struct trace_array *tr)
+{
+ ftrace_perf_enable(tr, TRACE_FUNC_OPT_PERF_CYCLES);
+ ftrace_perf_enable(tr, TRACE_FUNC_OPT_PERF_CACHE);
+}
+
+static void ftrace_perf_reset(struct trace_array *tr)
+{
+ ftrace_perf_disable(tr, TRACE_FUNC_OPT_PERF_CYCLES);
+ ftrace_perf_disable(tr, TRACE_FUNC_OPT_PERF_CACHE);
+}
+#else
+#define do_trace_function trace_function
+static inline bool handle_perf_event_flag(struct trace_array *tr, int bit,
+ int set, int *err)
+{
+ return 0;
+}
+static inline void ftrace_perf_init(struct trace_array *tr) { }
+static inline void ftrace_perf_reset(struct trace_array *tr) { }
+#endif /* CONFIG_PERF_EVENTS */
+
static int function_trace_init(struct trace_array *tr)
{
ftrace_func_t func;
@@ -165,6 +268,8 @@ static int function_trace_init(struct trace_array *tr)
tr->array_buffer.cpu = raw_smp_processor_id();
+ ftrace_perf_init(tr);
+
tracing_start_cmdline_record();
tracing_start_function_trace(tr);
return 0;
@@ -172,6 +277,7 @@ static int function_trace_init(struct trace_array *tr)
static void function_trace_reset(struct trace_array *tr)
{
+ ftrace_perf_reset(tr);
tracing_stop_function_trace(tr);
tracing_stop_cmdline_record();
ftrace_reset_array_ops(tr);
@@ -223,7 +329,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_dec();
- trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+ do_trace_function(tr, ip, parent_ip, trace_ctx, NULL);
ftrace_test_recursion_unlock(bit);
}
@@ -245,7 +351,7 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx();
- trace_function(tr, ip, parent_ip, trace_ctx, fregs);
+ do_trace_function(tr, ip, parent_ip, trace_ctx, fregs);
ftrace_test_recursion_unlock(bit);
}
@@ -372,7 +478,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
trace_ctx = tracing_gen_ctx_dec();
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
- trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+ do_trace_function(tr, ip, parent_ip, trace_ctx, NULL);
out:
ftrace_test_recursion_unlock(bit);
@@ -428,6 +534,10 @@ static struct tracer_opt func_opts[] = {
{ TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
#ifdef CONFIG_FUNCTION_TRACE_ARGS
{ TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) },
+#endif
+#if CONFIG_PERF_EVENTS
+ { TRACER_OPT(func-cpu-cycles, TRACE_FUNC_OPT_PERF_CYCLES) },
+ { TRACER_OPT(func-cache-misses, TRACE_FUNC_OPT_PERF_CACHE) },
#endif
{ } /* Always set a last empty entry */
};
@@ -457,6 +567,7 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
ftrace_func_t func;
u32 new_flags;
+ int err;
/* Do nothing if already set. */
if (!!set == !!(tr->current_trace_flags->val & bit))
@@ -466,6 +577,9 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (tr->current_trace != &function_trace)
return 0;
+ if (handle_perf_event_flag(tr, bit, set, &err))
+ return err;
+
new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0);
func = select_trace_function(new_flags);
if (!func)
--
2.51.0
next prev parent reply other threads:[~2025-11-18 0:35 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-18 0:29 [POC][RFC][PATCH 0/3] tracing: Add perf events to trace buffer Steven Rostedt
2025-11-18 0:29 ` [POC][RFC][PATCH 1/3] tracing: Add perf events Steven Rostedt
2025-11-18 8:35 ` Peter Zijlstra
2025-11-18 13:42 ` Steven Rostedt
2025-11-18 20:24 ` Steven Rostedt
2025-11-20 9:01 ` kernel test robot
2025-11-20 9:23 ` kernel test robot
2025-11-18 0:29 ` Steven Rostedt [this message]
2025-11-20 11:39 ` [POC][RFC][PATCH 2/3] ftrace: Add perf counters to function tracing kernel test robot
2025-11-18 0:29 ` [POC][RFC][PATCH 3/3] fgraph: Add perf counters to function graph tracer Steven Rostedt
2025-11-18 3:08 ` [POC][RFC][PATCH 0/3] tracing: Add perf events to trace buffer Masami Hiramatsu
2025-11-18 3:42 ` Steven Rostedt
2025-11-18 8:11 ` Masami Hiramatsu
2025-11-18 13:53 ` Steven Rostedt
2025-11-18 13:57 ` Steven Rostedt
2025-11-18 16:31 ` Steven Rostedt
2025-11-18 7:25 ` Namhyung Kim
2025-11-18 16:24 ` Steven Rostedt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251118003531.805883570@kernel.org \
--to=rostedt@kernel.org \
--cc=acme@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=douglas.raillard@arm.com \
--cc=irogers@google.com \
--cc=jolsa@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=mark.rutland@arm.com \
--cc=mathieu.desnoyers@efficios.com \
--cc=mhiramat@kernel.org \
--cc=namhyung@kernel.org \
--cc=peterz@infradead.org \
--cc=tglx@linutronix.de \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.