All of lore.kernel.org
 help / color / mirror / Atom feed
From: Steven Rostedt <rostedt@kernel.org>
To: linux-kernel@vger.kernel.org, linux-trace-kernel@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ian Rogers <irogers@google.com>,
	Namhyung Kim <namhyung@kernel.org>,
	Arnaldo Carvalho de Melo <acme@kernel.org>,
	Jiri Olsa <jolsa@kernel.org>,
	Douglas Raillard <douglas.raillard@arm.com>
Subject: [POC][RFC][PATCH 2/3] ftrace: Add perf counters to function tracing
Date: Mon, 17 Nov 2025 19:29:52 -0500	[thread overview]
Message-ID: <20251118003531.805883570@kernel.org> (raw)
In-Reply-To: 20251118002950.680329246@kernel.org

From: Steven Rostedt <rostedt@goodmis.org>

Add option to trigger perf events to function tracing.

Two new options are added: func-cpu-cycles and func-cache-misses

 # cd /sys/kernel/tracing
 # echo 1 > options/func-cache-misses
 # echo function > current_tracer
 # cat trace
[..]
    sshd-session-1014    [005] .....   327.836708: __x64_sys_read <-do_syscall_64
    sshd-session-1014    [005] .....   327.836708: cache_misses: 741719054
    sshd-session-1014    [005] .....   327.836712: ksys_read <-do_syscall_64
    sshd-session-1014    [005] .....   327.836713: cache_misses: 741720271
    sshd-session-1014    [005] .....   327.836716: fdget_pos <-ksys_read
    sshd-session-1014    [005] .....   327.836717: cache_misses: 741721483
    sshd-session-1014    [005] .....   327.836720: vfs_read <-ksys_read
    sshd-session-1014    [005] .....   327.836721: cache_misses: 741722726
    sshd-session-1014    [005] .....   327.836724: rw_verify_area <-vfs_read
    sshd-session-1014    [005] .....   327.836725: cache_misses: 741723940
    sshd-session-1014    [005] .....   327.836728: security_file_permission <-rw_verify_area
    sshd-session-1014    [005] .....   327.836729: cache_misses: 741725151

The option will cause the perf event to be triggered after every function
called.

If cpu_cycles is also enabled:

 # echo 1 > options/func-cpu-cycles
 # cat trace
[..]
    sshd-session-1014    [005] b..1.   536.844538: preempt_count_sub <-_raw_spin_unlock
    sshd-session-1014    [005] b..1.   536.844539: cpu_cycles: 1919425978 cache_misses: 3431216952
    sshd-session-1014    [005] b....   536.844545: validate_xmit_skb_list <-sch_direct_xmit
    sshd-session-1014    [005] b....   536.844545: cpu_cycles: 1919429935 cache_misses: 3431218535
    sshd-session-1014    [005] b....   536.844551: validate_xmit_skb.isra.0 <-validate_xmit_skb_list
    sshd-session-1014    [005] b....   536.844552: cpu_cycles: 1919433763 cache_misses: 3431220112
    sshd-session-1014    [005] b....   536.844557: netif_skb_features <-validate_xmit_skb.isra.0
    sshd-session-1014    [005] b....   536.844558: cpu_cycles: 1919437574 cache_misses: 3431221688

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_recursion.h |   5 +-
 kernel/trace/trace.c            |  58 ++++++++++++---
 kernel/trace/trace.h            |   6 ++
 kernel/trace/trace_functions.c  | 124 ++++++++++++++++++++++++++++++--
 4 files changed, 178 insertions(+), 15 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index ae04054a1be3..c42d86d81afa 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -132,9 +132,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign
 		 * will think a recursion occurred, and the event will be dropped.
 		 * Let a single instance happen via the TRANSITION_BIT to
 		 * not drop those events.
+		 *
+		 * When ip is zero, the caller is purposely trying causing
+		 * recursion. Don't record it.
 		 */
 		bit = TRACE_CTX_TRANSITION + start;
-		if (val & (1 << bit)) {
+		if ((val & (1 << bit)) && ip) {
 			do_ftrace_record_recursion(ip, pip);
 			return -1;
 		}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 64d966a3ec8b..42bf1c046de1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2915,21 +2915,18 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
 EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
 
 #ifdef CONFIG_PERF_EVENTS
-static inline void record_perf_event(struct trace_array *tr,
-				     struct trace_buffer *buffer,
-				     unsigned int trace_ctx)
+static inline void trace_perf_event(struct trace_array *tr,
+				    struct trace_buffer *buffer,
+				    int entries, u64 flags,
+				    unsigned int trace_ctx)
 {
 	struct ring_buffer_event *event;
 	struct perf_event_entry *entry;
-	int entries = READ_ONCE(tr->perf_events);
 	struct trace_array_cpu *data;
 	u64 *value;
 	int size;
 	int cpu;
 
-	if (!entries)
-		return;
-
 	guard(preempt_notrace)();
 	cpu = smp_processor_id();
 
@@ -2949,12 +2946,12 @@ static inline void record_perf_event(struct trace_array *tr,
 	entry			= ring_buffer_event_data(event);
 	value			= entry->values;
 
-	if (tr->trace_flags & TRACE_ITER(PERF_CYCLES)) {
+	if (flags & TRACE_ITER(PERF_CYCLES)) {
 		*value++ = TRACE_PERF_VALUE(PERF_TRACE_CYCLES);
 		entries--;
 	}
 
-	if (entries && tr->trace_flags & TRACE_ITER(PERF_CACHE)) {
+	if (entries && flags & TRACE_ITER(PERF_CACHE)) {
 		*value++ = TRACE_PERF_VALUE(PERF_TRACE_CACHE);
 		entries--;
 	}
@@ -2968,6 +2965,49 @@ static inline void record_perf_event(struct trace_array *tr,
 	local_dec(&data->disabled);
 }
 
+static inline void record_perf_event(struct trace_array *tr,
+				     struct trace_buffer *buffer,
+				     unsigned int trace_ctx)
+{
+	int entries = READ_ONCE(tr->perf_events);
+
+	if (!entries)
+		return;
+
+	trace_perf_event(tr, buffer, entries, tr->trace_flags, trace_ctx);
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+void ftrace_perf_events(struct trace_array *tr, int perf_events,
+			u64 perf_mask, unsigned int trace_ctx)
+{
+	struct trace_buffer *buffer;
+	int bit;
+
+	/*
+	 * Prevent any ftrace recursion.
+	 * The ftrace_test_recursion_trylock() allows one nested loop
+	 * to handle the case where an interrupt comes in and traces
+	 * before the preempt_count is updated to the new context.
+	 * This one instance allows that function to still be traced.
+	 *
+	 * The trace_perf_cache_misses() will call functions that function
+	 * tracing will want to trace. Prevent this one loop from happening
+	 * by taking the the lock again. If an interrupt comes in now,
+	 * it may still be dropped, but there's really nothing that can
+	 * be done about that until all those locations get fixed.
+	 */
+	bit = ftrace_test_recursion_trylock(0, 0);
+
+	buffer = tr->array_buffer.buffer;
+	trace_perf_event(tr, buffer, perf_events, perf_mask, trace_ctx);
+
+	/* bit < 0 means the trylock failed and does not need to be unlocked */
+	if (bit >= 0)
+		ftrace_test_recursion_unlock(bit);
+}
+#endif
+
 static int handle_perf_event(struct trace_array *tr, u64 mask, int enabled)
 {
 	int ret = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 094a156b0c70..bb764a2255c7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -365,6 +365,8 @@ struct trace_array {
 	int			buffer_disabled;
 
 	int			perf_events;
+	int			ftrace_perf_events;
+	u64			ftrace_perf_mask;
 
 	struct trace_pid_list	__rcu *filtered_pids;
 	struct trace_pid_list	__rcu *filtered_no_pids;
@@ -1402,6 +1404,10 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 u64 do_trace_perf_event(int type);
 int trace_perf_event_enable(int type);
 void trace_perf_event_disable(int type);
+#ifdef CONFIG_FUNCTION_TRACER
+void ftrace_perf_events(struct trace_array *tr, int perf_events,
+			u64 perf_mask, unsigned int trace_ctx);
+#endif
 #else
 # define PERF_FLAGS
 static inline u64 do_trace_perf_event(int type) { return 0; }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c12795c2fb39..97f46ac7ef21 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,8 +47,12 @@ enum {
 	TRACE_FUNC_OPT_NO_REPEATS	= 0x2,
 	TRACE_FUNC_OPT_ARGS		= 0x4,
 
-	/* Update this to next highest bit. */
-	TRACE_FUNC_OPT_HIGHEST_BIT	= 0x8
+	/* Update this to next highest function bit. */
+	TRACE_FUNC_OPT_HIGHEST_BIT	= 0x8,
+
+	/* These are just other options */
+	TRACE_FUNC_OPT_PERF_CYCLES	= 0x10,
+	TRACE_FUNC_OPT_PERF_CACHE	= 0x20,
 };
 
 #define TRACE_FUNC_OPT_MASK	(TRACE_FUNC_OPT_HIGHEST_BIT - 1)
@@ -143,6 +147,105 @@ static bool handle_func_repeats(struct trace_array *tr, u32 flags_val)
 	return true;
 }
 
+#ifdef CONFIG_PERF_EVENTS
+static inline void
+do_trace_function(struct trace_array *tr, unsigned long ip,
+		  unsigned long parent_ip, unsigned int trace_ctx,
+		  struct ftrace_regs *fregs)
+{
+	trace_function(tr, ip, parent_ip, trace_ctx, fregs);
+
+	if (likely(!tr->ftrace_perf_events))
+		return;
+
+	ftrace_perf_events(tr, tr->ftrace_perf_events, tr->ftrace_perf_mask, trace_ctx);
+}
+
+static bool handle_perf_event_flag(struct trace_array *tr, int bit, int set, int *err)
+{
+	u64 mask;
+	int type;
+
+	*err = 0;
+
+	switch (bit) {
+	case TRACE_FUNC_OPT_PERF_CYCLES:
+		mask = TRACE_ITER(PERF_CYCLES);
+		type = PERF_TRACE_CYCLES;
+		break;
+
+	case TRACE_FUNC_OPT_PERF_CACHE:
+		mask = TRACE_ITER(PERF_CACHE);
+		type = PERF_TRACE_CACHE;
+		break;
+
+	default:
+		return 0;
+	}
+
+	if (set)
+		*err = trace_perf_event_enable(type);
+	else
+		trace_perf_event_disable(type);
+
+	if (*err < 0)
+		return 1;
+
+	if (set) {
+		tr->ftrace_perf_events++;
+		tr->ftrace_perf_mask |= mask;
+	} else {
+		tr->ftrace_perf_mask &= ~mask;
+		tr->ftrace_perf_events--;
+	}
+	return 1;
+}
+
+static void ftrace_perf_enable(struct trace_array *tr, int bit)
+{
+	int err;
+
+	if (!(tr->current_trace_flags->val & bit))
+		return;
+
+	handle_perf_event_flag(tr, bit, 1, &err);
+	if (err < 0)
+		tr->current_trace_flags->val &= ~bit;
+}
+
+static void ftrace_perf_disable(struct trace_array *tr, int bit)
+{
+	int err;
+
+	/* Only disable if it was enabled */
+	if (!(tr->current_trace_flags->val & bit))
+		return;
+
+	handle_perf_event_flag(tr, bit, 0, &err);
+}
+
+static void ftrace_perf_init(struct trace_array *tr)
+{
+	ftrace_perf_enable(tr, TRACE_FUNC_OPT_PERF_CYCLES);
+	ftrace_perf_enable(tr, TRACE_FUNC_OPT_PERF_CACHE);
+}
+
+static void ftrace_perf_reset(struct trace_array *tr)
+{
+	ftrace_perf_disable(tr, TRACE_FUNC_OPT_PERF_CYCLES);
+	ftrace_perf_disable(tr, TRACE_FUNC_OPT_PERF_CACHE);
+}
+#else
+#define do_trace_function trace_function
+static inline bool handle_perf_event_flag(struct trace_array *tr, int bit,
+					  int set, int *err)
+{
+	return 0;
+}
+static inline void ftrace_perf_init(struct trace_array *tr) { }
+static inline void ftrace_perf_reset(struct trace_array *tr) { }
+#endif /* CONFIG_PERF_EVENTS */
+
 static int function_trace_init(struct trace_array *tr)
 {
 	ftrace_func_t func;
@@ -165,6 +268,8 @@ static int function_trace_init(struct trace_array *tr)
 
 	tr->array_buffer.cpu = raw_smp_processor_id();
 
+	ftrace_perf_init(tr);
+
 	tracing_start_cmdline_record();
 	tracing_start_function_trace(tr);
 	return 0;
@@ -172,6 +277,7 @@ static int function_trace_init(struct trace_array *tr)
 
 static void function_trace_reset(struct trace_array *tr)
 {
+	ftrace_perf_reset(tr);
 	tracing_stop_function_trace(tr);
 	tracing_stop_cmdline_record();
 	ftrace_reset_array_ops(tr);
@@ -223,7 +329,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 
 	trace_ctx = tracing_gen_ctx_dec();
 
-	trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+	do_trace_function(tr, ip, parent_ip, trace_ctx, NULL);
 
 	ftrace_test_recursion_unlock(bit);
 }
@@ -245,7 +351,7 @@ function_args_trace_call(unsigned long ip, unsigned long parent_ip,
 
 	trace_ctx = tracing_gen_ctx();
 
-	trace_function(tr, ip, parent_ip, trace_ctx, fregs);
+	do_trace_function(tr, ip, parent_ip, trace_ctx, fregs);
 
 	ftrace_test_recursion_unlock(bit);
 }
@@ -372,7 +478,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
 	trace_ctx = tracing_gen_ctx_dec();
 	process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
 
-	trace_function(tr, ip, parent_ip, trace_ctx, NULL);
+	do_trace_function(tr, ip, parent_ip, trace_ctx, NULL);
 
 out:
 	ftrace_test_recursion_unlock(bit);
@@ -428,6 +534,10 @@ static struct tracer_opt func_opts[] = {
 	{ TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) },
 #ifdef CONFIG_FUNCTION_TRACE_ARGS
 	{ TRACER_OPT(func-args, TRACE_FUNC_OPT_ARGS) },
+#endif
+#if CONFIG_PERF_EVENTS
+	{ TRACER_OPT(func-cpu-cycles, TRACE_FUNC_OPT_PERF_CYCLES) },
+	{ TRACER_OPT(func-cache-misses, TRACE_FUNC_OPT_PERF_CACHE) },
 #endif
 	{ } /* Always set a last empty entry */
 };
@@ -457,6 +567,7 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
 {
 	ftrace_func_t func;
 	u32 new_flags;
+	int err;
 
 	/* Do nothing if already set. */
 	if (!!set == !!(tr->current_trace_flags->val & bit))
@@ -466,6 +577,9 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
 	if (tr->current_trace != &function_trace)
 		return 0;
 
+	if (handle_perf_event_flag(tr, bit, set, &err))
+		return err;
+
 	new_flags = (tr->current_trace_flags->val & ~bit) | (set ? bit : 0);
 	func = select_trace_function(new_flags);
 	if (!func)
-- 
2.51.0



  parent reply	other threads:[~2025-11-18  0:35 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-18  0:29 [POC][RFC][PATCH 0/3] tracing: Add perf events to trace buffer Steven Rostedt
2025-11-18  0:29 ` [POC][RFC][PATCH 1/3] tracing: Add perf events Steven Rostedt
2025-11-18  8:35   ` Peter Zijlstra
2025-11-18 13:42     ` Steven Rostedt
2025-11-18 20:24       ` Steven Rostedt
2025-11-20  9:01   ` kernel test robot
2025-11-20  9:23   ` kernel test robot
2025-11-18  0:29 ` Steven Rostedt [this message]
2025-11-20 11:39   ` [POC][RFC][PATCH 2/3] ftrace: Add perf counters to function tracing kernel test robot
2025-11-18  0:29 ` [POC][RFC][PATCH 3/3] fgraph: Add perf counters to function graph tracer Steven Rostedt
2025-11-18  3:08 ` [POC][RFC][PATCH 0/3] tracing: Add perf events to trace buffer Masami Hiramatsu
2025-11-18  3:42   ` Steven Rostedt
2025-11-18  8:11     ` Masami Hiramatsu
2025-11-18 13:53       ` Steven Rostedt
2025-11-18 13:57         ` Steven Rostedt
2025-11-18 16:31       ` Steven Rostedt
2025-11-18  7:25 ` Namhyung Kim
2025-11-18 16:24   ` Steven Rostedt

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251118003531.805883570@kernel.org \
    --to=rostedt@kernel.org \
    --cc=acme@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=douglas.raillard@arm.com \
    --cc=irogers@google.com \
    --cc=jolsa@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mhiramat@kernel.org \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.