* [for-next][PATCH 01/10] tracing: Declare system call tracepoints with TRACE_EVENT_SYSCALL
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Michael Jeanson, Peter Zijlstra, Alexei Starovoitov,
Yonghong Song, Paul E. McKenney, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim,
Andrii Nakryiko, bpf, Joel Fernandes
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
In preparation for allowing system call tracepoints to handle page
faults, introduce TRACE_EVENT_SYSCALL to declare the sys_enter/sys_exit
tracepoints.
Move the common code between __DECLARE_TRACE and __DECLARE_TRACE_SYSCALL
into __DECLARE_TRACE_COMMON.
This change is not meant to alter the generated code; it only prepares
for the modifications that follow.
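For illustration, with tracepoints enabled, TRACE_EVENT_SYSCALL(sys_enter, ...)
reaches __DECLARE_TRACE_SYSCALL, which expands (roughly) to:
    __DECLARE_TRACE_COMMON(sys_enter, ...);  /* register/unregister helpers */
    static inline void trace_sys_enter(struct pt_regs *regs, long id)
    {
            if (static_branch_unlikely(&__tracepoint_sys_enter.key))
                    __DO_TRACE(sys_enter, TP_ARGS(regs, id),
                               TP_CONDITION(cpu_online(raw_smp_processor_id())), 1);
            /* ... LOCKDEP rcu_is_watching() check elided ... */
    }
This is identical to a regular tracepoint except for the trailing syscall
flag passed to __DO_TRACE (1 instead of 0), which remains unused until the
following patches.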
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-2-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/linux/tracepoint.h | 53 +++++++++++++++++++++++++--------
include/trace/bpf_probe.h | 3 ++
include/trace/define_trace.h | 5 ++++
include/trace/events/syscalls.h | 4 +--
include/trace/perf.h | 3 ++
include/trace/trace_events.h | 28 +++++++++++++++++
6 files changed, 81 insertions(+), 15 deletions(-)
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 3d33b9872cec..76e441b39a96 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -197,7 +197,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
* it_func[0] is never NULL because there is at least one element in the array
* when the array itself is non NULL.
*/
-#define __DO_TRACE(name, args, cond) \
+#define __DO_TRACE(name, args, cond, syscall) \
do { \
int __maybe_unused __idx = 0; \
\
@@ -222,21 +222,10 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
* site if it is not watching, as it will need to be active when the
* tracepoint is enabled.
*/
-#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
+#define __DECLARE_TRACE_COMMON(name, proto, args, cond, data_proto) \
extern int __traceiter_##name(data_proto); \
DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \
extern struct tracepoint __tracepoint_##name; \
- static inline void trace_##name(proto) \
- { \
- if (static_branch_unlikely(&__tracepoint_##name.key)) \
- __DO_TRACE(name, \
- TP_ARGS(args), \
- TP_CONDITION(cond)); \
- if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
- WARN_ONCE(!rcu_is_watching(), \
- "RCU not watching for tracepoint"); \
- } \
- } \
static inline int \
register_trace_##name(void (*probe)(data_proto), void *data) \
{ \
@@ -266,6 +255,34 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
return static_branch_unlikely(&__tracepoint_##name.key);\
}
+#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \
+ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+ static inline void trace_##name(proto) \
+ { \
+ if (static_branch_unlikely(&__tracepoint_##name.key)) \
+ __DO_TRACE(name, \
+ TP_ARGS(args), \
+ TP_CONDITION(cond), 0); \
+ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
+ WARN_ONCE(!rcu_is_watching(), \
+ "RCU not watching for tracepoint"); \
+ } \
+ }
+
+#define __DECLARE_TRACE_SYSCALL(name, proto, args, cond, data_proto) \
+ __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), cond, PARAMS(data_proto)) \
+ static inline void trace_##name(proto) \
+ { \
+ if (static_branch_unlikely(&__tracepoint_##name.key)) \
+ __DO_TRACE(name, \
+ TP_ARGS(args), \
+ TP_CONDITION(cond), 1); \
+ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
+ WARN_ONCE(!rcu_is_watching(), \
+ "RCU not watching for tracepoint"); \
+ } \
+ }
+
/*
* We have no guarantee that gcc and the linker won't up-align the tracepoint
* structures, so we create an array of pointers that will be used for iteration
@@ -348,6 +365,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
return false; \
}
+#define __DECLARE_TRACE_SYSCALL __DECLARE_TRACE
+
#define DEFINE_TRACE_FN(name, reg, unreg, proto, args)
#define DEFINE_TRACE(name, proto, args)
#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
@@ -409,6 +428,11 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \
PARAMS(void *__data, proto))
+#define DECLARE_TRACE_SYSCALL(name, proto, args) \
+ __DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args), \
+ cpu_online(raw_smp_processor_id()), \
+ PARAMS(void *__data, proto))
+
#define TRACE_EVENT_FLAGS(event, flag)
#define TRACE_EVENT_PERF_PERM(event, expr...)
@@ -546,6 +570,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
struct, assign, print) \
DECLARE_TRACE_CONDITION(name, PARAMS(proto), \
PARAMS(args), PARAMS(cond))
+#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, \
+ print, reg, unreg) \
+ DECLARE_TRACE_SYSCALL(name, PARAMS(proto), PARAMS(args))
#define TRACE_EVENT_FLAGS(event, flag)
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index a2ea11cc912e..c85bbce5aaa5 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -53,6 +53,9 @@ __bpf_trace_##call(void *__data, proto) \
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args))
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
/*
* This part is compiled out, it is only here as a build time check
* to make sure that if the tracepoint handling changes, the
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 00723935dcc7..ff5fa17a6259 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -46,6 +46,10 @@
assign, print, reg, unreg) \
DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
+#undef TRACE_EVENT_SYSCALL
+#define TRACE_EVENT_SYSCALL(name, proto, args, struct, assign, print, reg, unreg) \
+ DEFINE_TRACE_FN(name, reg, unreg, PARAMS(proto), PARAMS(args))
+
#undef TRACE_EVENT_NOP
#define TRACE_EVENT_NOP(name, proto, args, struct, assign, print)
@@ -107,6 +111,7 @@
#undef TRACE_EVENT
#undef TRACE_EVENT_FN
#undef TRACE_EVENT_FN_COND
+#undef TRACE_EVENT_SYSCALL
#undef TRACE_EVENT_CONDITION
#undef TRACE_EVENT_NOP
#undef DEFINE_EVENT_NOP
diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h
index b6e0cbc2c71f..f31ff446b468 100644
--- a/include/trace/events/syscalls.h
+++ b/include/trace/events/syscalls.h
@@ -15,7 +15,7 @@
#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
-TRACE_EVENT_FN(sys_enter,
+TRACE_EVENT_SYSCALL(sys_enter,
TP_PROTO(struct pt_regs *regs, long id),
@@ -41,7 +41,7 @@ TRACE_EVENT_FN(sys_enter,
TRACE_EVENT_FLAGS(sys_enter, TRACE_EVENT_FL_CAP_ANY)
-TRACE_EVENT_FN(sys_exit,
+TRACE_EVENT_SYSCALL(sys_exit,
TP_PROTO(struct pt_regs *regs, long ret),
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 2c11181c82e0..ded997af481e 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -55,6 +55,9 @@ perf_trace_##call(void *__data, proto) \
head, __task); \
}
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
/*
* This part is compiled out, it is only here as a build time check
* to make sure that if the tracepoint handling changes, the
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index c2f9cabf154d..8bcbb9ee44de 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -45,6 +45,16 @@
PARAMS(print)); \
DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
+#undef TRACE_EVENT_SYSCALL
+#define TRACE_EVENT_SYSCALL(name, proto, args, tstruct, assign, print, reg, unreg) \
+ DECLARE_EVENT_SYSCALL_CLASS(name, \
+ PARAMS(proto), \
+ PARAMS(args), \
+ PARAMS(tstruct), \
+ PARAMS(assign), \
+ PARAMS(print)); \
+ DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));
+
#include "stages/stage1_struct_define.h"
#undef DECLARE_EVENT_CLASS
@@ -57,6 +67,9 @@
\
static struct trace_event_class event_class_##name;
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, name, proto, args) \
static struct trace_event_call __used \
@@ -117,6 +130,9 @@
tstruct; \
};
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, name, proto, args)
@@ -208,6 +224,9 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \
.trace = trace_raw_output_##call, \
};
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
#undef DEFINE_EVENT_PRINT
#define DEFINE_EVENT_PRINT(template, call, proto, args, print) \
static notrace enum print_line_t \
@@ -265,6 +284,9 @@ static inline notrace int trace_event_get_offsets_##call( \
return __data_size; \
}
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
/*
@@ -409,6 +431,9 @@ trace_event_raw_event_##call(void *__data, proto) \
* fail to compile unless it too is updated.
*/
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args) \
static inline void ftrace_test_probe_##call(void) \
@@ -434,6 +459,9 @@ static struct trace_event_class __used __refdata event_class_##call = { \
_TRACE_PERF_INIT(call) \
};
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args) \
\
--
2.45.2
* [for-next][PATCH 02/10] tracing/ftrace: disable preemption in syscall probe
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Michael Jeanson, Peter Zijlstra, Alexei Starovoitov,
Yonghong Song, Paul E. McKenney, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim,
Andrii Nakryiko, bpf, Joel Fernandes
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
In preparation for allowing system call enter/exit instrumentation to
handle page faults, make sure that ftrace can handle this change by
explicitly disabling preemption within the ftrace system call tracepoint
probes: the ftrace ring buffer code still expects to run with preemption
disabled.
This change does not yet allow ftrace to take page faults per se within
its probe, but allows its existing probes to adapt to the upcoming
change.
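Concretely, for sys_enter the new DECLARE_EVENT_SYSCALL_CLASS expands
(roughly) to a thin wrapper that disables preemption around the existing
event body:
    static notrace void
    trace_event_raw_event_sys_enter(void *__data, struct pt_regs *regs, long id)
    {
            preempt_disable_notrace();
            do_trace_event_raw_event_sys_enter(__data, regs, id);
            preempt_enable_notrace();
    }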
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-3-mathieu.desnoyers@efficios.com
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/trace/trace_events.h | 39 ++++++++++++++++++++++++++++-------
kernel/trace/trace_syscalls.c | 12 +++++++++++
2 files changed, 44 insertions(+), 7 deletions(-)
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 8bcbb9ee44de..63071aa5923d 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -263,6 +263,9 @@ static struct trace_event_fields trace_event_fields_##call[] = { \
tstruct \
{} };
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+
#undef DEFINE_EVENT_PRINT
#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
@@ -396,11 +399,11 @@ static inline notrace int trace_event_get_offsets_##call( \
#include "stages/stage6_event_callback.h"
-#undef DECLARE_EVENT_CLASS
-#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
- \
+
+#undef __DECLARE_EVENT_CLASS
+#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
-trace_event_raw_event_##call(void *__data, proto) \
+do_trace_event_raw_event_##call(void *__data, proto) \
{ \
struct trace_event_file *trace_file = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
@@ -425,15 +428,35 @@ trace_event_raw_event_##call(void *__data, proto) \
\
trace_event_buffer_commit(&fbuffer); \
}
+
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
+__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
+ PARAMS(assign), PARAMS(print)) \
+static notrace void \
+trace_event_raw_event_##call(void *__data, proto) \
+{ \
+ do_trace_event_raw_event_##call(__data, args); \
+}
+
+#undef DECLARE_EVENT_SYSCALL_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \
+__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
+ PARAMS(assign), PARAMS(print)) \
+static notrace void \
+trace_event_raw_event_##call(void *__data, proto) \
+{ \
+ preempt_disable_notrace(); \
+ do_trace_event_raw_event_##call(__data, args); \
+ preempt_enable_notrace(); \
+}
+
/*
* The ftrace_test_probe is compiled out, it is only here as a build time check
* to make sure that if the tracepoint handling changes, the ftrace probe will
* fail to compile unless it too is updated.
*/
-#undef DECLARE_EVENT_SYSCALL_CLASS
-#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
-
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args) \
static inline void ftrace_test_probe_##call(void) \
@@ -443,6 +466,8 @@ static inline void ftrace_test_probe_##call(void) \
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+#undef __DECLARE_EVENT_CLASS
+
#include "stages/stage7_class_define.h"
#undef DECLARE_EVENT_CLASS
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 785733245ead..f9b21bac9d45 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -299,6 +299,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
int syscall_nr;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -338,6 +344,12 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct trace_event_buffer fbuffer;
int syscall_nr;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
--
2.45.2
* [for-next][PATCH 03/10] tracing/perf: disable preemption in syscall probe
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Michael Jeanson, Peter Zijlstra, Alexei Starovoitov,
Yonghong Song, Paul E. McKenney, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim,
Andrii Nakryiko, bpf, Joel Fernandes
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
In preparation for allowing system call enter/exit instrumentation to
handle page faults, make sure that perf can handle this change by
explicitly disabling preemption within the perf system call tracepoint
probes: the perf ring buffer code still expects to run with preemption
disabled.
This change does not yet allow perf to take page faults per se within
its probe, but allows its existing probes to adapt to the upcoming
change.
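The open-coded probes in kernel/trace/trace_syscalls.c instead use the
scope-based guard() helper from <linux/cleanup.h>, which re-enables
preemption automatically on every exit path from the function (sketch):
    static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
    {
            int syscall_nr;
            guard(preempt_notrace)();       /* preempt_disable_notrace() */
            syscall_nr = trace_get_syscall_nr(current, regs);
            if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
                    return;                 /* guard re-enables preemption */
            /* ... */
    }                                       /* and here, on normal return */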
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-4-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/trace/perf.h | 42 +++++++++++++++++++++++++++++++----
kernel/trace/trace_syscalls.c | 12 ++++++++++
2 files changed, 50 insertions(+), 4 deletions(-)
diff --git a/include/trace/perf.h b/include/trace/perf.h
index ded997af481e..15cde7eac8b4 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -12,10 +12,10 @@
#undef __perf_task
#define __perf_task(t) (__task = (t))
-#undef DECLARE_EVENT_CLASS
-#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
+#undef __DECLARE_EVENT_CLASS
+#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
-perf_trace_##call(void *__data, proto) \
+do_perf_trace_##call(void *__data, proto) \
{ \
struct trace_event_call *event_call = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
@@ -55,8 +55,39 @@ perf_trace_##call(void *__data, proto) \
head, __task); \
}
+/*
+ * Define unused __count and __task variables to use @args to pass
+ * arguments to do_perf_trace_##call. This is needed because the
+ * macros __perf_count and __perf_task introduce the side-effect to
+ * store copies into those local variables.
+ */
+#undef DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
+__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
+ PARAMS(assign), PARAMS(print)) \
+static notrace void \
+perf_trace_##call(void *__data, proto) \
+{ \
+ u64 __count __attribute__((unused)); \
+ struct task_struct *__task __attribute__((unused)); \
+ \
+ do_perf_trace_##call(__data, args); \
+}
+
#undef DECLARE_EVENT_SYSCALL_CLASS
-#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \
+__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
+ PARAMS(assign), PARAMS(print)) \
+static notrace void \
+perf_trace_##call(void *__data, proto) \
+{ \
+ u64 __count __attribute__((unused)); \
+ struct task_struct *__task __attribute__((unused)); \
+ \
+ preempt_disable_notrace(); \
+ do_perf_trace_##call(__data, args); \
+ preempt_enable_notrace(); \
+}
/*
* This part is compiled out, it is only here as a build time check
@@ -76,4 +107,7 @@ static inline void perf_test_probe_##call(void) \
DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef __DECLARE_EVENT_CLASS
+
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f9b21bac9d45..b1cc19806f3d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -596,6 +596,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int rctx;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
@@ -698,6 +704,12 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
int rctx;
int size;
+ /*
+ * Syscall probe called with preemption enabled, but the ring
+ * buffer and per-cpu data require preemption to be disabled.
+ */
+ guard(preempt_notrace)();
+
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
--
2.45.2
* [for-next][PATCH 04/10] tracing/bpf: disable preemption in syscall probe
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Michael Jeanson, Peter Zijlstra, Alexei Starovoitov,
Yonghong Song, Paul E. McKenney, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim,
Andrii Nakryiko, bpf, Joel Fernandes, Andrii Nakryiko
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
In preparation for allowing system call enter/exit instrumentation to
handle page faults, make sure that bpf can handle this change by
explicitly disabling preemption within the bpf system call tracepoint
probes: the bpf tracing code still expects to run with preemption
disabled.
This change does not yet allow bpf to take page faults per se within its
probe, but allows its existing probes to adapt to the upcoming change.
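For sys_enter, the new __BPF_DECLARE_TRACE_SYSCALL expands (roughly) to the
following, with CAST_TO_U64() simplified here to plain casts:
    static notrace void
    __bpf_trace_sys_enter(void *__data, struct pt_regs *regs, long id)
    {
            preempt_disable_notrace();
            /* COUNT_ARGS(regs, id) == 2, so this selects bpf_trace_run2() */
            bpf_trace_run2(__data, (u64)(unsigned long)regs, (u64)id);
            preempt_enable_notrace();
    }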
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-5-mathieu.desnoyers@efficios.com
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Andrii Nakryiko <andrii@kernel.org> # BPF parts
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/trace/bpf_probe.h | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index c85bbce5aaa5..fec97c93e1c9 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -53,8 +53,18 @@ __bpf_trace_##call(void *__data, proto) \
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
__BPF_DECLARE_TRACE(call, PARAMS(proto), PARAMS(args))
+#define __BPF_DECLARE_TRACE_SYSCALL(call, proto, args) \
+static notrace void \
+__bpf_trace_##call(void *__data, proto) \
+{ \
+ preempt_disable_notrace(); \
+ CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \
+ preempt_enable_notrace(); \
+}
+
#undef DECLARE_EVENT_SYSCALL_CLASS
-#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
+#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \
+ __BPF_DECLARE_TRACE_SYSCALL(call, PARAMS(proto), PARAMS(args))
/*
* This part is compiled out, it is only here as a build time check
--
2.45.2
* [for-next][PATCH 05/10] tracing: Allow system call tracepoints to handle page faults
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Michael Jeanson, Peter Zijlstra, Alexei Starovoitov,
Yonghong Song, Paul E. McKenney, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim,
Andrii Nakryiko, bpf, Joel Fernandes
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Use Tasks Trace RCU to protect the iteration of system call enter/exit
tracepoint probes, allowing those probes to handle page faults.
In preparation for this change, all tracers registering to system call
enter/exit tracepoints should expect those to be called with preemption
enabled.
This allows tracers to fault-in userspace system call arguments such as
path strings within their probe callbacks.
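As a hypothetical illustration (not part of this patch), a syscall probe
may eventually fault in a user-space path argument directly, e.g. on
x86-64 where the first syscall argument lives in regs->di:
    static void my_sys_enter_probe(void *data, struct pt_regs *regs, long id)
    {
            char path[256];
            /* May sleep to service a page fault; only legal because the
             * callback iteration is protected by Tasks Trace RCU rather
             * than by disabling preemption. */
            if (strncpy_from_user(path, (const char __user *)regs->di,
                                  sizeof(path)) < 0)
                    return;
            /* ... record path ... */
    }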
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-6-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/linux/tracepoint.h | 18 ++++++++++++++++--
init/Kconfig | 1 +
2 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 76e441b39a96..0dc67fad706c 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -17,6 +17,7 @@
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
#include <linux/tracepoint-defs.h>
#include <linux/static_call.h>
@@ -107,6 +108,7 @@ void for_each_tracepoint_in_module(struct module *mod,
#ifdef CONFIG_TRACEPOINTS
static inline void tracepoint_synchronize_unregister(void)
{
+ synchronize_rcu_tasks_trace();
synchronize_rcu();
}
#else
@@ -196,6 +198,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
/*
* it_func[0] is never NULL because there is at least one element in the array
* when the array itself is non NULL.
+ *
+ * With @syscall=0, the tracepoint callback array dereference is
+ * protected by disabling preemption.
+ * With @syscall=1, the tracepoint callback array dereference is
+ * protected by Tasks Trace RCU, which allows probes to handle page
+ * faults.
*/
#define __DO_TRACE(name, args, cond, syscall) \
do { \
@@ -204,11 +212,17 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
if (!(cond)) \
return; \
\
- preempt_disable_notrace(); \
+ if (syscall) \
+ rcu_read_lock_trace(); \
+ else \
+ preempt_disable_notrace(); \
\
__DO_TRACE_CALL(name, TP_ARGS(args)); \
\
- preempt_enable_notrace(); \
+ if (syscall) \
+ rcu_read_unlock_trace(); \
+ else \
+ preempt_enable_notrace(); \
} while (0)
/*
diff --git a/init/Kconfig b/init/Kconfig
index 530a382ee0fe..4ac3d1b48278 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1985,6 +1985,7 @@ config BINDGEN_VERSION_TEXT
#
config TRACEPOINTS
bool
+ select TASKS_TRACE_RCU
source "kernel/Kconfig.kexec"
--
2.45.2
* [for-next][PATCH 06/10] tracing/ftrace: Add might_fault check to syscall probes
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Michael Jeanson, Peter Zijlstra, Alexei Starovoitov,
Yonghong Song, Paul E. McKenney, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim,
Andrii Nakryiko, bpf, Joel Fernandes
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Add a might_fault() check to validate that the ftrace sys_enter/sys_exit
probe callbacks are indeed called from a context where page faults can
be handled.
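might_fault() compiles away in production builds; with debugging enabled
(e.g. CONFIG_DEBUG_ATOMIC_SLEEP or CONFIG_PROVE_LOCKING) it splats if the
caller could not service a page fault. It must precede the preemption
guard, since once preemption is disabled might_fault() itself would
(correctly) complain:
    might_fault();                  /* assert: this context can take a fault */
    guard(preempt_notrace)();       /* only then enter the atomic section */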
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-7-mathieu.desnoyers@efficios.com
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/trace/trace_events.h | 1 +
kernel/trace/trace_syscalls.c | 2 ++
2 files changed, 3 insertions(+)
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 63071aa5923d..4f22136fd465 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -446,6 +446,7 @@ __DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
static notrace void \
trace_event_raw_event_##call(void *__data, proto) \
{ \
+ might_fault(); \
preempt_disable_notrace(); \
do_trace_event_raw_event_##call(__data, args); \
preempt_enable_notrace(); \
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index b1cc19806f3d..6d6bbd56ed92 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -303,6 +303,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
+ might_fault();
guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -348,6 +349,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
+ might_fault();
guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
--
2.45.2
* [for-next][PATCH 07/10] tracing/perf: Add might_fault check to syscall probes
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Michael Jeanson, Peter Zijlstra, Alexei Starovoitov,
Yonghong Song, Paul E. McKenney, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim,
Andrii Nakryiko, bpf, Joel Fernandes
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Add a might_fault() check to validate that the perf sys_enter/sys_exit
probe callbacks are indeed called from a context where page faults can
be handled.
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-8-mathieu.desnoyers@efficios.com
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/trace/perf.h | 1 +
kernel/trace/trace_syscalls.c | 2 ++
2 files changed, 3 insertions(+)
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 15cde7eac8b4..a1754b73a8f5 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -84,6 +84,7 @@ perf_trace_##call(void *__data, proto) \
u64 __count __attribute__((unused)); \
struct task_struct *__task __attribute__((unused)); \
\
+ might_fault(); \
preempt_disable_notrace(); \
do_perf_trace_##call(__data, args); \
preempt_enable_notrace(); \
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 6d6bbd56ed92..46aab0ab9350 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -602,6 +602,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
+ might_fault();
guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -710,6 +711,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
* Syscall probe called with preemption enabled, but the ring
* buffer and per-cpu data require preemption to be disabled.
*/
+ might_fault();
guard(preempt_notrace)();
syscall_nr = trace_get_syscall_nr(current, regs);
--
2.45.2
* [for-next][PATCH 08/10] tracing/bpf: Add might_fault check to syscall probes
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Michael Jeanson, Peter Zijlstra, Alexei Starovoitov,
Yonghong Song, Paul E. McKenney, Ingo Molnar,
Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim,
Andrii Nakryiko, bpf, Joel Fernandes, Andrii Nakryiko
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Add a might_fault() check to validate that the bpf sys_enter/sys_exit
probe callbacks are indeed called from a context where page faults can
be handled.
Cc: Michael Jeanson <mjeanson@efficios.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Yonghong Song <yhs@fb.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Andrii Nakryiko <andrii.nakryiko@gmail.com>
Cc: bpf@vger.kernel.org
Cc: Joel Fernandes <joel@joelfernandes.org>
Link: https://lore.kernel.org/20241009010718.2050182-9-mathieu.desnoyers@efficios.com
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Tested-by: Andrii Nakryiko <andrii@kernel.org> # BPF parts
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
include/trace/bpf_probe.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index fec97c93e1c9..183fa2aa2935 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -57,6 +57,7 @@ __bpf_trace_##call(void *__data, proto) \
static notrace void \
__bpf_trace_##call(void *__data, proto) \
{ \
+ might_fault(); \
preempt_disable_notrace(); \
CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args)); \
preempt_enable_notrace(); \
--
2.45.2
* [for-next][PATCH 09/10] trace/trace_event_perf: remove duplicate samples on the first tracepoint event
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Levi Yun, Namhyung Kim
From: Levi Yun <yeoreum.yun@arm.com>
When a tracepoint event is created with attr.freq = 1,
'hwc->period_left' is not initialized correctly. As a result, the first
time the event occurs, perf_swevent_overflow() computes the overflow and
perf_swevent_set_period() returns 3, so the event is recorded as three
duplicate samples.
Steps to reproduce:
1. Enable the tracepoint event & starting tracing
$ echo 1 > /sys/kernel/tracing/events/module/module_free
$ echo 1 > /sys/kernel/tracing/tracing_on
2. Record with perf
$ perf record -a --strict-freq -F 1 -e "module:module_free"
3. Trigger module_free event.
$ modprobe -i sunrpc
$ modprobe -r sunrpc
Result:
- Trace pipe result:
$ cat trace_pipe
modprobe-174509 [003] ..... 6504.868896: module_free: sunrpc
- perf sample:
modprobe 174509 [003] 6504.868980: module:module_free: sunrpc
modprobe 174509 [003] 6504.868980: module:module_free: sunrpc
modprobe 174509 [003] 6504.868980: module:module_free: sunrpc
Setting period_left via perf_swevent_set_period(), as other software
events do, solves this problem.
After patch:
- Trace pipe result:
$ cat trace_pipe
modprobe 1153096 [068] 613468.867774: module:module_free: xfs
- perf sample
modprobe 1153096 [068] 613468.867794: module:module_free: xfs
Link: https://lore.kernel.org/20240913021347.595330-1-yeoreum.yun@arm.com
Fixes: bd2b5b12849a ("perf_counter: More aggressive frequency adjustment")
Signed-off-by: Levi Yun <yeoreum.yun@arm.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/trace_event_perf.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 05e791241812..3ff9caa4a71b 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -352,10 +352,16 @@ void perf_uprobe_destroy(struct perf_event *p_event)
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
+ struct hw_perf_event *hwc = &p_event->hw;
if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;
+ if (is_sampling_event(p_event)) {
+ hwc->last_period = hwc->sample_period;
+ perf_swevent_set_period(p_event);
+ }
+
/*
* If TRACE_REG_PERF_ADD returns false; no custom action was performed
* and we need to take the default action of enqueueing our event on
--
2.45.2
* [for-next][PATCH 10/10] tracing: Use atomic64_inc_return() in trace_clock_counter()
From: Steven Rostedt @ 2024-10-10 14:25 UTC
To: linux-kernel
Cc: Masami Hiramatsu, Mark Rutland, Mathieu Desnoyers, Andrew Morton,
Uros Bizjak
From: Uros Bizjak <ubizjak@gmail.com>
Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref)
to use the optimized implementation and ease register pressure around
the primitive on targets that implement an optimized variant.
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://lore.kernel.org/20241007085651.48544-1-ubizjak@gmail.com
Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
kernel/trace/trace_clock.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 4702efb00ff2..4cb2ebc439be 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -154,5 +154,5 @@ static atomic64_t trace_counter;
*/
u64 notrace trace_clock_counter(void)
{
- return atomic64_add_return(1, &trace_counter);
+ return atomic64_inc_return(&trace_counter);
}
--
2.45.2