[PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs

public inbox for bpf@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs
  2026-03-16 21:46 [PATCH bpf-next v5 0/5] bpf: Add support for sleepable " Mykyta Yatsenko
@ 2026-03-16 21:46 ` Mykyta Yatsenko
  2026-03-16 22:22   ` bot+bpf-ci
  2026-03-23 20:38   ` Kumar Kartikeya Dwivedi
  0 siblings, 2 replies; 7+ messages in thread
From: Mykyta Yatsenko @ 2026-03-16 21:46 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel, kafai, kernel-team, eddyz87; +Cc: Mykyta Yatsenko

From: Mykyta Yatsenko <yatsenko@meta.com>

Add trace_call_bpf_faultable(), a variant of trace_call_bpf() for
faultable tracepoints that supports sleepable BPF programs. It uses
rcu_tasks_trace for lifetime protection and bpf_prog_run_array_uprobe()
for per-program RCU flavor selection, following the uprobe_prog_run()
pattern. Uses preempt-safe this_cpu_inc_return/this_cpu_dec for the
bpf_prog_active recursion counter since preemption is enabled in this
context.

Restructure perf_syscall_enter() and perf_syscall_exit() to run BPF
filter before perf event processing. Previously, BPF ran after the
per-cpu perf trace buffer was allocated under preempt_disable,
requiring cleanup via perf_swevent_put_recursion_context() whenever the
BPF filter rejected the event.
Now BPF runs in faultable context before preempt_disable, reading
syscall arguments from local variables instead of the per-cpu trace
record, removing the dependency on buffer allocation. This allows
sleepable BPF programs to execute and avoids unnecessary buffer
allocation when BPF filters the event. The perf event submission
path (buffer allocation, fill, submit) remains under preempt_disable
as before.

Add an attach-time check in __perf_event_set_bpf_prog() to reject
sleepable BPF_PROG_TYPE_TRACEPOINT programs on non-syscall
tracepoints, since only syscall tracepoints run in faultable context.

This prepares the classic tracepoint runtime and attach paths for
sleepable programs. The verifier changes to allow loading sleepable
BPF_PROG_TYPE_TRACEPOINT programs are in a subsequent patch.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
---
 include/linux/trace_events.h  |   6 +++
 kernel/events/core.c          |   9 ++++
 kernel/trace/bpf_trace.c      |  39 ++++++++++++++++
 kernel/trace/trace_syscalls.c | 104 +++++++++++++++++++++++-------------------
 4 files changed, 110 insertions(+), 48 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 37eb2f0f3dd8..5fbbeb9ec4b9 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -767,6 +767,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 
 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
@@ -789,6 +790,11 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c
 	return 1;
 }
 
+static inline unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+	return 1;
+}
+
 static inline int
 perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie)
 {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1f5699b339ec..46b733d3dd41 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11647,6 +11647,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event,
 		/* only uprobe programs are allowed to be sleepable */
 		return -EINVAL;
 
+	if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) {
+		/*
+		 * Sleepable tracepoint programs can only attach to faultable
+		 * tracepoints. Currently only syscall tracepoints are faultable.
+		 */
+		if (!is_syscall_tp)
+			return -EINVAL;
+	}
+
 	/* Kprobe override only works for kprobes, not uprobes. */
 	if (prog->kprobe_override && !is_kprobe)
 		return -EINVAL;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 35ed53807cfd..69c9a5539e65 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -152,6 +152,45 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 	return ret;
 }
 
+/**
+ * trace_call_bpf_faultable - invoke BPF program in faultable context
+ * @call: tracepoint event
+ * @ctx: opaque context pointer
+ *
+ * Variant of trace_call_bpf() for faultable tracepoints (e.g. syscall
+ * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace
+ * for lifetime protection and per-program rcu_read_lock for non-sleepable
+ * programs, following the uprobe_prog_run() pattern.
+ *
+ * Must be called from a faultable/preemptible context.
+ */
+unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
+{
+	struct bpf_prog_array *prog_array;
+	unsigned int ret;
+
+	might_fault();
+
+	guard(rcu_tasks_trace)();
+	guard(migrate)();
+
+	if (unlikely(this_cpu_inc_return(bpf_prog_active) != 1)) {
+		scoped_guard(rcu) {
+			bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
+		}
+		this_cpu_dec(bpf_prog_active);
+		return 0;
+	}
+
+	prog_array = rcu_dereference_check(call->prog_array,
+					   rcu_read_lock_trace_held());
+	ret = bpf_prog_run_array_uprobe(prog_array, ctx, bpf_prog_run);
+
+	this_cpu_dec(bpf_prog_active);
+
+	return ret;
+}
+
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
 {
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 37317b81fcda..8a7677c15f93 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1372,26 +1372,27 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
-static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+static int perf_call_bpf_enter(struct trace_event_call *call,
 			       struct syscall_metadata *sys_data,
-			       struct syscall_trace_enter *rec)
+			       int syscall_nr, unsigned long *args)
 {
 	struct syscall_tp_t {
 		struct trace_entry ent;
 		int syscall_nr;
 		unsigned long args[SYSCALL_DEFINE_MAXARGS];
 	} __aligned(8) param;
+	struct pt_regs regs = {};
 	int i;
 
 	BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
 
-	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
-	perf_fetch_caller_regs(regs);
-	*(struct pt_regs **)&param = regs;
-	param.syscall_nr = rec->nr;
+	/* bpf prog requires 'regs' to be the first member in the ctx */
+	perf_fetch_caller_regs(&regs);
+	*(struct pt_regs **)&param = &regs;
+	param.syscall_nr = syscall_nr;
 	for (i = 0; i < sys_data->nb_args; i++)
-		param.args[i] = rec->args[i];
-	return trace_call_bpf(call, &param);
+		param.args[i] = args[i];
+	return trace_call_bpf_faultable(call, &param);
 }
 
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
@@ -1411,12 +1412,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	int size = 0;
 	int uargs = 0;
 
-	/*
-	 * Syscall probe called with preemption enabled, but the ring
-	 * buffer and per-cpu data require preemption to be disabled.
-	 */
 	might_fault();
-	guard(preempt_notrace)();
 
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1430,6 +1426,26 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 
 	syscall_get_arguments(current, regs, args);
 
+	/*
+	 * Run BPF filter in faultable context before per-cpu buffer
+	 * allocation, allowing sleepable BPF programs to execute.
+	 */
+	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+	if (valid_prog_array &&
+	    !perf_call_bpf_enter(sys_data->enter_event, sys_data,
+				 syscall_nr, args))
+		return;
+
+	/*
+	 * Per-cpu ring buffer and perf event list operations require
+	 * preemption to be disabled.
+	 */
+	guard(preempt_notrace)();
+
+	head = this_cpu_ptr(sys_data->enter_event->perf_events);
+	if (hlist_empty(head))
+		return;
+
 	/* Check if this syscall event faults in user space memory */
 	mayfault = sys_data->user_mask != 0;
 
@@ -1439,11 +1455,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 			return;
 	}
 
-	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
-	if (!valid_prog_array && hlist_empty(head))
-		return;
-
 	/* get the size after alignment with the u32 buffer size field */
 	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
@@ -1459,13 +1470,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (mayfault)
 		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);
 
-	if ((valid_prog_array &&
-	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
-	    hlist_empty(head)) {
-		perf_swevent_put_recursion_context(rctx);
-		return;
-	}
-
 	perf_trace_buf_submit(rec, size, rctx,
 			      sys_data->enter_event->event.type, 1, regs,
 			      head, NULL);
@@ -1515,21 +1519,22 @@ static void perf_sysenter_disable(struct trace_event_call *call)
 		syscall_fault_buffer_disable();
 }
 
-static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
-			      struct syscall_trace_exit *rec)
+static int perf_call_bpf_exit(struct trace_event_call *call,
+			      int syscall_nr, long ret_val)
 {
 	struct syscall_tp_t {
 		struct trace_entry ent;
 		int syscall_nr;
 		unsigned long ret;
 	} __aligned(8) param;
-
-	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
-	perf_fetch_caller_regs(regs);
-	*(struct pt_regs **)&param = regs;
-	param.syscall_nr = rec->nr;
-	param.ret = rec->ret;
-	return trace_call_bpf(call, &param);
+	struct pt_regs regs = {};
+
+	/* bpf prog requires 'regs' to be the first member in the ctx */
+	perf_fetch_caller_regs(&regs);
+	*(struct pt_regs **)&param = &regs;
+	param.syscall_nr = syscall_nr;
+	param.ret = ret_val;
+	return trace_call_bpf_faultable(call, &param);
 }
 
 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
@@ -1543,12 +1548,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	int rctx;
 	int size;
 
-	/*
-	 * Syscall probe called with preemption enabled, but the ring
-	 * buffer and per-cpu data require preemption to be disabled.
-	 */
 	might_fault();
-	guard(preempt_notrace)();
 
 	syscall_nr = trace_get_syscall_nr(current, regs);
 	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
@@ -1560,9 +1560,24 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	head = this_cpu_ptr(sys_data->exit_event->perf_events);
+	/*
+	 * Run BPF filter in faultable context before per-cpu buffer
+	 * allocation, allowing sleepable BPF programs to execute.
+	 */
 	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
-	if (!valid_prog_array && hlist_empty(head))
+	if (valid_prog_array &&
+	    !perf_call_bpf_exit(sys_data->exit_event, syscall_nr,
+				syscall_get_return_value(current, regs)))
+		return;
+
+	/*
+	 * Per-cpu ring buffer and perf event list operations require
+	 * preemption to be disabled.
+	 */
+	guard(preempt_notrace)();
+
+	head = this_cpu_ptr(sys_data->exit_event->perf_events);
+	if (hlist_empty(head))
 		return;
 
 	/* We can probably do that at build time */
@@ -1576,13 +1591,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	if ((valid_prog_array &&
-	     !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
-	    hlist_empty(head)) {
-		perf_swevent_put_recursion_context(rctx);
-		return;
-	}
-
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
 			      1, regs, head, NULL);
 }

-- 
2.52.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs
  2026-03-16 21:46 ` [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic " Mykyta Yatsenko
@ 2026-03-16 22:22   ` bot+bpf-ci
  2026-03-23 20:38   ` Kumar Kartikeya Dwivedi
  1 sibling, 0 replies; 7+ messages in thread
From: bot+bpf-ci @ 2026-03-16 22:22 UTC (permalink / raw)
  To: mykyta.yatsenko5, bpf, ast, andrii, daniel, kafai, kernel-team,
	eddyz87
  Cc: yatsenko, ast, andrii, daniel, martin.lau, eddyz87, yonghong.song,
	clm, ihor.solodrai

[-- Attachment #1: Type: text/plain, Size: 1299 bytes --]

> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index 37317b81fcda..8a7677c15f93 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c

[ ... ]

> @@ -1411,12 +1412,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  	int size = 0;
>  	int uargs = 0;

perf_syscall_enter() still declares fake_regs:

    struct pt_regs *fake_regs;

and passes it to perf_trace_buf_alloc():

    rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);

but the value is never read after that. Previously it was passed
to perf_call_bpf_enter() for perf_fetch_caller_regs(), but
perf_call_bpf_enter() now creates its own stack-local pt_regs.

The same applies to fake_regs in perf_syscall_exit().

Could these be cleaned up, perhaps by passing NULL to
perf_trace_buf_alloc() since it already handles that case?

    void *perf_trace_buf_alloc(int size, struct pt_regs **regs, ...)
    {
        ...
        if (regs)
            *regs = this_cpu_ptr(&__perf_regs[rctx]);
        ...
    }


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/23167661157

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs
  2026-03-16 21:46 ` [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic " Mykyta Yatsenko
  2026-03-16 22:22   ` bot+bpf-ci
@ 2026-03-23 20:38   ` Kumar Kartikeya Dwivedi
  2026-03-23 20:57     ` Mykyta Yatsenko
  1 sibling, 1 reply; 7+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2026-03-23 20:38 UTC (permalink / raw)
  To: Mykyta Yatsenko
  Cc: bpf, ast, andrii, daniel, kafai, kernel-team, eddyz87,
	Mykyta Yatsenko

On Mon, 16 Mar 2026 at 22:47, Mykyta Yatsenko
<mykyta.yatsenko5@gmail.com> wrote:
>
> From: Mykyta Yatsenko <yatsenko@meta.com>
>
> Add trace_call_bpf_faultable(), a variant of trace_call_bpf() for
> faultable tracepoints that supports sleepable BPF programs. It uses
> rcu_tasks_trace for lifetime protection and bpf_prog_run_array_uprobe()
> for per-program RCU flavor selection, following the uprobe_prog_run()
> pattern. Uses preempt-safe this_cpu_inc_return/this_cpu_dec for the
> bpf_prog_active recursion counter since preemption is enabled in this
> context.
>
> Restructure perf_syscall_enter() and perf_syscall_exit() to run BPF
> filter before perf event processing. Previously, BPF ran after the
> per-cpu perf trace buffer was allocated under preempt_disable,
> requiring cleanup via perf_swevent_put_recursion_context() on filter.
> Now BPF runs in faultable context before preempt_disable, reading
> syscall arguments from local variables instead of the per-cpu trace
> record, removing the dependency on buffer allocation. This allows
> sleepable BPF programs to execute and avoids unnecessary buffer
> allocation when BPF filters the event. The perf event submission
> path (buffer allocation, fill, submit) remains under preempt_disable
> as before.
>
> Add an attach-time check in __perf_event_set_bpf_prog() to reject
> sleepable BPF_PROG_TYPE_TRACEPOINT programs on non-syscall
> tracepoints, since only syscall tracepoints run in faultable context.
>
> This prepares the classic tracepoint runtime and attach paths for
> sleepable programs. The verifier changes to allow loading sleepable
> BPF_PROG_TYPE_TRACEPOINT programs are in a subsequent patch.
>
> Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
> ---
>  include/linux/trace_events.h  |   6 +++
>  kernel/events/core.c          |   9 ++++
>  kernel/trace/bpf_trace.c      |  39 ++++++++++++++++
>  kernel/trace/trace_syscalls.c | 104 +++++++++++++++++++++++-------------------
>  4 files changed, 110 insertions(+), 48 deletions(-)
>
> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
> index 37eb2f0f3dd8..5fbbeb9ec4b9 100644
> --- a/include/linux/trace_events.h
> +++ b/include/linux/trace_events.h
> @@ -767,6 +767,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
>
>  #ifdef CONFIG_BPF_EVENTS
>  unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
> +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx);
>  int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
>  void perf_event_detach_bpf_prog(struct perf_event *event);
>  int perf_event_query_prog_array(struct perf_event *event, void __user *info);
> @@ -789,6 +790,11 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c
>         return 1;
>  }
>
> +static inline unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
> +{
> +       return 1;
> +}
> +
>  static inline int
>  perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie)
>  {
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 1f5699b339ec..46b733d3dd41 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -11647,6 +11647,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event,
>                 /* only uprobe programs are allowed to be sleepable */
>                 return -EINVAL;
>
> +       if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) {
> +               /*
> +                * Sleepable tracepoint programs can only attach to faultable
> +                * tracepoints. Currently only syscall tracepoints are faultable.
> +                */
> +               if (!is_syscall_tp)
> +                       return -EINVAL;
> +       }
> +
>         /* Kprobe override only works for kprobes, not uprobes. */
>         if (prog->kprobe_override && !is_kprobe)
>                 return -EINVAL;
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 35ed53807cfd..69c9a5539e65 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -152,6 +152,45 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>         return ret;
>  }
>
> +/**
> + * trace_call_bpf_faultable - invoke BPF program in faultable context
> + * @call: tracepoint event
> + * @ctx: opaque context pointer
> + *
> + * Variant of trace_call_bpf() for faultable tracepoints (e.g. syscall
> + * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace
> + * for lifetime protection and per-program rcu_read_lock for non-sleepable
> + * programs, following the uprobe_prog_run() pattern.
> + *
> + * Must be called from a faultable/preemptible context.
> + */
> +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
> +{
> +       struct bpf_prog_array *prog_array;
> +       unsigned int ret;
> +
> +       might_fault();
> +
> +       guard(rcu_tasks_trace)();
> +       guard(migrate)();
> +
> +       if (unlikely(this_cpu_inc_return(bpf_prog_active) != 1)) {

This seems a bit heavy-handed; why not use
bpf_prog_get_recursion_context()? Especially since we can potentially
sleep and other tasks can preempt us on the same CPU.

> +               scoped_guard(rcu) {
> +                       bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
> +               }
> +               this_cpu_dec(bpf_prog_active);
> +               return 0;
> +       }
> +
> +       prog_array = rcu_dereference_check(call->prog_array,
> +                                          rcu_read_lock_trace_held());
> +       ret = bpf_prog_run_array_uprobe(prog_array, ctx, bpf_prog_run);

This confused me a bit, do we need to generalize it a bit? E.g. it
would now set run_ctx.is_uprobe = 1 for this case too.
Actually, except that bit, the rest looks reusable, so maybe it should
be renamed to __ version and expose two wrappers, one that passes
is_uprobe = false and one passing true?
Then do bpf_prog_run_array_trace() here?

> +
> [...]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs
  2026-03-23 20:38   ` Kumar Kartikeya Dwivedi
@ 2026-03-23 20:57     ` Mykyta Yatsenko
  0 siblings, 0 replies; 7+ messages in thread
From: Mykyta Yatsenko @ 2026-03-23 20:57 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: bpf, ast, andrii, daniel, kafai, kernel-team, eddyz87,
	Mykyta Yatsenko

Kumar Kartikeya Dwivedi <memxor@gmail.com> writes:

> On Mon, 16 Mar 2026 at 22:47, Mykyta Yatsenko
> <mykyta.yatsenko5@gmail.com> wrote:
>>
>> From: Mykyta Yatsenko <yatsenko@meta.com>
>>
>> Add trace_call_bpf_faultable(), a variant of trace_call_bpf() for
>> faultable tracepoints that supports sleepable BPF programs. It uses
>> rcu_tasks_trace for lifetime protection and bpf_prog_run_array_uprobe()
>> for per-program RCU flavor selection, following the uprobe_prog_run()
>> pattern. Uses preempt-safe this_cpu_inc_return/this_cpu_dec for the
>> bpf_prog_active recursion counter since preemption is enabled in this
>> context.
>>
>> Restructure perf_syscall_enter() and perf_syscall_exit() to run BPF
>> filter before perf event processing. Previously, BPF ran after the
>> per-cpu perf trace buffer was allocated under preempt_disable,
>> requiring cleanup via perf_swevent_put_recursion_context() on filter.
>> Now BPF runs in faultable context before preempt_disable, reading
>> syscall arguments from local variables instead of the per-cpu trace
>> record, removing the dependency on buffer allocation. This allows
>> sleepable BPF programs to execute and avoids unnecessary buffer
>> allocation when BPF filters the event. The perf event submission
>> path (buffer allocation, fill, submit) remains under preempt_disable
>> as before.
>>
>> Add an attach-time check in __perf_event_set_bpf_prog() to reject
>> sleepable BPF_PROG_TYPE_TRACEPOINT programs on non-syscall
>> tracepoints, since only syscall tracepoints run in faultable context.
>>
>> This prepares the classic tracepoint runtime and attach paths for
>> sleepable programs. The verifier changes to allow loading sleepable
>> BPF_PROG_TYPE_TRACEPOINT programs are in a subsequent patch.
>>
>> Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
>> ---
>>  include/linux/trace_events.h  |   6 +++
>>  kernel/events/core.c          |   9 ++++
>>  kernel/trace/bpf_trace.c      |  39 ++++++++++++++++
>>  kernel/trace/trace_syscalls.c | 104 +++++++++++++++++++++++-------------------
>>  4 files changed, 110 insertions(+), 48 deletions(-)
>>
>> diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
>> index 37eb2f0f3dd8..5fbbeb9ec4b9 100644
>> --- a/include/linux/trace_events.h
>> +++ b/include/linux/trace_events.h
>> @@ -767,6 +767,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
>>
>>  #ifdef CONFIG_BPF_EVENTS
>>  unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
>> +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx);
>>  int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
>>  void perf_event_detach_bpf_prog(struct perf_event *event);
>>  int perf_event_query_prog_array(struct perf_event *event, void __user *info);
>> @@ -789,6 +790,11 @@ static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *c
>>         return 1;
>>  }
>>
>> +static inline unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
>> +{
>> +       return 1;
>> +}
>> +
>>  static inline int
>>  perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie)
>>  {
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 1f5699b339ec..46b733d3dd41 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -11647,6 +11647,15 @@ static int __perf_event_set_bpf_prog(struct perf_event *event,
>>                 /* only uprobe programs are allowed to be sleepable */
>>                 return -EINVAL;
>>
>> +       if (prog->type == BPF_PROG_TYPE_TRACEPOINT && prog->sleepable) {
>> +               /*
>> +                * Sleepable tracepoint programs can only attach to faultable
>> +                * tracepoints. Currently only syscall tracepoints are faultable.
>> +                */
>> +               if (!is_syscall_tp)
>> +                       return -EINVAL;
>> +       }
>> +
>>         /* Kprobe override only works for kprobes, not uprobes. */
>>         if (prog->kprobe_override && !is_kprobe)
>>                 return -EINVAL;
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index 35ed53807cfd..69c9a5539e65 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -152,6 +152,45 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
>>         return ret;
>>  }
>>
>> +/**
>> + * trace_call_bpf_faultable - invoke BPF program in faultable context
>> + * @call: tracepoint event
>> + * @ctx: opaque context pointer
>> + *
>> + * Variant of trace_call_bpf() for faultable tracepoints (e.g. syscall
>> + * tracepoints). Supports sleepable BPF programs by using rcu_tasks_trace
>> + * for lifetime protection and per-program rcu_read_lock for non-sleepable
>> + * programs, following the uprobe_prog_run() pattern.
>> + *
>> + * Must be called from a faultable/preemptible context.
>> + */
>> +unsigned int trace_call_bpf_faultable(struct trace_event_call *call, void *ctx)
>> +{
>> +       struct bpf_prog_array *prog_array;
>> +       unsigned int ret;
>> +
>> +       might_fault();
>> +
>> +       guard(rcu_tasks_trace)();
>> +       guard(migrate)();
>> +
>> +       if (unlikely(this_cpu_inc_return(bpf_prog_active) != 1)) {
>
> This seems a bit heavy handed, why not
> bpf_prog_get_recursion_context()? Esp. since we can potentially sleep
> and other tasks can preempt us on the same CPU.
Agreed, the same point was raised by the sashiko review. Thanks, though,
for the hint to look into bpf_prog_get_recursion_context(); I was not
aware of it.
>
>> +               scoped_guard(rcu) {
>> +                       bpf_prog_inc_misses_counters(rcu_dereference(call->prog_array));
>> +               }
>> +               this_cpu_dec(bpf_prog_active);
>> +               return 0;
>> +       }
>> +
>> +       prog_array = rcu_dereference_check(call->prog_array,
>> +                                          rcu_read_lock_trace_held());
>> +       ret = bpf_prog_run_array_uprobe(prog_array, ctx, bpf_prog_run);
>
> This confused me a bit, do we need to generalize it a bit? E.g. it
> would now set run_ctx.is_uprobe = 1 for this case too.
> Actually, except that bit, the rest looks reusable, so maybe it should
> be renamed to __ version and expose two wrappers, one that passes
> is_uprobe = false and one passing true?
> Then do bpf_prog_run_array_trace() here?
Thanks, I agree on your point.
>
>> +
>> [...]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs
@ 2026-03-23 21:17 oskar
  2026-03-23 21:26 ` oskar
  0 siblings, 1 reply; 7+ messages in thread
From: oskar @ 2026-03-23 21:17 UTC (permalink / raw)
  To: mykyta.yatsenko5, bpf, ast, andrii, daniel, kafai, kernel-team,
	eddyz87, memxor, yatsenko

On Mon, Mar 23, 2026, Mykyta Yatsenko wrote:
> Agree, the same point was raised by sashiko review.

Hi,

I have a follow-up question on the recursion handling.

Even with migrate_disable(), this still runs in a preemptible and
sleepable context. So if a task sleeps after incrementing
bpf_prog_active, another task could run on the same CPU and also
modify the per-CPU counter.

Would this interleaving affect recursion tracking correctness here,
or is this accounted for?

Thanks,
Oskar Gerlicz Kowalczuk

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs
  2026-03-23 21:17 [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs oskar
@ 2026-03-23 21:26 ` oskar
       [not found]   ` <e0443380-97d3-4bcc-b599-0883bb6c6a03@gmail.com>
  0 siblings, 1 reply; 7+ messages in thread
From: oskar @ 2026-03-23 21:26 UTC (permalink / raw)
  To: mykyta.yatsenko5
  Cc: bpf, ast, andrii, daniel, kafai, kernel-team, eddyz87, memxor,
	yatsenko

> +       if (unlikely(this_cpu_inc_return(bpf_prog_active) != 1)) {

Hi,

A question regarding the recursion handling.

Even with migrate_disable(), this still runs in a preemptible and
sleepable context. So if a task increments bpf_prog_active and then
sleeps, another task could run on the same CPU and also modify the
same per-CPU counter.

Would this interleaving affect recursion tracking correctness, or is
this accounted for?

Oskar Gerlicz Kowalczuk

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs
       [not found]   ` <e0443380-97d3-4bcc-b599-0883bb6c6a03@gmail.com>
@ 2026-03-24 14:32     ` Mykyta Yatsenko
  0 siblings, 0 replies; 7+ messages in thread
From: Mykyta Yatsenko @ 2026-03-24 14:32 UTC (permalink / raw)
  To: oskar
  Cc: bpf, ast, andrii, daniel, kafai, kernel-team, eddyz87, memxor,
	yatsenko

Mykyta Yatsenko <mykyta.yatsenko5@gmail.com> writes:

> On 3/23/26 9:26 PM, oskar@gerlicz.space wrote:
>>> +       if (unlikely(this_cpu_inc_return(bpf_prog_active) != 1)) {
>> 
>> Hi,
>> 
>> A question regarding the recursion handling.
>> 
>> Even with migrate_disable(), this still runs in a preemptible and
>> sleepable context. So if a task increments bpf_prog_active and then
>> sleeps, another task could run on the same CPU and also modify the
>> same per-CPU counter.
>> 
>> Would this interleaving affect recursion tracking correctness, or is
>> this accounted for?
>> 
>> Oskar Gerlicz Kowalczuk
>> 
> We discussed this with Kumar in the sibling thread, the initial thinking 
> behind this was that second task incrementing this counter won't see 1, 
> so it's going to
>
> this_cpu_dec(bpf_prog_active);
> return 0;
>
> I'm reworking this for the next version.
The initial thinking behind this was that a second task incrementing this
counter won't see 1, so it's going to take the bail-out path:

this_cpu_dec(bpf_prog_active);
return 0;

I'm reworking this for the next version, as discussed in the sibling thread with Kumar.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2026-03-24 14:32 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-23 21:17 [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic tracepoint programs oskar
2026-03-23 21:26 ` oskar
     [not found]   ` <e0443380-97d3-4bcc-b599-0883bb6c6a03@gmail.com>
2026-03-24 14:32     ` Mykyta Yatsenko
  -- strict thread matches above, loose matches on Subject: below --
2026-03-16 21:46 [PATCH bpf-next v5 0/5] bpf: Add support for sleepable " Mykyta Yatsenko
2026-03-16 21:46 ` [PATCH bpf-next v5 2/5] bpf: Add sleepable support for classic " Mykyta Yatsenko
2026-03-16 22:22   ` bot+bpf-ci
2026-03-23 20:38   ` Kumar Kartikeya Dwivedi
2026-03-23 20:57     ` Mykyta Yatsenko

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.