[PATCHv3 bpf 1/2] bpf: Disable preemption in bpf_perf_event_output

public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCHv3 bpf 1/2] bpf: Disable preemption in bpf_perf_event_output
       [not found] <20230725084206.580930-1-jolsa@kernel.org>
@ 2023-07-25  8:42 ` Jiri Olsa
  2023-07-25  8:42 ` [PATCHv3 bpf 2/2] bpf: Disable preemption in bpf_event_output Jiri Olsa
  1 sibling, 0 replies; 3+ messages in thread
From: Jiri Olsa @ 2023-07-25  8:42 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: stable, Hou Tao, bpf, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo

The nesting protection in bpf_perf_event_output relies on disabled
preemption, which is guaranteed for kprobes and tracepoints.

However, bpf_perf_event_output can also be called from uprobe context
through the bpf_prog_run_array_sleepable function, which disables
migration but keeps preemption enabled.

This can cause a task to be preempted by another one inside the nesting
protection, eventually leading to two tasks using the same
perf_sample_data buffer and causing crashes like:

  kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
  BUG: unable to handle page fault for address: ffffffff82be3eea
  ...
  Call Trace:
   ? __die+0x1f/0x70
   ? page_fault_oops+0x176/0x4d0
   ? exc_page_fault+0x132/0x230
   ? asm_exc_page_fault+0x22/0x30
   ? perf_output_sample+0x12b/0x910
   ? perf_event_output+0xd0/0x1d0
   ? bpf_perf_event_output+0x162/0x1d0
   ? bpf_prog_c6271286d9a4c938_krava1+0x76/0x87
   ? __uprobe_perf_func+0x12b/0x540
   ? uprobe_dispatcher+0x2c4/0x430
   ? uprobe_notify_resume+0x2da/0xce0
   ? atomic_notifier_call_chain+0x7b/0x110
   ? exit_to_user_mode_prepare+0x13e/0x290
   ? irqentry_exit_to_user_mode+0x5/0x30
   ? asm_exc_int3+0x35/0x40

Fix this by disabling preemption in bpf_perf_event_output.

Cc: stable@vger.kernel.org
Fixes: 8c7dcb84e3b7 ("bpf: implement sleepable uprobes by chaining gps")
Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 kernel/trace/bpf_trace.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5f2dcabad202..14c9a1a548c9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -661,8 +661,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level);
 BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
-	struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
-	int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
+	struct bpf_trace_sample_data *sds;
 	struct perf_raw_record raw = {
 		.frag = {
 			.size = size,
@@ -670,7 +669,12 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 		},
 	};
 	struct perf_sample_data *sd;
-	int err;
+	int nest_level, err;
+
+	preempt_disable();
+
+	sds = this_cpu_ptr(&bpf_trace_sds);
+	nest_level = this_cpu_inc_return(bpf_trace_nest_level);
 
 	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
 		err = -EBUSY;
@@ -691,6 +695,7 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 
 out:
 	this_cpu_dec(bpf_trace_nest_level);
+	preempt_enable();
 	return err;
 }
 
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCHv3 bpf 2/2] bpf: Disable preemption in bpf_event_output
       [not found] <20230725084206.580930-1-jolsa@kernel.org>
  2023-07-25  8:42 ` [PATCHv3 bpf 1/2] bpf: Disable preemption in bpf_perf_event_output Jiri Olsa
@ 2023-07-25  8:42 ` Jiri Olsa
  2023-07-26  0:13   ` Alexei Starovoitov
  1 sibling, 1 reply; 3+ messages in thread
From: Jiri Olsa @ 2023-07-25  8:42 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
  Cc: stable, Hou Tao, bpf, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo

We received a report [1] of a kernel crash, which is caused by
using the nesting protection without disabling preemption.

bpf_event_output can be called by programs executed by the
bpf_prog_run_array_cg function, which disables migration but
keeps preemption enabled.

This can cause a task to be preempted by another one inside the
nesting protection, eventually leading to two tasks using the same
perf_sample_data buffer and causing crashes like:

  BUG: kernel NULL pointer dereference, address: 0000000000000001
  #PF: supervisor instruction fetch in kernel mode
  #PF: error_code(0x0010) - not-present page
  ...
  ? perf_output_sample+0x12a/0x9a0
  ? finish_task_switch.isra.0+0x81/0x280
  ? perf_event_output+0x66/0xa0
  ? bpf_event_output+0x13a/0x190
  ? bpf_event_output_data+0x22/0x40
  ? bpf_prog_dfc84bbde731b257_cil_sock4_connect+0x40a/0xacb
  ? xa_load+0x87/0xe0
  ? __cgroup_bpf_run_filter_sock_addr+0xc1/0x1a0
  ? release_sock+0x3e/0x90
  ? sk_setsockopt+0x1a1/0x12f0
  ? udp_pre_connect+0x36/0x50
  ? inet_dgram_connect+0x93/0xa0
  ? __sys_connect+0xb4/0xe0
  ? udp_setsockopt+0x27/0x40
  ? __pfx_udp_push_pending_frames+0x10/0x10
  ? __sys_setsockopt+0xdf/0x1a0
  ? __x64_sys_connect+0xf/0x20
  ? do_syscall_64+0x3a/0x90
  ? entry_SYSCALL_64_after_hwframe+0x72/0xdc

Fix this by disabling preemption in bpf_event_output.

[1] https://github.com/cilium/cilium/issues/26756
Cc: stable@vger.kernel.org
Reported-by: Oleg "livelace" Popov <o.popov@livelace.ru>
Closes: https://github.com/cilium/cilium/issues/26756
Fixes: 2a916f2f546c ("bpf: Use migrate_disable/enable in array macros and cgroup/lirc code.")
Acked-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 kernel/trace/bpf_trace.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 14c9a1a548c9..6826ebf750b0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -720,7 +720,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
-	int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
 	struct perf_raw_frag frag = {
 		.copy		= ctx_copy,
 		.size		= ctx_size,
@@ -737,8 +736,13 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	};
 	struct perf_sample_data *sd;
 	struct pt_regs *regs;
+	int nest_level;
 	u64 ret;
 
+	preempt_disable();
+
+	nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
+
 	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
 		ret = -EBUSY;
 		goto out;
@@ -753,6 +757,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	ret = __bpf_perf_event_output(regs, map, flags, sd);
 out:
 	this_cpu_dec(bpf_event_output_nest_level);
+	preempt_enable();
 	return ret;
 }
 
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCHv3 bpf 2/2] bpf: Disable preemption in bpf_event_output
  2023-07-25  8:42 ` [PATCHv3 bpf 2/2] bpf: Disable preemption in bpf_event_output Jiri Olsa
@ 2023-07-26  0:13   ` Alexei Starovoitov
  0 siblings, 0 replies; 3+ messages in thread
From: Alexei Starovoitov @ 2023-07-26  0:13 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, stable,
	Hou Tao, bpf, Martin KaFai Lau, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo

On Tue, Jul 25, 2023 at 1:42 AM Jiri Olsa <jolsa@kernel.org> wrote:
>
> We received report [1] of kernel crash, which is caused by
> using nesting protection without disabled preemption.
>
> The bpf_event_output can be called by programs executed by
> bpf_prog_run_array_cg function that disabled migration but
> keeps preemption enabled.
>
> This can cause task to be preempted by another one inside the
> nesting protection and lead eventually to two tasks using same
> perf_sample_data buffer and cause crashes like:
>
>   BUG: kernel NULL pointer dereference, address: 0000000000000001
>   #PF: supervisor instruction fetch in kernel mode
>   #PF: error_code(0x0010) - not-present page
>   ...
>   ? perf_output_sample+0x12a/0x9a0
>   ? finish_task_switch.isra.0+0x81/0x280
>   ? perf_event_output+0x66/0xa0
>   ? bpf_event_output+0x13a/0x190
>   ? bpf_event_output_data+0x22/0x40
>   ? bpf_prog_dfc84bbde731b257_cil_sock4_connect+0x40a/0xacb
>   ? xa_load+0x87/0xe0
>   ? __cgroup_bpf_run_filter_sock_addr+0xc1/0x1a0
>   ? release_sock+0x3e/0x90
>   ? sk_setsockopt+0x1a1/0x12f0
>   ? udp_pre_connect+0x36/0x50
>   ? inet_dgram_connect+0x93/0xa0
>   ? __sys_connect+0xb4/0xe0
>   ? udp_setsockopt+0x27/0x40
>   ? __pfx_udp_push_pending_frames+0x10/0x10
>   ? __sys_setsockopt+0xdf/0x1a0
>   ? __x64_sys_connect+0xf/0x20
>   ? do_syscall_64+0x3a/0x90
>   ? entry_SYSCALL_64_after_hwframe+0x72/0xdc
>
> Fixing this by disabling preemption in bpf_event_output.
>
> [1] https://github.com/cilium/cilium/issues/26756
> Cc: stable@vger.kernel.org
> Reported-by: Oleg "livelace" Popov <o.popov@livelace.ru>
> Closes: https://github.com/cilium/cilium/issues/26756
> Fixes: 2a916f2f546c ("bpf: Use migrate_disable/enable in array macros and cgroup/lirc code.")
> Acked-by: Hou Tao <houtao1@huawei.com>
> Signed-off-by: Jiri Olsa <jolsa@kernel.org>
> ---
>  kernel/trace/bpf_trace.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 14c9a1a548c9..6826ebf750b0 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -720,7 +720,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
>  u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>                      void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
>  {
> -       int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
>         struct perf_raw_frag frag = {
>                 .copy           = ctx_copy,
>                 .size           = ctx_size,
> @@ -737,8 +736,13 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>         };
>         struct perf_sample_data *sd;
>         struct pt_regs *regs;
> +       int nest_level;
>         u64 ret;
>
> +       preempt_disable();
> +

Removed extra empty line here and in patch 1 and applied.

Thanks!

> +       nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
> +
>         if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
>                 ret = -EBUSY;
>                 goto out;
> @@ -753,6 +757,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
>         ret = __bpf_perf_event_output(regs, map, flags, sd);
>  out:
>         this_cpu_dec(bpf_event_output_nest_level);
> +       preempt_enable();
>         return ret;
>  }
>
> --
> 2.41.0
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-07-26  0:13 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20230725084206.580930-1-jolsa@kernel.org>
2023-07-25  8:42 ` [PATCHv3 bpf 1/2] bpf: Disable preemption in bpf_perf_event_output Jiri Olsa
2023-07-25  8:42 ` [PATCHv3 bpf 2/2] bpf: Disable preemption in bpf_event_output Jiri Olsa
2023-07-26  0:13   ` Alexei Starovoitov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox