[PATCH] perf trace: Refactor augmented_raw_syscalls using bpf

Linux Perf Users
 help / color / mirror / Atom feed

* [PATCH] perf trace: Refactor augmented_raw_syscalls using bpf_loop
@ 2026-06-23 11:25 Viktor Malik
  2026-06-23 11:39 ` sashiko-bot
  2026-06-23 15:27 ` Alexei Starovoitov
  0 siblings, 2 replies; 4+ messages in thread
From: Viktor Malik @ 2026-06-23 11:25 UTC (permalink / raw)
  To: linux-perf-users
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, James Clark, Howard Chu, Viktor Malik,
	linux-kernel, bpf, Michael Petlan, stable

The loop for processing syscall args in augment_raw_syscalls has a
history of breaking with Clang updates, see e.g. commit 013eb043f37b
("perf trace: Fix BPF loading failure (-E2BIG)") from Clang 15 to 16.

Now, a similar thing happened between Clang 21 and 22. While the issue
is mitigated on the main line by a recent verifier update, it remains
broken on the 6.12 and 6.18 stable branches:

    [linux-6.18.y]# sudo perf trace true
    libbpf: prog 'sys_enter': BPF program load failed: -E2BIG
    libbpf: prog 'sys_enter': -- BEGIN PROG LOAD LOG --
    [...]
    BPF program is too large. Processed 1000001 insn
    processed 1000001 insns (limit 1000000) max_states_per_insn 40 total_states 37941 peak_states 232 mark_read 0
    -- END PROG LOAD LOG --
    libbpf: prog 'sys_enter': failed to load: -E2BIG
    libbpf: failed to load object 'augmented_raw_syscalls_bpf'
    libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -E2BIG
    Error: failed to get syscall or beauty map fd
    [...]

The reason is that the loop is quite complex and the BPF verifier often
struggles to prove that it terminates.

Fix the issue by refactoring the loop body into a callback function and
calling the bpf_loop helper. This should prevent future breakages of
this kind since the callback function has no loops. It also allows to
drop a few artificial checks to help the verifier, including the changes
introduced by 013eb043f37b.

Signed-off-by: Viktor Malik <vmalik@redhat.com>
Fixes: a68fd6a6cdd3 ("perf trace: Collect augmented data using BPF")
Fixes: 013eb043f37b ("perf trace: Fix BPF loading failure (-E2BIG)")
Cc: stable@vger.kernel.org
---
 .../bpf_skel/augmented_raw_syscalls.bpf.c     | 157 +++++++++++-------
 1 file changed, 96 insertions(+), 61 deletions(-)

diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
index 2a6e61864ee0..6d553ed3ac23 100644
--- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -429,15 +429,96 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
 	return bpf_map_lookup_elem(pids, &pid) != NULL;
 }
 
+struct args_loop_ctx {
+	struct syscall_enter_args *args;
+	unsigned int *beauty_map;
+	void *payload_offset;
+	int value_size;
+	u64 *output;
+	bool *do_output;
+};
+
+static long process_arg_cb(u64 i, void *ctx)
+{
+	/*
+	 * Determine what type of argument and how many bytes to read from user space, using the
+	 * value in the beauty_map. This is the relation of parameter type and its corresponding
+	 * value in the beauty map, and how many bytes we read eventually:
+	 *
+	 * string: 1			      -> size of string
+	 * struct: size of struct	      -> size of struct
+	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
+	 */
+	struct augmented_arg *augmented_arg;
+	struct args_loop_ctx *loop_ctx;
+	int aug_size, size, index;
+	bool augmented;
+	void *arg;
+
+	/* Bounds check for the below map access to help the verifier */
+	if (i < 0 || i >= 6)
+		return 1;
+
+	loop_ctx = (struct args_loop_ctx *)ctx;
+	arg = (void *)loop_ctx->args->args[i];
+	augmented = false;
+	size = loop_ctx->beauty_map[i];
+	aug_size = size; /* size of the augmented data read from user space */
+	augmented_arg = (struct augmented_arg *)loop_ctx->payload_offset;
+
+	if (size == 0 || arg == NULL)
+		return 0; /* continue */
+
+	if (size == 1) { /* string */
+		aug_size = bpf_probe_read_user_str(augmented_arg->value, loop_ctx->value_size, arg);
+		augmented = true;
+	} else if (size > 0 && size <= loop_ctx->value_size) { /* struct */
+		if (!bpf_probe_read_user(augmented_arg->value, size, arg))
+			augmented = true;
+	} else if (size < 0 && size >= -6) { /* buffer */
+		index = -(size + 1);
+		barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
+		index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
+		aug_size = loop_ctx->args->args[index];
+
+		if (aug_size > TRACE_AUG_MAX_BUF)
+			aug_size = TRACE_AUG_MAX_BUF;
+
+		if (aug_size > 0) {
+			if (!bpf_probe_read_user(augmented_arg->value, aug_size, arg))
+				augmented = true;
+		}
+	}
+
+	/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
+	if (aug_size > loop_ctx->value_size)
+		aug_size = loop_ctx->value_size;
+
+	/* write data to payload */
+	if (augmented) {
+		int written = offsetof(struct augmented_arg, value) + aug_size;
+
+		if (written < 0 || written > sizeof(struct augmented_arg))
+			return 1; /* break */
+
+		augmented_arg->size = aug_size;
+		*loop_ctx->output += written;
+		loop_ctx->payload_offset += written;
+		*loop_ctx->do_output = true;
+	}
+
+	return 0;
+}
+
 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 {
-	bool augmented, do_output = false;
-	int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
+	bool do_output = false;
+	int zero = 0, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
 	u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
-	s64 aug_size, size;
 	unsigned int nr, *beauty_map;
 	struct beauty_payload_enter *payload;
-	void *arg, *payload_offset;
+	void *payload_offset;
+	long iters;
 
 	/* fall back to do predefined tail call */
 	if (args == NULL)
@@ -457,63 +538,17 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
 	/* copy the sys_enter header, which has the syscall_nr */
 	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
 
-	/*
-	 * Determine what type of argument and how many bytes to read from user space, using the
-	 * value in the beauty_map. This is the relation of parameter type and its corresponding
-	 * value in the beauty map, and how many bytes we read eventually:
-	 *
-	 * string: 1			      -> size of string
-	 * struct: size of struct	      -> size of struct
-	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
-	 */
-	for (int i = 0; i < 6; i++) {
-		arg = (void *)args->args[i];
-		augmented = false;
-		size = beauty_map[i];
-		aug_size = size; /* size of the augmented data read from user space */
-
-		if (size == 0 || arg == NULL)
-			continue;
-
-		if (size == 1) { /* string */
-			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
-			/* minimum of 0 to pass the verifier */
-			if (aug_size < 0)
-				aug_size = 0;
-
-			augmented = true;
-		} else if (size > 0 && size <= value_size) { /* struct */
-			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
-				augmented = true;
-		} else if ((int)size < 0 && size >= -6) { /* buffer */
-			index = -(size + 1);
-			barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
-			index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
-			aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index];
-
-			if (aug_size > 0) {
-				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
-					augmented = true;
-			}
-		}
-
-		/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
-		if (aug_size > value_size)
-			aug_size = value_size;
-
-		/* write data to payload */
-		if (augmented) {
-			int written = offsetof(struct augmented_arg, value) + aug_size;
-
-			if (written < 0 || written > sizeof(struct augmented_arg))
-				return 1;
-
-			((struct augmented_arg *)payload_offset)->size = aug_size;
-			output += written;
-			payload_offset += written;
-			do_output = true;
-		}
-	}
+	struct args_loop_ctx loop_ctx = {
+		.args = args,
+		.beauty_map = beauty_map,
+		.payload_offset = payload_offset,
+		.value_size = value_size,
+		.output = &output,
+		.do_output = &do_output
+	};
+	iters = bpf_loop(6, process_arg_cb, &loop_ctx, 0);
+	if (iters != 6)
+		return 1;
 
 	if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter))
 		return 1;
-- 
2.54.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] perf trace: Refactor augmented_raw_syscalls using bpf_loop
  2026-06-23 11:25 [PATCH] perf trace: Refactor augmented_raw_syscalls using bpf_loop Viktor Malik
@ 2026-06-23 11:39 ` sashiko-bot
  2026-06-23 15:27 ` Alexei Starovoitov
  1 sibling, 0 replies; 4+ messages in thread
From: sashiko-bot @ 2026-06-23 11:39 UTC (permalink / raw)
  To: Viktor Malik; +Cc: linux-perf-users, bpf

Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [Medium] Removal of the `aug_size < 0` check causes negative BPF helper error codes to corrupt the perf event payload and abort tracing.
- [Medium] Truncation of 64-bit syscall buffer lengths to a 32-bit signed integer silently drops buffer augmentations or bypasses bounds checks.
--

commit 9622318b331a56255744b4baafdc95c2083ba210
Author: Viktor Malik <vmalik@redhat.com>
perf trace: Refactor augmented_raw_syscalls using bpf_loop

This patch refactors the loop body in augment_raw_syscalls into a
callback function to be used with the bpf_loop helper. This helps the
BPF verifier prove termination and avoids -E2BIG load failures.

> diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
[ ... ]
> +static long process_arg_cb(u64 i, void *ctx)
> +{
[ ... ]
> +	struct augmented_arg *augmented_arg;
> +	struct args_loop_ctx *loop_ctx;
> +	int aug_size, size, index;
> +	bool augmented;
> +	void *arg;
[ ... ]
> +	if (size == 1) { /* string */
> +		aug_size = bpf_probe_read_user_str(augmented_arg->value, loop_ctx->value_size, arg);

[Severity: Medium]
Since bpf_probe_read_user_str() can return a negative error code like
-EFAULT, does the removal of the previous clamping check cause an issue
here?

If a negative error code is assigned to aug_size, it might result in a
negative size being added to offsetof() later, which could abort the loop
prematurely or misalign payload offsets:

    int written = offsetof(struct augmented_arg, value) + aug_size;

    if (written < 0 || written > sizeof(struct augmented_arg))
        return 1; /* break */

> +		augmented = true;
> +	} else if (size > 0 && size <= loop_ctx->value_size) { /* struct */
> +		if (!bpf_probe_read_user(augmented_arg->value, size, arg))
> +			augmented = true;
> +	} else if (size < 0 && size >= -6) { /* buffer */
> +		index = -(size + 1);
> +		barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
> +		index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
> +		aug_size = loop_ctx->args->args[index];

[Severity: Medium]
Does assigning the 64-bit user-provided buffer length to a 32-bit signed
integer (aug_size) cause truncation before bounds checking is performed?

If a length over 2GB is provided, it might wrap to a negative number and
fail the subsequent aug_size > 0 check. Values over 4GB could wrap to small
positive numbers, bypass the max buffer size clamp, and result in capturing
fewer bytes than expected.

> +
> +		if (aug_size > TRACE_AUG_MAX_BUF)
> +			aug_size = TRACE_AUG_MAX_BUF;
> +
> +		if (aug_size > 0) {
> +			if (!bpf_probe_read_user(augmented_arg->value, aug_size, arg))
> +				augmented = true;
> +		}
> +	}
[ ... ]

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260623112533.1151502-1-vmalik@redhat.com?part=1

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] perf trace: Refactor augmented_raw_syscalls using bpf_loop
  2026-06-23 11:25 [PATCH] perf trace: Refactor augmented_raw_syscalls using bpf_loop Viktor Malik
  2026-06-23 11:39 ` sashiko-bot
@ 2026-06-23 15:27 ` Alexei Starovoitov
  2026-06-23 17:10   ` Namhyung Kim
  1 sibling, 1 reply; 4+ messages in thread
From: Alexei Starovoitov @ 2026-06-23 15:27 UTC (permalink / raw)
  To: Viktor Malik, linux-perf-users
  Cc: Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo,
	Namhyung Kim, Mark Rutland, Alexander Shishkin, Jiri Olsa,
	Ian Rogers, Adrian Hunter, James Clark, Howard Chu, linux-kernel,
	bpf, Michael Petlan, stable

On Tue Jun 23, 2026 at 4:25 AM PDT, Viktor Malik wrote:
> The loop for processing syscall args in augment_raw_syscalls has a
> history of breaking with Clang updates, see e.g. commit 013eb043f37b
> ("perf trace: Fix BPF loading failure (-E2BIG)") from Clang 15 to 16.
>
> Now, a similar thing happened between Clang 21 and 22. While the issue
> is mitigated on the main line by a recent verifier update, it remains
> broken on the 6.12 and 6.18 stable branches:
>
>     [linux-6.18.y]# sudo perf trace true
>     libbpf: prog 'sys_enter': BPF program load failed: -E2BIG
>     libbpf: prog 'sys_enter': -- BEGIN PROG LOAD LOG --
>     [...]
>     BPF program is too large. Processed 1000001 insn
>     processed 1000001 insns (limit 1000000) max_states_per_insn 40 total_states 37941 peak_states 232 mark_read 0
>     -- END PROG LOAD LOG --
>     libbpf: prog 'sys_enter': failed to load: -E2BIG
>     libbpf: failed to load object 'augmented_raw_syscalls_bpf'
>     libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -E2BIG
>     Error: failed to get syscall or beauty map fd
>     [...]
>
> The reason is that the loop is quite complex and the BPF verifier often
> struggles to prove that it terminates.
>
> Fix the issue by refactoring the loop body into a callback function and
> calling the bpf_loop helper. This should prevent future breakages of
> this kind since the callback function has no loops. It also allows to
> drop a few artificial checks to help the verifier, including the changes
> introduced by 013eb043f37b.
>
> Signed-off-by: Viktor Malik <vmalik@redhat.com>
> Fixes: a68fd6a6cdd3 ("perf trace: Collect augmented data using BPF")
> Fixes: 013eb043f37b ("perf trace: Fix BPF loading failure (-E2BIG)")
> Cc: stable@vger.kernel.org
> ---
>  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 157 +++++++++++-------
>  1 file changed, 96 insertions(+), 61 deletions(-)
>
> diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> index 2a6e61864ee0..6d553ed3ac23 100644
> --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> @@ -429,15 +429,96 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
>  	return bpf_map_lookup_elem(pids, &pid) != NULL;
>  }
>  
> +struct args_loop_ctx {
> +	struct syscall_enter_args *args;
> +	unsigned int *beauty_map;
> +	void *payload_offset;
> +	int value_size;
> +	u64 *output;
> +	bool *do_output;
> +};
> +
> +static long process_arg_cb(u64 i, void *ctx)
> +{
> +	/*
> +	 * Determine what type of argument and how many bytes to read from user space, using the
> +	 * value in the beauty_map. This is the relation of parameter type and its corresponding
> +	 * value in the beauty map, and how many bytes we read eventually:
> +	 *
> +	 * string: 1			      -> size of string
> +	 * struct: size of struct	      -> size of struct
> +	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
> +	 */
> +	struct augmented_arg *augmented_arg;
> +	struct args_loop_ctx *loop_ctx;
> +	int aug_size, size, index;
> +	bool augmented;
> +	void *arg;
> +
> +	/* Bounds check for the below map access to help the verifier */
> +	if (i < 0 || i >= 6)
> +		return 1;
> +
> +	loop_ctx = (struct args_loop_ctx *)ctx;
> +	arg = (void *)loop_ctx->args->args[i];
> +	augmented = false;
> +	size = loop_ctx->beauty_map[i];
> +	aug_size = size; /* size of the augmented data read from user space */
> +	augmented_arg = (struct augmented_arg *)loop_ctx->payload_offset;
> +
> +	if (size == 0 || arg == NULL)
> +		return 0; /* continue */
> +
> +	if (size == 1) { /* string */
> +		aug_size = bpf_probe_read_user_str(augmented_arg->value, loop_ctx->value_size, arg);
> +		augmented = true;
> +	} else if (size > 0 && size <= loop_ctx->value_size) { /* struct */
> +		if (!bpf_probe_read_user(augmented_arg->value, size, arg))
> +			augmented = true;
> +	} else if (size < 0 && size >= -6) { /* buffer */
> +		index = -(size + 1);
> +		barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
> +		index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
> +		aug_size = loop_ctx->args->args[index];
> +
> +		if (aug_size > TRACE_AUG_MAX_BUF)
> +			aug_size = TRACE_AUG_MAX_BUF;
> +
> +		if (aug_size > 0) {
> +			if (!bpf_probe_read_user(augmented_arg->value, aug_size, arg))
> +				augmented = true;
> +		}
> +	}
> +
> +	/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
> +	if (aug_size > loop_ctx->value_size)
> +		aug_size = loop_ctx->value_size;
> +
> +	/* write data to payload */
> +	if (augmented) {
> +		int written = offsetof(struct augmented_arg, value) + aug_size;
> +
> +		if (written < 0 || written > sizeof(struct augmented_arg))
> +			return 1; /* break */
> +
> +		augmented_arg->size = aug_size;
> +		*loop_ctx->output += written;
> +		loop_ctx->payload_offset += written;
> +		*loop_ctx->do_output = true;
> +	}
> +
> +	return 0;
> +}
> +
>  static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
>  {
> -	bool augmented, do_output = false;
> -	int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
> +	bool do_output = false;
> +	int zero = 0, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
>  	u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
> -	s64 aug_size, size;
>  	unsigned int nr, *beauty_map;
>  	struct beauty_payload_enter *payload;
> -	void *arg, *payload_offset;
> +	void *payload_offset;
> +	long iters;
>  
>  	/* fall back to do predefined tail call */
>  	if (args == NULL)
> @@ -457,63 +538,17 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
>  	/* copy the sys_enter header, which has the syscall_nr */
>  	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
>  
> -	/*
> -	 * Determine what type of argument and how many bytes to read from user space, using the
> -	 * value in the beauty_map. This is the relation of parameter type and its corresponding
> -	 * value in the beauty map, and how many bytes we read eventually:
> -	 *
> -	 * string: 1			      -> size of string
> -	 * struct: size of struct	      -> size of struct
> -	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
> -	 */
> -	for (int i = 0; i < 6; i++) {
> -		arg = (void *)args->args[i];
> -		augmented = false;
> -		size = beauty_map[i];
> -		aug_size = size; /* size of the augmented data read from user space */
> -
> -		if (size == 0 || arg == NULL)
> -			continue;
> -
> -		if (size == 1) { /* string */
> -			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
> -			/* minimum of 0 to pass the verifier */
> -			if (aug_size < 0)
> -				aug_size = 0;
> -
> -			augmented = true;
> -		} else if (size > 0 && size <= value_size) { /* struct */
> -			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
> -				augmented = true;
> -		} else if ((int)size < 0 && size >= -6) { /* buffer */
> -			index = -(size + 1);
> -			barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
> -			index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
> -			aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index];
> -
> -			if (aug_size > 0) {
> -				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
> -					augmented = true;
> -			}
> -		}
> -
> -		/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
> -		if (aug_size > value_size)
> -			aug_size = value_size;
> -
> -		/* write data to payload */
> -		if (augmented) {
> -			int written = offsetof(struct augmented_arg, value) + aug_size;
> -
> -			if (written < 0 || written > sizeof(struct augmented_arg))
> -				return 1;
> -
> -			((struct augmented_arg *)payload_offset)->size = aug_size;
> -			output += written;
> -			payload_offset += written;
> -			do_output = true;
> -		}
> -	}
> +	struct args_loop_ctx loop_ctx = {
> +		.args = args,
> +		.beauty_map = beauty_map,
> +		.payload_offset = payload_offset,
> +		.value_size = value_size,
> +		.output = &output,
> +		.do_output = &do_output
> +	};
> +	iters = bpf_loop(6, process_arg_cb, &loop_ctx, 0);

bpf_loop() is old and generally not recommended.
Please use bpf_for() then the diff will be one line change and
can scale to any number of args. Not just 6.


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] perf trace: Refactor augmented_raw_syscalls using bpf_loop
  2026-06-23 15:27 ` Alexei Starovoitov
@ 2026-06-23 17:10   ` Namhyung Kim
  0 siblings, 0 replies; 4+ messages in thread
From: Namhyung Kim @ 2026-06-23 17:10 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Viktor Malik, linux-perf-users, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Mark Rutland, Alexander Shishkin,
	Jiri Olsa, Ian Rogers, Adrian Hunter, James Clark, Howard Chu,
	linux-kernel, bpf, Michael Petlan, stable

Hello,

On Tue, Jun 23, 2026 at 08:27:39AM -0700, Alexei Starovoitov wrote:
> On Tue Jun 23, 2026 at 4:25 AM PDT, Viktor Malik wrote:
> > The loop for processing syscall args in augment_raw_syscalls has a
> > history of breaking with Clang updates, see e.g. commit 013eb043f37b
> > ("perf trace: Fix BPF loading failure (-E2BIG)") from Clang 15 to 16.
> >
> > Now, a similar thing happened between Clang 21 and 22. While the issue
> > is mitigated on the main line by a recent verifier update, it remains
> > broken on the 6.12 and 6.18 stable branches:
> >
> >     [linux-6.18.y]# sudo perf trace true
> >     libbpf: prog 'sys_enter': BPF program load failed: -E2BIG
> >     libbpf: prog 'sys_enter': -- BEGIN PROG LOAD LOG --
> >     [...]
> >     BPF program is too large. Processed 1000001 insn
> >     processed 1000001 insns (limit 1000000) max_states_per_insn 40 total_states 37941 peak_states 232 mark_read 0
> >     -- END PROG LOAD LOG --
> >     libbpf: prog 'sys_enter': failed to load: -E2BIG
> >     libbpf: failed to load object 'augmented_raw_syscalls_bpf'
> >     libbpf: failed to load BPF skeleton 'augmented_raw_syscalls_bpf': -E2BIG
> >     Error: failed to get syscall or beauty map fd
> >     [...]
> >
> > The reason is that the loop is quite complex and the BPF verifier often
> > struggles to prove that it terminates.
> >
> > Fix the issue by refactoring the loop body into a callback function and
> > calling the bpf_loop helper. This should prevent future breakages of
> > this kind since the callback function has no loops. It also allows to
> > drop a few artificial checks to help the verifier, including the changes
> > introduced by 013eb043f37b.

Thanks for working on this.  I encountered this issue before and never
found time to take a deeper look yet.

> >
> > Signed-off-by: Viktor Malik <vmalik@redhat.com>
> > Fixes: a68fd6a6cdd3 ("perf trace: Collect augmented data using BPF")
> > Fixes: 013eb043f37b ("perf trace: Fix BPF loading failure (-E2BIG)")
> > Cc: stable@vger.kernel.org
> > ---
> >  .../bpf_skel/augmented_raw_syscalls.bpf.c     | 157 +++++++++++-------
> >  1 file changed, 96 insertions(+), 61 deletions(-)
> >
> > diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > index 2a6e61864ee0..6d553ed3ac23 100644
> > --- a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
> > @@ -429,15 +429,96 @@ static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
> >  	return bpf_map_lookup_elem(pids, &pid) != NULL;
> >  }
> >  
> > +struct args_loop_ctx {
> > +	struct syscall_enter_args *args;
> > +	unsigned int *beauty_map;
> > +	void *payload_offset;
> > +	int value_size;
> > +	u64 *output;
> > +	bool *do_output;
> > +};
> > +
> > +static long process_arg_cb(u64 i, void *ctx)
> > +{
> > +	/*
> > +	 * Determine what type of argument and how many bytes to read from user space, using the
> > +	 * value in the beauty_map. This is the relation of parameter type and its corresponding
> > +	 * value in the beauty map, and how many bytes we read eventually:
> > +	 *
> > +	 * string: 1			      -> size of string
> > +	 * struct: size of struct	      -> size of struct
> > +	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
> > +	 */
> > +	struct augmented_arg *augmented_arg;
> > +	struct args_loop_ctx *loop_ctx;
> > +	int aug_size, size, index;
> > +	bool augmented;
> > +	void *arg;
> > +
> > +	/* Bounds check for the below map access to help the verifier */
> > +	if (i < 0 || i >= 6)
> > +		return 1;
> > +
> > +	loop_ctx = (struct args_loop_ctx *)ctx;
> > +	arg = (void *)loop_ctx->args->args[i];
> > +	augmented = false;
> > +	size = loop_ctx->beauty_map[i];
> > +	aug_size = size; /* size of the augmented data read from user space */
> > +	augmented_arg = (struct augmented_arg *)loop_ctx->payload_offset;
> > +
> > +	if (size == 0 || arg == NULL)
> > +		return 0; /* continue */
> > +
> > +	if (size == 1) { /* string */
> > +		aug_size = bpf_probe_read_user_str(augmented_arg->value, loop_ctx->value_size, arg);
> > +		augmented = true;
> > +	} else if (size > 0 && size <= loop_ctx->value_size) { /* struct */
> > +		if (!bpf_probe_read_user(augmented_arg->value, size, arg))
> > +			augmented = true;
> > +	} else if (size < 0 && size >= -6) { /* buffer */
> > +		index = -(size + 1);
> > +		barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
> > +		index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
> > +		aug_size = loop_ctx->args->args[index];
> > +
> > +		if (aug_size > TRACE_AUG_MAX_BUF)
> > +			aug_size = TRACE_AUG_MAX_BUF;
> > +
> > +		if (aug_size > 0) {
> > +			if (!bpf_probe_read_user(augmented_arg->value, aug_size, arg))
> > +				augmented = true;
> > +		}
> > +	}
> > +
> > +	/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
> > +	if (aug_size > loop_ctx->value_size)
> > +		aug_size = loop_ctx->value_size;
> > +
> > +	/* write data to payload */
> > +	if (augmented) {
> > +		int written = offsetof(struct augmented_arg, value) + aug_size;
> > +
> > +		if (written < 0 || written > sizeof(struct augmented_arg))
> > +			return 1; /* break */
> > +
> > +		augmented_arg->size = aug_size;
> > +		*loop_ctx->output += written;
> > +		loop_ctx->payload_offset += written;
> > +		*loop_ctx->do_output = true;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
> >  {
> > -	bool augmented, do_output = false;
> > -	int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
> > +	bool do_output = false;
> > +	int zero = 0, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
> >  	u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
> > -	s64 aug_size, size;
> >  	unsigned int nr, *beauty_map;
> >  	struct beauty_payload_enter *payload;
> > -	void *arg, *payload_offset;
> > +	void *payload_offset;
> > +	long iters;
> >  
> >  	/* fall back to do predefined tail call */
> >  	if (args == NULL)
> > @@ -457,63 +538,17 @@ static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
> >  	/* copy the sys_enter header, which has the syscall_nr */
> >  	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
> >  
> > -	/*
> > -	 * Determine what type of argument and how many bytes to read from user space, using the
> > -	 * value in the beauty_map. This is the relation of parameter type and its corresponding
> > -	 * value in the beauty map, and how many bytes we read eventually:
> > -	 *
> > -	 * string: 1			      -> size of string
> > -	 * struct: size of struct	      -> size of struct
> > -	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
> > -	 */
> > -	for (int i = 0; i < 6; i++) {
> > -		arg = (void *)args->args[i];
> > -		augmented = false;
> > -		size = beauty_map[i];
> > -		aug_size = size; /* size of the augmented data read from user space */
> > -
> > -		if (size == 0 || arg == NULL)
> > -			continue;
> > -
> > -		if (size == 1) { /* string */
> > -			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
> > -			/* minimum of 0 to pass the verifier */
> > -			if (aug_size < 0)
> > -				aug_size = 0;
> > -
> > -			augmented = true;
> > -		} else if (size > 0 && size <= value_size) { /* struct */
> > -			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
> > -				augmented = true;
> > -		} else if ((int)size < 0 && size >= -6) { /* buffer */
> > -			index = -(size + 1);
> > -			barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
> > -			index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
> > -			aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index];
> > -
> > -			if (aug_size > 0) {
> > -				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
> > -					augmented = true;
> > -			}
> > -		}
> > -
> > -		/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
> > -		if (aug_size > value_size)
> > -			aug_size = value_size;
> > -
> > -		/* write data to payload */
> > -		if (augmented) {
> > -			int written = offsetof(struct augmented_arg, value) + aug_size;
> > -
> > -			if (written < 0 || written > sizeof(struct augmented_arg))
> > -				return 1;
> > -
> > -			((struct augmented_arg *)payload_offset)->size = aug_size;
> > -			output += written;
> > -			payload_offset += written;
> > -			do_output = true;
> > -		}
> > -	}
> > +	struct args_loop_ctx loop_ctx = {
> > +		.args = args,
> > +		.beauty_map = beauty_map,
> > +		.payload_offset = payload_offset,
> > +		.value_size = value_size,
> > +		.output = &output,
> > +		.do_output = &do_output
> > +	};
> > +	iters = bpf_loop(6, process_arg_cb, &loop_ctx, 0);
> 
> bpf_loop() is old and generally not recommended.
> Please use bpf_for() then the diff will be one line change and
> can scale to any number of args. Not just 6.
 
One thing we should take care is to support old kernels.  The oldest
LTS kernel in the kernel.org is 5.10 and bpf_loop() was introduced in
5.17 and bpf_for (bpf_iter_num) was 6.4.

Maybe we can factor out the loop body and call it from different
mechanisms like open-coded loop, bpf_loop or bpf_for depending on the
kernel version.  But not sure it'd fix the verifier issue though.

Thanks,
Namhyung


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-06-23 17:10 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-23 11:25 [PATCH] perf trace: Refactor augmented_raw_syscalls using bpf_loop Viktor Malik
2026-06-23 11:39 ` sashiko-bot
2026-06-23 15:27 ` Alexei Starovoitov
2026-06-23 17:10   ` Namhyung Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox