BPF List

BPF List
 help / color / mirror / Atom feed

* Re: [PATCH bpf-next v9 3/5] selftests/bpf: add bpf_icmp_send kfunc cgroup_skb IPv6 tests
From: Jordan Rife @ 2026-06-24 21:24 UTC (permalink / raw)
  To: Mahe Tardy
  Cc: bpf, andrii, ast, daniel, john.fastabend, martin.lau,
	yonghong.song, emil
In-Reply-To: <20260624185554.362555-4-mahe.tardy@gmail.com>

On Wed, Jun 24, 2026 at 06:55:52PM +0000, Mahe Tardy wrote:
> This test extends the existing cgroup_skb tests with IPv6 support.
> 
> Note that we need to set IPV6_RECVERR on the socket for IPv6 in
> connect_to_fd_nonblock otherwise the error will be ignored even if we
> are in the middle of the TCP handshake. See in
> net/ipv6/datagram.c:ipv6_icmp_error for more details.
> 
> Signed-off-by: Mahe Tardy <mahe.tardy@gmail.com>
> ---
>  .../bpf/prog_tests/icmp_send_kfunc.c          | 91 +++++++++++++------
>  tools/testing/selftests/bpf/progs/icmp_send.c | 48 ++++++++--
>  2 files changed, 101 insertions(+), 38 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> index b8a98c90053e..bbb3c3d4509c 100644
> --- a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> +++ b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> @@ -8,9 +8,11 @@
>  #define TIMEOUT_MS 1000
> 
>  #define ICMP_DEST_UNREACH 3
> +#define ICMPV6_DEST_UNREACH 1
> 
>  #define ICMP_FRAG_NEEDED 4
>  #define NR_ICMP_UNREACH 15
> +#define ICMPV6_REJECT_ROUTE 6
> 
>  #define KFUNC_RET_UNSET -1
> 
> @@ -18,7 +20,7 @@ static int connect_to_fd_nonblock(int server_fd)
>  {
>  	struct sockaddr_storage addr;
>  	socklen_t len = sizeof(addr);
> -	int fd, err;
> +	int fd, err, on = 1;
> 
>  	if (getsockname(server_fd, (struct sockaddr *)&addr, &len))
>  		return -1;
> @@ -27,6 +29,12 @@ static int connect_to_fd_nonblock(int server_fd)
>  	if (fd < 0)
>  		return -1;
> 
> +	if (addr.ss_family == AF_INET6 &&
> +	    setsockopt(fd, IPPROTO_IPV6, IPV6_RECVERR, &on, sizeof(on)) < 0) {
> +		close(fd);
> +		return -1;
> +	}
> +
>  	err = connect(fd, (struct sockaddr *)&addr, len);
>  	if (err < 0 && errno != EINPROGRESS) {
>  		close(fd);
> @@ -36,8 +44,14 @@ static int connect_to_fd_nonblock(int server_fd)
>  	return fd;
>  }
> 
> -static void read_icmp_errqueue(int sockfd, int expected_code)
> +static void read_icmp_errqueue(int sockfd, int expected_code, int af)
>  {
> +	int expected_ee_type = (af == AF_INET) ? ICMP_DEST_UNREACH :
> +						 ICMPV6_DEST_UNREACH;
> +	int expected_origin = (af == AF_INET) ? SO_EE_ORIGIN_ICMP :
> +						SO_EE_ORIGIN_ICMP6;
> +	int expected_level = (af == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6;
> +	int expected_type = (af == AF_INET) ? IP_RECVERR : IPV6_RECVERR;
>  	struct sock_extended_err *sock_err;
>  	char ctrl_buf[512];
>  	struct msghdr msg = {
> @@ -63,38 +77,43 @@ static void read_icmp_errqueue(int sockfd, int expected_code)
>  		return;
> 
>  	for (; cm; cm = CMSG_NXTHDR(&msg, cm)) {
> -		if (cm->cmsg_level != IPPROTO_IP || cm->cmsg_type != IP_RECVERR)
> +		if (cm->cmsg_level != expected_level ||
> +		    cm->cmsg_type != expected_type)
>  			continue;
> 
>  		sock_err = (struct sock_extended_err *)CMSG_DATA(cm);
> 
> -		if (!ASSERT_EQ(sock_err->ee_origin, SO_EE_ORIGIN_ICMP,
> -			       "sock_err_origin_icmp"))
> +		if (!ASSERT_EQ(sock_err->ee_origin, expected_origin,
> +			       "sock_err_origin"))
>  			return;
> -		if (!ASSERT_EQ(sock_err->ee_type, ICMP_DEST_UNREACH,
> +		if (!ASSERT_EQ(sock_err->ee_type, expected_ee_type,
>  			       "sock_err_type_dest_unreach"))
>  			return;
>  		ASSERT_EQ(sock_err->ee_code, expected_code, "sock_err_code");
>  		return;
>  	}
> 
> -	ASSERT_FAIL("no IP_RECVERR control message found");
> +	ASSERT_FAIL("no IP_RECVERR/IPV6_RECVERR control message found");
>  }
> 
> -static bool valid_unreach_code(int code)
> +static bool valid_unreach_code(int code, int af)
>  {
>  	if (code < 0)
>  		return false;
> 
> -	return code <= NR_ICMP_UNREACH && code != ICMP_FRAG_NEEDED;
> +	if (af == AF_INET)
> +		return code <= NR_ICMP_UNREACH && code != ICMP_FRAG_NEEDED;
> +
> +	return code <= ICMPV6_REJECT_ROUTE;
>  }
> 
> -static void trigger_prog_read_icmp_errqueue(struct icmp_send *skel, int code)
> +static void trigger_prog_read_icmp_errqueue(struct icmp_send *skel, int code,
> +					    int af, const char *ip)
>  {
>  	int srv_fd = -1, client_fd = -1;
>  	int port;
> 
> -	srv_fd = start_server(AF_INET, SOCK_STREAM, "127.0.0.1", 0, TIMEOUT_MS);
> +	srv_fd = start_server(af, SOCK_STREAM, ip, 0, TIMEOUT_MS);
>  	if (!ASSERT_OK_FD(srv_fd, "start_server"))
>  		return;
> 
> @@ -105,6 +124,8 @@ static void trigger_prog_read_icmp_errqueue(struct icmp_send *skel, int code)
>  	}
> 
>  	skel->bss->server_port = ntohs(port);
> +	skel->bss->unreach_type = (af == AF_INET) ? ICMP_DEST_UNREACH :
> +						    ICMPV6_DEST_UNREACH;
>  	skel->bss->unreach_code = code;
>  	skel->data->kfunc_ret = KFUNC_RET_UNSET;
> 
> @@ -114,13 +135,37 @@ static void trigger_prog_read_icmp_errqueue(struct icmp_send *skel, int code)
>  		return;
>  	}
> 
> -	if (valid_unreach_code(code))
> -		read_icmp_errqueue(client_fd, code);
> +	if (valid_unreach_code(code, af))
> +		read_icmp_errqueue(client_fd, code, af);
> 
>  	close(client_fd);
>  	close(srv_fd);
>  }
> 
> +static void run_icmp_test(struct icmp_send *skel, int af, const char *ip,
> +			  int max_code)
> +{
> +	for (int code = 0; code <= max_code; code++) {
> +		if (af == AF_INET && code == ICMP_FRAG_NEEDED)
> +			continue;
> +
> +		trigger_prog_read_icmp_errqueue(skel, code, af, ip);
> +		ASSERT_EQ(skel->data->kfunc_ret, 0, "kfunc_ret");
> +	}
> +
> +	/* Test invalid codes */
> +	trigger_prog_read_icmp_errqueue(skel, -1, af, ip);
> +	ASSERT_EQ(skel->data->kfunc_ret, -EINVAL, "kfunc_ret");
> +
> +	trigger_prog_read_icmp_errqueue(skel, max_code + 1, af, ip);
> +	ASSERT_EQ(skel->data->kfunc_ret, -EINVAL, "kfunc_ret");
> +
> +	if (af == AF_INET) {
> +		trigger_prog_read_icmp_errqueue(skel, ICMP_FRAG_NEEDED, af, ip);
> +		ASSERT_EQ(skel->data->kfunc_ret, -EINVAL, "kfunc_ret");
> +	}
> +}
> +
>  void test_icmp_send_unreach_cgroup(void)
>  {
>  	struct icmp_send *skel;
> @@ -139,23 +184,11 @@ void test_icmp_send_unreach_cgroup(void)
>  	if (!ASSERT_OK_PTR(skel->links.egress, "prog_attach_cgroup"))
>  		goto cleanup;
> 
> -	for (int code = 0; code <= NR_ICMP_UNREACH; code++) {
> -		if (code == ICMP_FRAG_NEEDED)
> -			continue;
> -
> -		trigger_prog_read_icmp_errqueue(skel, code);
> -		ASSERT_EQ(skel->data->kfunc_ret, 0, "kfunc_ret");
> -	}
> -
> -	/* Test invalid codes */
> -	trigger_prog_read_icmp_errqueue(skel, -1);
> -	ASSERT_EQ(skel->data->kfunc_ret, -EINVAL, "kfunc_ret");
> +	if (test__start_subtest("ipv4"))
> +		run_icmp_test(skel, AF_INET, "127.0.0.1", NR_ICMP_UNREACH);
> 
> -	trigger_prog_read_icmp_errqueue(skel, NR_ICMP_UNREACH + 1);
> -	ASSERT_EQ(skel->data->kfunc_ret, -EINVAL, "kfunc_ret");
> -
> -	trigger_prog_read_icmp_errqueue(skel, ICMP_FRAG_NEEDED);
> -	ASSERT_EQ(skel->data->kfunc_ret, -EINVAL, "kfunc_ret");
> +	if (test__start_subtest("ipv6"))
> +		run_icmp_test(skel, AF_INET6, "::1", ICMPV6_REJECT_ROUTE);
> 
>  cleanup:
>  	icmp_send__destroy(skel);
> diff --git a/tools/testing/selftests/bpf/progs/icmp_send.c b/tools/testing/selftests/bpf/progs/icmp_send.c
> index 6d0be0a9afe1..6e1ba539eeb0 100644
> --- a/tools/testing/selftests/bpf/progs/icmp_send.c
> +++ b/tools/testing/selftests/bpf/progs/icmp_send.c
> @@ -5,10 +5,11 @@
> 
>  /* 127.0.0.1 in host byte order */
>  #define SERVER_IP 0x7F000001
> -
> -#define ICMP_DEST_UNREACH 3
> +/* ::1 in host byte order (last 32-bit word) */
> +#define SERVER_IP6_LO 0x00000001
> 
>  __u16 server_port = 0;
> +int unreach_type = 0;
>  int unreach_code = 0;
>  int kfunc_ret = -1;
> 
> @@ -18,19 +19,48 @@ int egress(struct __sk_buff *skb)
>  	void *data = (void *)(long)skb->data;
>  	void *data_end = (void *)(long)skb->data_end;
>  	struct iphdr *iph;
> +	struct ipv6hdr *ip6h;
>  	struct tcphdr *tcph;
> +	__u8 version;
> 
> -	iph = data;
> -	if ((void *)(iph + 1) > data_end || iph->version != 4 ||
> -	    iph->protocol != IPPROTO_TCP || iph->daddr != bpf_htonl(SERVER_IP))
> +	if (data + 1 > data_end)
>  		return SK_PASS;
> 
> -	tcph = (void *)iph + iph->ihl * 4;
> -	if ((void *)(tcph + 1) > data_end ||
> -	    tcph->dest != bpf_htons(server_port))
> +	version = (*((__u8 *)data)) >> 4;
> +
> +	if (version == 4) {
> +		iph = data;
> +		if ((void *)(iph + 1) > data_end ||
> +		    iph->protocol != IPPROTO_TCP ||
> +		    iph->daddr != bpf_htonl(SERVER_IP))
> +			return SK_PASS;
> +
> +		tcph = (void *)iph + iph->ihl * 4;
> +		if ((void *)(tcph + 1) > data_end ||
> +		    tcph->dest != bpf_htons(server_port))
> +			return SK_PASS;
> +
> +	} else if (version == 6) {
> +		ip6h = data;
> +		if ((void *)(ip6h + 1) > data_end ||
> +		    ip6h->nexthdr != IPPROTO_TCP)
> +			return SK_PASS;
> +
> +		if (ip6h->daddr.in6_u.u6_addr32[0] != 0 ||
> +		    ip6h->daddr.in6_u.u6_addr32[1] != 0 ||
> +		    ip6h->daddr.in6_u.u6_addr32[2] != 0 ||
> +		    ip6h->daddr.in6_u.u6_addr32[3] != bpf_htonl(SERVER_IP6_LO))
> +			return SK_PASS;
> +
> +		tcph = (void *)(ip6h + 1);
> +		if ((void *)(tcph + 1) > data_end ||
> +		    tcph->dest != bpf_htons(server_port))
> +			return SK_PASS;
> +	} else {
>  		return SK_PASS;
> +	}
> 
> -	kfunc_ret = bpf_icmp_send(skb, ICMP_DEST_UNREACH, unreach_code);
> +	kfunc_ret = bpf_icmp_send(skb, unreach_type, unreach_code);
> 
>  	return SK_DROP;
>  }
> --
> 2.34.1
>

Reviewed-by: Jordan Rife <jordan@jrife.io>

^ permalink raw reply

* Re: [PATCH bpf 2/2] selftests/bpf: Add test for UDP sock leak on sockmap lookup-bind-release
From: Michal Luczaj @ 2026-06-24 21:24 UTC (permalink / raw)
  To: bot+bpf-ci, john.fastabend, jakub, jiayuan.chen, davem, edumazet,
	kuba, pabeni, horms, ast, cong.wang, daniel, andrii, eddyz87,
	memxor, martin.lau, song, yonghong.song, jolsa, emil, shuah
  Cc: netdev, bpf, linux-kernel, linux-kselftest, martin.lau, clm,
	ihor.solodrai
In-Reply-To: <bd4abd3f584c38f1fc512a9d3ae1f90c2a374588fa23529b32b7f1aed6ab2ead@mail.kernel.org>

On 6/23/26 21:32, bot+bpf-ci@kernel.org wrote:
>> selftests/bpf: Add test for UDP sock leak on sockmap lookup-bind-release
>>
>>     Setup and join a cgroup, then attach a cgroup/connect4 program that runs
>>
>>        sk = bpf_map_lookup_elem(sockmap, 0)
>>        bpf_bind(ctx, sa, sizeof(sa))
>>        bpf_sk_release(sk)
>>
>>     Unpatched kernel leaks the socket.
>>
>>     Signed-off-by: Michal Luczaj <mhal@rbox.co>
> 
> This test reproduces a UDP socket leak across sockmap lookup-bind-release,
> but there is no Fixes: tag.
> 
> The sibling fix commit a2510dc351c5 ("bpf, sockmap: Don't leak UDP socks on
> lookup-bind-release") carries a Fixes: tag pointing at the commit that lifted
> the socket-state restriction allowing unbound UDP sockets into sockmap.
> 
> Should this test carry the same tag so it stays backportable alongside the
> fix?
> 
>   Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")

I am skipping this suggestion as the test itself does not address any bugs
and requires no backporting.

Michal

^ permalink raw reply

* Re: [PATCH bpf-next v9 4/5] selftests/bpf: add bpf_icmp_send recursion test
From: Jordan Rife @ 2026-06-24 21:24 UTC (permalink / raw)
  To: Mahe Tardy
  Cc: bpf, andrii, ast, daniel, john.fastabend, martin.lau,
	yonghong.song, emil
In-Reply-To: <20260624185554.362555-5-mahe.tardy@gmail.com>

On Wed, Jun 24, 2026 at 06:55:53PM +0000, Mahe Tardy wrote:
> This test is similar to test_icmp_send_unreach_cgroup but checks that,
> in case of recursion, meaning that the BPF program calling the kfunc was
> re-triggered by the icmp_send done by the kfunc, the kfunc will stop
> early and return -EBUSY.
> 
> The test attaches to the root cgroup to ensure the ICMP packet generated
> by the kfunc re-triggers the BPF program.
> 
> Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
> Signed-off-by: Mahe Tardy <mahe.tardy@gmail.com>
> ---
>  .../bpf/prog_tests/icmp_send_kfunc.c          | 46 ++++++++++++++++
>  tools/testing/selftests/bpf/progs/icmp_send.c | 55 +++++++++++++++++++
>  2 files changed, 101 insertions(+)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> index bbb3c3d4509c..bb532aa0d158 100644
> --- a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> +++ b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> @@ -1,8 +1,10 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #include <test_progs.h>
>  #include <network_helpers.h>
> +#include <cgroup_helpers.h>
>  #include <linux/errqueue.h>
>  #include <poll.h>
> +#include <unistd.h>
>  #include "icmp_send.skel.h"
> 
>  #define TIMEOUT_MS 1000
> @@ -10,6 +12,7 @@
>  #define ICMP_DEST_UNREACH 3
>  #define ICMPV6_DEST_UNREACH 1
> 
> +#define ICMP_HOST_UNREACH 1
>  #define ICMP_FRAG_NEEDED 4
>  #define NR_ICMP_UNREACH 15
>  #define ICMPV6_REJECT_ROUTE 6
> @@ -195,3 +198,46 @@ void test_icmp_send_unreach_cgroup(void)
>  	if (cgroup_fd >= 0)
>  		close(cgroup_fd);
>  }
> +
> +void test_icmp_send_unreach_recursion(void)
> +{
> +	struct icmp_send *skel;
> +	int cgroup_fd = -1;
> +	int err;
> +
> +	err = setup_cgroup_environment();
> +	if (!ASSERT_OK(err, "setup_cgroup_environment"))
> +		return;
> +
> +	skel = icmp_send__open_and_load();
> +	if (!ASSERT_OK_PTR(skel, "skel_open"))
> +		goto cleanup;
> +
> +	cgroup_fd = get_root_cgroup();
> +	if (!ASSERT_OK_FD(cgroup_fd, "get_root_cgroup"))
> +		goto cleanup;
> +
> +	skel->data->target_pid = getpid();
> +	skel->links.recursion =
> +		bpf_program__attach_cgroup(skel->progs.recursion, cgroup_fd);
> +	if (!ASSERT_OK_PTR(skel->links.recursion, "prog_attach_cgroup"))
> +		goto cleanup;
> +
> +	trigger_prog_read_icmp_errqueue(skel, ICMP_HOST_UNREACH, AF_INET,
> +					"127.0.0.1");
> +
> +	/*
> +	 * Because there's recursion involved, the first call will return at
> +	 * index 1 since it will return the second, and the second call will
> +	 * return at index 0 since it will return the first.
> +	 */
> +	ASSERT_EQ(skel->bss->rec_count, 2, "rec_count");
> +	ASSERT_EQ(skel->data->rec_kfunc_rets[0], -EBUSY, "kfunc_rets[0]");
> +	ASSERT_EQ(skel->data->rec_kfunc_rets[1], 0, "kfunc_rets[1]");
> +
> +cleanup:
> +	icmp_send__destroy(skel);
> +	if (cgroup_fd >= 0)
> +		close(cgroup_fd);
> +	cleanup_cgroup_environment();
> +}
> diff --git a/tools/testing/selftests/bpf/progs/icmp_send.c b/tools/testing/selftests/bpf/progs/icmp_send.c
> index 6e1ba539eeb0..c642ccdf9fd5 100644
> --- a/tools/testing/selftests/bpf/progs/icmp_send.c
> +++ b/tools/testing/selftests/bpf/progs/icmp_send.c
> @@ -12,6 +12,10 @@ __u16 server_port = 0;
>  int unreach_type = 0;
>  int unreach_code = 0;
>  int kfunc_ret = -1;
> +int target_pid = -1;
> +
> +unsigned int rec_count = 0;
> +int rec_kfunc_rets[] = { -1, -1 };
> 
>  SEC("cgroup_skb/egress")
>  int egress(struct __sk_buff *skb)
> @@ -65,4 +69,55 @@ int egress(struct __sk_buff *skb)
>  	return SK_DROP;
>  }
> 
> +SEC("cgroup_skb/egress")
> +int recursion(struct __sk_buff *skb)
> +{
> +	void *data = (void *)(long)skb->data;
> +	void *data_end = (void *)(long)skb->data_end;
> +	struct icmphdr *icmph;
> +	struct tcphdr *tcph;
> +	struct iphdr *iph;
> +	int ret;
> +
> +	if ((bpf_get_current_pid_tgid() >> 32) != target_pid)
> +		return SK_PASS;
> +
> +	iph = data;
> +	if ((void *)(iph + 1) > data_end || iph->version != 4)
> +		return SK_PASS;
> +
> +	if (iph->daddr != bpf_htonl(SERVER_IP))
> +		return SK_PASS;
> +
> +	if (iph->protocol == IPPROTO_TCP) {
> +		tcph = (void *)iph + iph->ihl * 4;
> +		if ((void *)(tcph + 1) > data_end ||
> +		    tcph->dest != bpf_htons(server_port))
> +			return SK_PASS;
> +	} else if (iph->protocol == IPPROTO_ICMP) {
> +		icmph = (void *)iph + iph->ihl * 4;
> +		if ((void *)(icmph + 1) > data_end ||
> +		    icmph->type != unreach_type || icmph->code != unreach_code)
> +			return SK_PASS;
> +	} else {
> +		return SK_PASS;
> +	}
> +
> +	/*
> +	 * This call will provoke a recursion: the ICMP packet generated by the
> +	 * kfunc will re-trigger this program since we are in the root cgroup in
> +	 * which the kernel ICMP socket belongs. However when re-entering the
> +	 * kfunc, it should return EBUSY.
> +	 */
> +	ret = bpf_icmp_send(skb, unreach_type, unreach_code);
> +	rec_kfunc_rets[rec_count & 1] = ret;
> +	__sync_fetch_and_add(&rec_count, 1);
> +
> +	/* Let the first ICMP error message pass */
> +	if (iph->protocol == IPPROTO_ICMP)
> +		return SK_PASS;
> +
> +	return SK_DROP;
> +}
> +
>  char LICENSE[] SEC("license") = "Dual BSD/GPL";
> --
> 2.34.1
> 

Reviewed-by: Jordan Rife <jordan@jrife.io>

^ permalink raw reply

* Re: [PATCH bpf-next v9 5/5] selftests/bpf: add bpf_icmp_send no route test
From: Jordan Rife @ 2026-06-24 21:24 UTC (permalink / raw)
  To: Mahe Tardy
  Cc: bpf, andrii, ast, daniel, john.fastabend, martin.lau,
	yonghong.song, emil
In-Reply-To: <20260624185554.362555-6-mahe.tardy@gmail.com>

On Wed, Jun 24, 2026 at 06:55:54PM +0000, Mahe Tardy wrote:
> For normal live cgroup_skb paths, the skb should already be routed. The
> exception is for test run via BPF_PROG_TEST_RUN with packets created
> via bpf_prog_test_run_skb. Those lack dst route and thus the icmp_send
> would quietly fail by returning early.
> 
> This test exercises this and makes sure the kfunc returns -ENETUNREACH.
> 
> Signed-off-by: Mahe Tardy <mahe.tardy@gmail.com>
> ---
>  .../bpf/prog_tests/icmp_send_kfunc.c          | 26 +++++++++++++++++++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> index bb532aa0d158..ffaf0fe1880b 100644
> --- a/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> +++ b/tools/testing/selftests/bpf/prog_tests/icmp_send_kfunc.c
> @@ -169,6 +169,29 @@ static void run_icmp_test(struct icmp_send *skel, int af, const char *ip,
>  	}
>  }
> 
> +static void run_icmp_no_route_test(struct icmp_send *skel)
> +{
> +	struct ipv4_packet pkt = pkt_v4;
> +	LIBBPF_OPTS(bpf_test_run_opts, opts,
> +		.data_in = &pkt,
> +		.data_size_in = sizeof(pkt),
> +	);
> +	int err;
> +
> +	pkt.iph.version = 4;
> +	pkt.iph.daddr = inet_addr("127.0.0.1");
> +	pkt.tcp.dest = htons(80);
> +	skel->bss->server_port = 80;
> +	skel->bss->unreach_type = ICMP_DEST_UNREACH;
> +	skel->bss->unreach_code = ICMP_HOST_UNREACH;
> +	skel->data->kfunc_ret = KFUNC_RET_UNSET;
> +
> +	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.egress), &opts);
> +	if (!ASSERT_OK(err, "test_run"))
> +		return;
> +	ASSERT_EQ(skel->data->kfunc_ret, -ENETUNREACH, "kfunc_ret_no_route");
> +}
> +
>  void test_icmp_send_unreach_cgroup(void)
>  {
>  	struct icmp_send *skel;
> @@ -193,6 +216,9 @@ void test_icmp_send_unreach_cgroup(void)
>  	if (test__start_subtest("ipv6"))
>  		run_icmp_test(skel, AF_INET6, "::1", ICMPV6_REJECT_ROUTE);
> 
> +	if (test__start_subtest("no_route"))
> +		run_icmp_no_route_test(skel);
> +
>  cleanup:
>  	icmp_send__destroy(skel);
>  	if (cgroup_fd >= 0)
> --
> 2.34.1
> 

Reviewed-by: Jordan Rife <jordan@jrife.io>

^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf, sockmap: Don't leak UDP socks on lookup-bind-release
From: Michal Luczaj @ 2026-06-24 21:25 UTC (permalink / raw)
  To: Willem de Bruijn, Jakub Sitnicki
  Cc: John Fastabend, Jiayuan Chen, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Alexei Starovoitov,
	Cong Wang, Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Martin KaFai Lau, Song Liu,
	Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan, netdev,
	bpf, linux-kernel, linux-kselftest, kuniyu
In-Reply-To: <willemdebruijn.kernel.24d11e11d5dc0@gmail.com>

On 6/24/26 22:01, Willem de Bruijn wrote:
> Jakub Sitnicki wrote:
>> On Tue, Jun 23, 2026 at 08:03 PM +02, Michal Luczaj wrote:
>>> UDP sockets get SOCK_RCU_FREE set when (auto-)bound. This means
>>> sk_is_refcounted(unbound) = true, while sk_is_refcounted(bound) = false.
>>>
>>> Because sockmap accepts unbound UDP sockets, a BPF program can increment a
>>> socket's refcount via lookup. If the socket is subsequently bound, the
>>> transition from unbound to bound causes bpf_sk_release() to skip the
>>> decrement of the refcount, causing a memory leak.
>>>
>>> unreferenced object 0xffff88810bc2eb40 (size 1984):
>>>   comm "test_progs", pid 2451, jiffies 4295320596
>>>   hex dump (first 32 bytes):
>>>     7f 00 00 01 7f 00 00 01 d2 04 1b b7 04 d2 00 00  ................
>>>     02 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
>>>   backtrace (crc bdee079d):
>>>     kmem_cache_alloc_noprof+0x557/0x660
>>>     sk_prot_alloc+0x69/0x240
>>>     sk_alloc+0x30/0x460
>>>     inet_create+0x2ce/0xf80
>>>     __sock_create+0x25b/0x5c0
>>>     __sys_socket+0x119/0x1d0
>>>     __x64_sys_socket+0x72/0xd0
>>>     do_syscall_64+0xa1/0x5f0
>>>     entry_SYSCALL_64_after_hwframe+0x76/0x7e
>>>
>>> Maintain balanced refcounts across sk lookup/release: (re-)set
>>> SOCK_RCU_FREE on proto update to treat the socket (whether bound or
>>> unbound) as not requiring a refcount increment on (a RCU protected) lookup.
>>>
>>> Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")
>>> Signed-off-by: Michal Luczaj <mhal@rbox.co>
>>> ---
>>> Note: this issue is related to commit 67312adc96b5 ("bpf: reject unhashed
>>> sockets in bpf_sk_assign").
>>> ---
>>>  net/ipv4/udp_bpf.c | 3 +++
>>>  1 file changed, 3 insertions(+)
>>>
>>> diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
>>> index ad57c4c9eaab..970327b59582 100644
>>> --- a/net/ipv4/udp_bpf.c
>>> +++ b/net/ipv4/udp_bpf.c
>>> @@ -173,6 +173,9 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
>>>  	if (sk->sk_family == AF_INET6)
>>>  		udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
>>>  
>>> +	/* Treat all sockets as non-refcounted, regardless of binding state. */
>>> +	sock_set_flag(sk, SOCK_RCU_FREE);
>>> +
>>>  	sock_replace_proto(sk, &udp_bpf_prots[family]);
>>>  	return 0;
>>>  }
>>
>> There is a side effect that an unhashed (unbound) UDP socket can now be
>> selected in sk_lookup with bpf_sk_assign.
> 
> The commit does mention a related fix, beneath the ---, commit
> 67312adc96b5 ("bpf: reject unhashed sockets in bpf_sk_assign").
> That fixes a similar issue by exactly disallowing this:
> 
>     Fix the problem by rejecting unhashed sockets in bpf_sk_assign().
>     This matches the behaviour of __inet_lookup_skb which is ultimately
>     the goal of bpf_sk_assign().
> 
> So ..
> 
>> Though perhaps that's for the
>> better because TC bpf_sk_assign doesn't reject non-refcounted UDP
>> sockets either, so we would have both socket dispatch sites behave the
>> same way.
> 
> .. there are two conflicting types of consistency here? Consistent with
> __inet_lookup_skb or the TC bpf hook. Of those the first is the more
> canonical.
> 
>> Also, with this patch, if we insert & remove an unhashed UDP socket
>> into/from a sockmap, we end up with an unhashed non-refcounted UDP
>> socket. Not entirely sure if that is actually a problem or not.
>>
>> Willem, what is your take on having unhashed non-refcoted UDP sockets?
> 
> I don't immediately see a problem, but I'm not an expert on SOCK_RCU_FREE.

Perhaps it's worth mentioning that unhashed non-refcounted UDP socket is
already possible: first auto-bind via connect(AF_INET) (which also sets
SOCK_RCU_FREE), then unhash via connect(AF_UNSPEC).

^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf, sockmap: Don't leak UDP socks on lookup-bind-release
From: Kuniyuki Iwashima @ 2026-06-24 21:33 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: Willem de Bruijn, Jakub Sitnicki, John Fastabend, Jiayuan Chen,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Alexei Starovoitov, Cong Wang, Daniel Borkmann,
	Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Emil Tsalapatis, Shuah Khan, netdev, bpf, linux-kernel,
	linux-kselftest
In-Reply-To: <dd065bfb-52ce-48fd-b1ef-9c6166f714ed@rbox.co>

On Wed, Jun 24, 2026 at 2:26 PM Michal Luczaj <mhal@rbox.co> wrote:
>
> On 6/24/26 22:01, Willem de Bruijn wrote:
> > Jakub Sitnicki wrote:
> >> On Tue, Jun 23, 2026 at 08:03 PM +02, Michal Luczaj wrote:
> >>> UDP sockets get SOCK_RCU_FREE set when (auto-)bound. This means
> >>> sk_is_refcounted(unbound) = true, while sk_is_refcounted(bound) = false.
> >>>
> >>> Because sockmap accepts unbound UDP sockets, a BPF program can increment a
> >>> socket's refcount via lookup. If the socket is subsequently bound, the
> >>> transition from unbound to bound causes bpf_sk_release() to skip the
> >>> decrement of the refcount, causing a memory leak.
> >>>
> >>> unreferenced object 0xffff88810bc2eb40 (size 1984):
> >>>   comm "test_progs", pid 2451, jiffies 4295320596
> >>>   hex dump (first 32 bytes):
> >>>     7f 00 00 01 7f 00 00 01 d2 04 1b b7 04 d2 00 00  ................
> >>>     02 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
> >>>   backtrace (crc bdee079d):
> >>>     kmem_cache_alloc_noprof+0x557/0x660
> >>>     sk_prot_alloc+0x69/0x240
> >>>     sk_alloc+0x30/0x460
> >>>     inet_create+0x2ce/0xf80
> >>>     __sock_create+0x25b/0x5c0
> >>>     __sys_socket+0x119/0x1d0
> >>>     __x64_sys_socket+0x72/0xd0
> >>>     do_syscall_64+0xa1/0x5f0
> >>>     entry_SYSCALL_64_after_hwframe+0x76/0x7e
> >>>
> >>> Maintain balanced refcounts across sk lookup/release: (re-)set
> >>> SOCK_RCU_FREE on proto update to treat the socket (whether bound or
> >>> unbound) as not requiring a refcount increment on (a RCU protected) lookup.
> >>>
> >>> Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")
> >>> Signed-off-by: Michal Luczaj <mhal@rbox.co>
> >>> ---
> >>> Note: this issue is related to commit 67312adc96b5 ("bpf: reject unhashed
> >>> sockets in bpf_sk_assign").
> >>> ---
> >>>  net/ipv4/udp_bpf.c | 3 +++
> >>>  1 file changed, 3 insertions(+)
> >>>
> >>> diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
> >>> index ad57c4c9eaab..970327b59582 100644
> >>> --- a/net/ipv4/udp_bpf.c
> >>> +++ b/net/ipv4/udp_bpf.c
> >>> @@ -173,6 +173,9 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
> >>>     if (sk->sk_family == AF_INET6)
> >>>             udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
> >>>
> >>> +   /* Treat all sockets as non-refcounted, regardless of binding state. */
> >>> +   sock_set_flag(sk, SOCK_RCU_FREE);
> >>> +
> >>>     sock_replace_proto(sk, &udp_bpf_prots[family]);
> >>>     return 0;
> >>>  }
> >>
> >> There is a side effect that an unhashed (unbound) UDP socket can now be
> >> selected in sk_lookup with bpf_sk_assign.
> >
> > The commit does mention a related fix, beneath the ---, commit
> > 67312adc96b5 ("bpf: reject unhashed sockets in bpf_sk_assign").
> > That fixes a similar issue by exactly disallowing this:
> >
> >     Fix the problem by rejecting unhashed sockets in bpf_sk_assign().
> >     This matches the behaviour of __inet_lookup_skb which is ultimately
> >     the goal of bpf_sk_assign().
> >
> > So ..
> >
> >> Though perhaps that's for the
> >> better because TC bpf_sk_assign doesn't reject non-refcounted UDP
> >> sockets either, so we would have both socket dispatch sites behave the
> >> same way.
> >
> > .. there are two conflicting types of consistency here? Consistent with
> > __inet_lookup_skb or the TC bpf hook. Of those the first is the more
> > canonical.
> >
> >> Also, with this patch, if we insert & remove an unhashed UDP socket
> >> into/from a sockmap, we end up with an unhashed non-refcounted UDP
> >> socket. Not entirely sure if that is actually a problem or not.
> >>
> >> Willem, what is your take on having unhashed non-refcoted UDP sockets?
> >
> > I don't immediately see a problem, but I'm not an expert on SOCK_RCU_FREE.
>
> Perhaps it's worth mentioning that unhashed non-refcounted UDP socket is
> already possible: first auto-bind via connect(AF_INET) (which also sets
> SOCK_RCU_FREE), then unhash via connect(AF_UNSPEC).

Setting SOCK_RCU_FREE itself should not cause a problem, but I think
we should take a step back.

AFAIU, 0c48eefae712 was to allow putting AF_UNIX SOCK_DGRAM sockets
into sockmap, not to allow using unconnected UDP sockets in sk_lookup etc.

Actually, v4 of the patch was implemented as such but did not get any feedback,
https://lore.kernel.org/bpf/20210508220835.53801-9-xiyou.wangcong@gmail.com/#t

... and v5 (the final commit) somehow removed the restriction for unconnected
UDP socket as well.
https://lore.kernel.org/bpf/20210704190252.11866-3-xiyou.wangcong@gmail.com/

Given the initial use case, sockmap redirect, is still blocked by
TCP_ESTABLISHED
check in sock_map_redirect_allowed(), I feel there is no point in supporting
unconnected UDP sockets in sockmap.  It cannot get any skb from anywhere
(without buggy sk_lookup).

^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf, sockmap: Don't leak UDP socks on lookup-bind-release
From: Kuniyuki Iwashima @ 2026-06-24 21:39 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: Willem de Bruijn, Jakub Sitnicki, John Fastabend, Jiayuan Chen,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Alexei Starovoitov, Cong Wang, Daniel Borkmann,
	Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Emil Tsalapatis, Shuah Khan, netdev, bpf, linux-kernel,
	linux-kselftest
In-Reply-To: <CAAVpQUBHMrSiWqn0Zo_D7sUCLFdkKggduoijRdFtHx5CPiSpsg@mail.gmail.com>

On Wed, Jun 24, 2026 at 2:33 PM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>
> On Wed, Jun 24, 2026 at 2:26 PM Michal Luczaj <mhal@rbox.co> wrote:
> >
> > On 6/24/26 22:01, Willem de Bruijn wrote:
> > > Jakub Sitnicki wrote:
> > >> On Tue, Jun 23, 2026 at 08:03 PM +02, Michal Luczaj wrote:
> > >>> UDP sockets get SOCK_RCU_FREE set when (auto-)bound. This means
> > >>> sk_is_refcounted(unbound) = true, while sk_is_refcounted(bound) = false.
> > >>>
> > >>> Because sockmap accepts unbound UDP sockets, a BPF program can increment a
> > >>> socket's refcount via lookup. If the socket is subsequently bound, the
> > >>> transition from unbound to bound causes bpf_sk_release() to skip the
> > >>> decrement of the refcount, causing a memory leak.
> > >>>
> > >>> unreferenced object 0xffff88810bc2eb40 (size 1984):
> > >>>   comm "test_progs", pid 2451, jiffies 4295320596
> > >>>   hex dump (first 32 bytes):
> > >>>     7f 00 00 01 7f 00 00 01 d2 04 1b b7 04 d2 00 00  ................
> > >>>     02 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
> > >>>   backtrace (crc bdee079d):
> > >>>     kmem_cache_alloc_noprof+0x557/0x660
> > >>>     sk_prot_alloc+0x69/0x240
> > >>>     sk_alloc+0x30/0x460
> > >>>     inet_create+0x2ce/0xf80
> > >>>     __sock_create+0x25b/0x5c0
> > >>>     __sys_socket+0x119/0x1d0
> > >>>     __x64_sys_socket+0x72/0xd0
> > >>>     do_syscall_64+0xa1/0x5f0
> > >>>     entry_SYSCALL_64_after_hwframe+0x76/0x7e
> > >>>
> > >>> Maintain balanced refcounts across sk lookup/release: (re-)set
> > >>> SOCK_RCU_FREE on proto update to treat the socket (whether bound or
> > >>> unbound) as not requiring a refcount increment on (a RCU protected) lookup.
> > >>>
> > >>> Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")
> > >>> Signed-off-by: Michal Luczaj <mhal@rbox.co>
> > >>> ---
> > >>> Note: this issue is related to commit 67312adc96b5 ("bpf: reject unhashed
> > >>> sockets in bpf_sk_assign").
> > >>> ---
> > >>>  net/ipv4/udp_bpf.c | 3 +++
> > >>>  1 file changed, 3 insertions(+)
> > >>>
> > >>> diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
> > >>> index ad57c4c9eaab..970327b59582 100644
> > >>> --- a/net/ipv4/udp_bpf.c
> > >>> +++ b/net/ipv4/udp_bpf.c
> > >>> @@ -173,6 +173,9 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
> > >>>     if (sk->sk_family == AF_INET6)
> > >>>             udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
> > >>>
> > >>> +   /* Treat all sockets as non-refcounted, regardless of binding state. */
> > >>> +   sock_set_flag(sk, SOCK_RCU_FREE);
> > >>> +
> > >>>     sock_replace_proto(sk, &udp_bpf_prots[family]);
> > >>>     return 0;
> > >>>  }
> > >>
> > >> There is a side effect that an unhashed (unbound) UDP socket can now be
> > >> selected in sk_lookup with bpf_sk_assign.
> > >
> > > The commit does mention a related fix, beneath the ---, commit
> > > 67312adc96b5 ("bpf: reject unhashed sockets in bpf_sk_assign").
> > > That fixes a similar issue by exactly disallowing this:
> > >
> > >     Fix the problem by rejecting unhashed sockets in bpf_sk_assign().
> > >     This matches the behaviour of __inet_lookup_skb which is ultimately
> > >     the goal of bpf_sk_assign().
> > >
> > > So ..
> > >
> > >> Though perhaps that's for the
> > >> better because TC bpf_sk_assign doesn't reject non-refcounted UDP
> > >> sockets either, so we would have both socket dispatch sites behave the
> > >> same way.
> > >
> > > .. there are two conflicting types of consistency here? Consistent with
> > > __inet_lookup_skb or the TC bpf hook. Of those the first is the more
> > > canonical.
> > >
> > >> Also, with this patch, if we insert & remove an unhashed UDP socket
> > >> into/from a sockmap, we end up with an unhashed non-refcounted UDP
> > >> socket. Not entirely sure if that is actually a problem or not.
> > >>
> > >> Willem, what is your take on having unhashed non-refcoted UDP sockets?
> > >
> > > I don't immediately see a problem, but I'm not an expert on SOCK_RCU_FREE.
> >
> > Perhaps it's worth mentioning that unhashed non-refcounted UDP socket is
> > already possible: first auto-bind via connect(AF_INET) (which also sets
> > SOCK_RCU_FREE), then unhash via connect(AF_UNSPEC).
>
> Setting SOCK_RCU_FREE itself should not cause a problem, but I think
> we should take a step back.
>
> AFAIU, 0c48eefae712 was to allow putting AF_UNIX SOCK_DGRAM sockets
> into sockmap, not to allow using unconnected UDP sockets in sk_lookup etc.
>
> Actually, v4 of the patch was implemented as such but did not get any feedback,
> https://lore.kernel.org/bpf/20210508220835.53801-9-xiyou.wangcong@gmail.com/#t
>
> ... and v5 (the final commit) somehow removed the restriction for unconnected
> UDP socket as well.
> https://lore.kernel.org/bpf/20210704190252.11866-3-xiyou.wangcong@gmail.com/
>
> Given the initial use case, sockmap redirect, is still blocked by
> TCP_ESTABLISHED
> check in sock_map_redirect_allowed(), I feel there is no point in supporting
> unconnected UDP sockets in sockmap.  It cannot get any skb from anywhere
> (without buggy sk_lookup).

s/unconnected/unhashed/g :)

^ permalink raw reply

* Re: [PATCH v12 07/12] static_call: Define EXPORT_STATIC_CALL_FOR_MODULES()
From: Pawan Gupta @ 2026-06-24 21:49 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
	David Kaplan, Borislav Petkov, Dave Hansen, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, KP Singh,
	Jiri Olsa, David S. Miller, David Laight, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, David Ahern, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
	Stanislav Fomichev, Hao Luo, Paolo Bonzini, Jonathan Corbet,
	Jason Baron, Alice Ryhl, Steven Rostedt, Ard Biesheuvel,
	Shuah Khan, linux-kernel, kvm, Asit Mallick, Tao Zhang, bpf,
	netdev, linux-doc
In-Reply-To: <ajvUp_kPJBRZ7k_p@google.com>

On Wed, Jun 24, 2026 at 05:59:19AM -0700, Sean Christopherson wrote:
> On Tue, Jun 23, 2026, Pawan Gupta wrote:
> > There is EXPORT_STATIC_CALL_TRAMP() that hides the static key from all
> > modules. But there is no equivalent of EXPORT_SYMBOL_FOR_MODULES() to
> > restrict symbol visibility to only certain modules.
> > 
> > Add EXPORT_STATIC_CALL_FOR_MODULES(name, mods) that wraps both the key and
> > the trampoline with EXPORT_SYMBOL_FOR_MODULES(), allowing only a limited
> > set of modules to see and update the static key.
> > 
> > The immediate user is KVM, in the following commit.
> > 
> > checkpatch reported below warnings with this change that I believe don't
> > apply in this case:
> > 
> >   include/linux/static_call.h:219: WARNING: Non-declarative macros with multiple statements should be enclosed in a do - while loop
> >   include/linux/static_call.h:220: WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable
> > 
> > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > ---
> >  include/linux/static_call.h | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> > 
> > diff --git a/include/linux/static_call.h b/include/linux/static_call.h
> > index 78a77a4ae0ea..b610afd1ed55 100644
> > --- a/include/linux/static_call.h
> > +++ b/include/linux/static_call.h
> > @@ -216,6 +216,9 @@ extern long __static_call_return0(void);
> >  #define EXPORT_STATIC_CALL_GPL(name)					\
> >  	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name));			\
> >  	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
> > +#define EXPORT_STATIC_CALL_FOR_MODULES(name, mods)			\
> > +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_KEY(name), mods);		\
> > +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_TRAMP(name), mods)
> >  
> >  /* Leave the key unexported, so modules can't change static call targets: */
> >  #define EXPORT_STATIC_CALL_TRAMP(name)					\
> > @@ -276,6 +279,9 @@ extern long __static_call_return0(void);
> >  #define EXPORT_STATIC_CALL_GPL(name)					\
> >  	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name));			\
> >  	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
> > +#define EXPORT_STATIC_CALL_FOR_MODULES(name, mods)			\
> > +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_KEY(name), mods);		\
> > +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_TRAMP(name), mods)
> >  
> >  /* Leave the key unexported, so modules can't change static call targets: */
> >  #define EXPORT_STATIC_CALL_TRAMP(name)					\
> > @@ -346,6 +352,8 @@ static inline int static_call_text_reserved(void *start, void *end)
> >  
> >  #define EXPORT_STATIC_CALL(name)	EXPORT_SYMBOL(STATIC_CALL_KEY(name))
> >  #define EXPORT_STATIC_CALL_GPL(name)	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name))
> > +#define EXPORT_STATIC_CALL_FOR_MODULES(name, mods)			\
> > +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_KEY(name), mods)
> >  
> >  #endif /* CONFIG_HAVE_STATIC_CALL */
> 
> Drat, I forgot about this.  Exporting static call trampolines for KVM came up in
> another conversation[*].  I had already put together patches to effectively default
> to exporting only the trampoline, and also to deduplicate this code so that the
> CONFIG_HAVE_STATIC_CALL_INLINE=y / CONFIG_HAVE_STATIC_CALL=y / CONFIG_HAVE_STATIC_CALL=n
> implementations don't need to copy+paste the same lines of code.
> 
> The attached patches touch a lot more code, and will conflict mightily with KVM
> changes I want to land in 7.3 (more use of a static_call in KVM).  But if we get
> them applied (to tip tree) shortly after 7.2-rc1 and provide a topic branch/tag,
> then there shouldn't be too much juggling needed?
> 
> If we want to go with the more aggressive cleanup, I'll formally post the patches.
> 
> [*] https://lore.kernel.org/all/ahhoDGUz39KSGZ6o@google.com

Thanks for the context.

Earlier making the key ro-after-init came up as an option in a thread with
Peter. Does it look like a good option to you?

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index b610afd1ed55..ea56da8fb446 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -200,6 +200,14 @@ extern long __static_call_return0(void);
 	};								\
 	ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
 
+#define DEFINE_STATIC_CALL_NULL_RO_AFTER_INIT(name, _func)		\
+	DECLARE_STATIC_CALL(name, _func);				\
+	struct static_call_key STATIC_CALL_KEY(name) __ro_after_init = {\
+		.func = _func,						\
+		.type = 1,						\
+	};								\
+	ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
+
 #define DEFINE_STATIC_CALL_RET0(name, _func)				\
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\

^ permalink raw reply related

* Re: [PATCH v2 bpf-next 1/2] bpf: Support BPF_F_EGRESS with bpf_redirect_peer
From: Paul Chaignon @ 2026-06-24 21:58 UTC (permalink / raw)
  To: Jordan Rife
  Cc: bpf, netdev, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Stanislav Fomichev, Jiayuan Chen
In-Reply-To: <20260618182035.43811-2-jordan@jrife.io>

On Thu, Jun 18, 2026 at 11:20:32AM -0700, Jordan Rife wrote:
> We have several use cases where a pod injects traffic into the datapath
> of another so that the traffic appears to have originated from that
> pod. One such use case is a synthetic flow generator which injects
> synthetic traffic into a pod's datapath to enable dynamic probing and
> debugging. Another is a transparent proxy where connections originating
> from one pod are redirected towards another which proxies that
> connection. The new connection is bound to the IP of the original pod
> using IP_TRANSPARENT and its traffic is injected into that pod's
> datapath and handled as if it had originated there. This can be used for
> mTLS, etc.
> 
> We use bpf_redirect(BPF_F_INGRESS) to direct traffic leaving the proxy,
> flow generator, etc. towards the target pod, ensuring that eBPF programs
> that are meant to intercept traffic leaving that pod are executed.
> However, this doesn't work with netkit.
> 
> With netkit, an ingress redirection from proxy to workload skips eBPF
> programs that are meant to intercept traffic leaving the pod, since they
> reside on the netkit peer device. One workaround is to attach the
> same program to both the netkit peer device and the TCX ingress hook for
> the netkit pair's primary interface, but
> 
> a) This seems hacky and we need to be careful not to run the same
>    program twice for the same skb in cases where we want to pass that
>    traffic to the host stack.
> b) We're trying to keep the proxy redirection / traffic injection
>    systems as modular and separated from Cilium as possible, the system
>    that manages netkit setup and core eBPF programming.
> 
> It would be handy if instead we could redirect traffic directly from
> one netkit peer device to another. This patch proposes an extension
> to bpf_redirect_peer to allow us to do just that.
> 
> With this patch, the BPF_F_EGRESS flag tells bpf_redirect_peer to emit
> the skb in the egress direction of the target interface's peer device
> While the main use case is netkit, I suppose you could also use this
> mode with veth as well if, e.g., there were some eBPF programs attached
> to that side of the veth pair that needed to intercept traffic.
> 
>  +---------------------------------------------------------------------+
>  | +-------------------------+         6. bpf_redirect_neigh(eth0)     |
>  | | pod (10.244.0.10)       |           ------------------------      |
>  | |                         |          |                        |     |
>  | |              +--------+ |          |      +---------+       |     |
>  | | 1. packet -->|        | |          |      |         |       |     |
>  | |    leaves ^  | netkit |<===========|======| netkit  |       |     |
>  | |           |  | peer   |=======(eBPF)=====>| primary |       |     |
>  | |           |  |        | |          |      |         |       |     |
>  | |           |  +--------+ |          |      +---------+       |     |
>  | |           |             |          | 2. bpf_redirect        v     |
>  | +-----------|-------------+          |___________________   +-------|
>  |             |                                            |  | eth0  |
>  |             | 5. bpf_redirect_peer(BPF_F_EGRESS)         |  +-------|
>  |             |________________________                    |          |
>  | +-------------------------+          |                   |          |
>  | | proxy (10.244.0.11)     |          |                   |          |
>  | | IP_TRANSPARENT          |          |                   |          |
>  | |              +--------+ |          |      +---------+  |          |
>  | | 3. packet <--|        | |          |      |         |<--          |
>  | |    enters    | netkit |<===========|======| netkit  |             |
>  | |    [proxy]   | peer   |=======(eBPF)=====>| primary |             |
>  | | 4. packet -->|        | |                 |         |             |
>  | |    leaves    +--------+ |                 +---------+             |
>  | |    sip=10.244.0.10      |                                         |
>  | +-------------------------+                                         |
>  +---------------------------------------------------------------------+
> 
> Using the proxy use case as an example, in step 5 we would redirect
> traffic leaving the proxy towards the pod's peer device using
> bpf_redirect_peer(BPF_F_EGRESS).
> 
> As a bonus, since the skb doesn't have to go through the backlog queue
> it can take full advantage of netkit's performance benefits. I set up a
> test where outgoing iperf3 traffic is injected into the datapath of
> another pod using either bpf_redirect_peer(BPF_F_EGRESS) or
> bpf_redirect(BPF_F_INGRESS). I used Cilium's eBPF host routing mode
> which skips the host stack and uses BPF redirect helpers to do all the
> routing.
> 
>   (net.ipv4.tcp_congestion_control=cubic,mtu=1500,100GiB link,Cilium
>    eBPF host routing mode)
> 
> BASELINE [bpf_redirect(BPF_F_INGRESS)]
>   1. [iperf pod] ==bpf_redirect([pod b], BPF_F_INGRESS)==> [pod b]
>   2. [pod b]     ==bpf_redirect_neigh([eth0])==>           eth0
>   3. eth0        ==over network==>                         [host b]
> 
>   [ ID] Interval           Transfer     Bitrate         Retr
>   [  5]   0.00-60.00  sec   231 GBytes  33.0 Gbits/sec  12060     sender
>   [  5]   0.00-60.00  sec   230 GBytes  33.0 Gbits/sec            receiver
> 
> TEST [bpf_redirect_peer(BPF_F_EGRESS)]
>   1. [iperf pod] ==bpf_redirect_peer([pod b], BPF_F_EGRESS)==> [pod b]
>   2. [pod b]     ==bpf_redirect_neigh([eth0])==>               eth0
>   3. eth0        ==over network==>                             [host b]
> 
>   [ ID] Interval           Transfer     Bitrate         Retr
>   [  5]   0.00-60.00  sec   272 GBytes  38.9 Gbits/sec    0       sender
>   [  5]   0.00-60.00  sec   272 GBytes  38.9 Gbits/sec            receiver
> 
> In this test, using bpf_redirect_peer(BPF_F_EGRESS) for the hop from
> [iperf pod] to [pod b] led to ~18% more throughput compared to
> bpf_redirect(BPF_F_INGRESS).
> 
> Signed-off-by: Jordan Rife <jordan@jrife.io>

Acked-by: Paul Chaignon <paul.chaignon@gmail.com>


^ permalink raw reply

* Re: [PATCH v2 bpf-next 2/2] selftests/bpf: Add tests for bpf_redirect_peer with BPF_F_EGRESS
From: Paul Chaignon @ 2026-06-24 21:59 UTC (permalink / raw)
  To: Jordan Rife
  Cc: bpf, netdev, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Stanislav Fomichev, Jiayuan Chen
In-Reply-To: <20260618182035.43811-3-jordan@jrife.io>

On Thu, Jun 18, 2026 at 11:20:33AM -0700, Jordan Rife wrote:
> Extend redirect tests to cover bpf_redirect_peer(BPF_F_EGRESS). SRC
> redirects to DST using bpf_redirect_peer(BPF_F_EGRESS) then traffic is
> hairpinned into DST using bpf_redirect.
> 
> Signed-off-by: Jordan Rife <jordan@jrife.io>

Acked-by: Paul Chaignon <paul.chaignon@gmail.com>


^ permalink raw reply

* Re: [PATCH v12 07/12] static_call: Define EXPORT_STATIC_CALL_FOR_MODULES()
From: Sean Christopherson @ 2026-06-24 22:03 UTC (permalink / raw)
  To: Pawan Gupta
  Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
	David Kaplan, Borislav Petkov, Dave Hansen, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, KP Singh,
	Jiri Olsa, David S. Miller, David Laight, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, David Ahern, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
	Stanislav Fomichev, Hao Luo, Paolo Bonzini, Jonathan Corbet,
	Jason Baron, Alice Ryhl, Steven Rostedt, Ard Biesheuvel,
	Shuah Khan, linux-kernel, kvm, Asit Mallick, Tao Zhang, bpf,
	netdev, linux-doc
In-Reply-To: <20260624214955.6kkivefeuapcocib@desk>

On Wed, Jun 24, 2026, Pawan Gupta wrote:
> On Wed, Jun 24, 2026 at 05:59:19AM -0700, Sean Christopherson wrote:
> > On Tue, Jun 23, 2026, Pawan Gupta wrote:
> > > There is EXPORT_STATIC_CALL_TRAMP() that hides the static key from all
> > > modules. But there is no equivalent of EXPORT_SYMBOL_FOR_MODULES() to
> > > restrict symbol visibility to only certain modules.
> > > 
> > > Add EXPORT_STATIC_CALL_FOR_MODULES(name, mods) that wraps both the key and
> > > the trampoline with EXPORT_SYMBOL_FOR_MODULES(), allowing only a limited
> > > set of modules to see and update the static key.
> > > 
> > > The immediate user is KVM, in the following commit.
> > > 
> > > checkpatch reported below warnings with this change that I believe don't
> > > apply in this case:
> > > 
> > >   include/linux/static_call.h:219: WARNING: Non-declarative macros with multiple statements should be enclosed in a do - while loop
> > >   include/linux/static_call.h:220: WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable
> > > 
> > > Suggested-by: Peter Zijlstra <peterz@infradead.org>
> > > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> > > ---

...

> > Drat, I forgot about this.  Exporting static call trampolines for KVM came up in
> > another conversation[*].  I had already put together patches to effectively default
> > to exporting only the trampoline, and also to deduplicate this code so that the
> > CONFIG_HAVE_STATIC_CALL_INLINE=y / CONFIG_HAVE_STATIC_CALL=y / CONFIG_HAVE_STATIC_CALL=n
> > implementations don't need to copy+paste the same lines of code.
> > 
> > The attached patches touch a lot more code, and will conflict mightily with KVM
> > changes I want to land in 7.3 (more use of a static_call in KVM).  But if we get
> > them applied (to tip tree) shortly after 7.2-rc1 and provide a topic branch/tag,
> > then there shouldn't be too much juggling needed?
> > 
> > If we want to go with the more aggressive cleanup, I'll formally post the patches.
> > 
> > [*] https://lore.kernel.org/all/ahhoDGUz39KSGZ6o@google.com
> 
> Thanks for the context.
> 
> Earlier making the key ro-after-init came up as an option in a thread with
> Peter. Does it look like a good option to you?

No, it won't work for KVM.  kvm.ko (owner of the keys) updates the keys only when
a vendor module (kvm-intel.ko or kvm-amd.ko) is loaded, and updates keys *every*
time a vendor module is loaded.  So for KVM, the static calls need to be __read_mostly,
not __ro_after_init.

> diff --git a/include/linux/static_call.h b/include/linux/static_call.h
> index b610afd1ed55..ea56da8fb446 100644
> --- a/include/linux/static_call.h
> +++ b/include/linux/static_call.h
> @@ -200,6 +200,14 @@ extern long __static_call_return0(void);
>  	};								\
>  	ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
>  
> +#define DEFINE_STATIC_CALL_NULL_RO_AFTER_INIT(name, _func)		\
> +	DECLARE_STATIC_CALL(name, _func);				\
> +	struct static_call_key STATIC_CALL_KEY(name) __ro_after_init = {\
> +		.func = _func,						\
> +		.type = 1,						\
> +	};								\
> +	ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
> +
>  #define DEFINE_STATIC_CALL_RET0(name, _func)				\
>  	DECLARE_STATIC_CALL(name, _func);				\
>  	struct static_call_key STATIC_CALL_KEY(name) = {		\

^ permalink raw reply

* Re: [PATCH bpf-next v2 0/4] selftests/bpf: Add some tests for LoongArch
From: Tiezhu Yang @ 2026-06-24 22:18 UTC (permalink / raw)
  To: Alexei Starovoitov, Alexei Starovoitov, Huacai Chen; +Cc: loongarch, bpf
In-Reply-To: <DJ9QMPLAXOGG.2PDC3A2MUENPC@gmail.com>

On 6/15/26 23:45, Alexei Starovoitov wrote:
> On Mon Jun 15, 2026 at 3:05 AM PDT, Tiezhu Yang wrote:
>> This series is based on the latest master branch of bpf-next tree.
>> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
>>
>> Dependency Note:
>> ================
>> This series strictly depends on the following LoongArch core series,
>> which moves thread_info into task_struct:
>>
>> "Move thread_info into task_struct for LoongArch"
>> https://lore.kernel.org/loongarch/20260612011616.27771-1-yangtiezhu@loongson.cn/
> 
> Pls repost when this patch appears in bpf-next.

OK, this is reasonable. I agree with you and plan to do this.

But I noticed that this series has already been merged into
the loongarch-next [1] tree, and there is something wrong
between the code and commit message in the first patch [2].

I think this is not proper. I suggest to drop them from the
loongarch-next tree.

Just FYI.

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git/

[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git/commit/?h=loongarch-next&id=8796e11abddf

Thanks,
Tiezhu


^ permalink raw reply

* Re: [PATCH v1] kasan: Fix false-positive wild-memory-access on x86 under 5-level paging
From: Ihor Solodrai @ 2026-06-24 22:45 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Dave Hansen, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Andrey Ryabinin,
	Andrew Morton, bpf, kasan-dev, linux-mm, linux-kernel,
	Thomas Gleixner, Ingo Molnar, Dave Hansen, x86, H. Peter Anvin,
	Andrey Konovalov
In-Reply-To: <20260624025732.GDajtHnB1Jc21kQnl_@fat_crate.local>

On 6/23/26 7:57 PM, Borislav Petkov wrote:
> On Mon, Jun 22, 2026 at 05:29:12PM -0700, Ihor Solodrai wrote:
>> On 6/18/26 11:38 AM, Borislav Petkov wrote:
>>> On Thu, Jun 18, 2026 at 11:12:09AM -0700, Dave Hansen wrote:
>>>> On 6/18/26 10:09, Borislav Petkov wrote:
>>>>> On Wed, Jun 17, 2026 at 03:13:33PM -0700, Ihor Solodrai wrote:
>>>>>> So my question to maintainers is what approach seems best?
>>>>> The CPUID stuff is being rewritten currently and it should address your issue
>>>>> too. If not, then we need to rewrite it better.
>>>>>
>>>>> Can you reproduce with this set applied ontop:
>>>>>
>>>>> https://lore.kernel.org/r/20260528153923.403473-1-darwi@linutronix.de
>>>>
>>>> Thinking about this a bit more... If Ahmed's series does fix this, I
>>>> think it will be accidental. It still uses identify_cpu() and also does
>>>> a memset() of the new c->cpuid structure in addition to the old
>>>> c->x86_capability structure.
>>>>
>>>> I'm not knocking Ahmed's series by any means. It just probably won't fix
>>>> this issue.
>>>>
>>>> In a perfect world early_identify_cpu() and identify_cpu() would either
>>>> get consolidated into one thing. Or at least become two discrete things
>>>> that initialize two completely disjoint sets of data. That way,
>>>> identify_cpu() wouldn't memset() anything.
>>>>
>>>> Isn't that the _real_ fix? Instead of trying to hide the inconsistency
>>>> when good data is blown away, we stop blowing it away in the first place?
>>>
>>> early_ is only on the BSP and you want to scan all CPUs.
>>>
>>> AFAIR, the last time I was looking at how we scan the CPUID leafs, we do have
>>> cases where there's blips in time when cap bits get disappeared to be
>>> rescanned again. The toggling of MSR bits which control feature flags is one
>>> thing that comes to mind.
>>>
>>> But I'm with you on the consolidation approach. I think this is what we should
>>
>> Hello Dave, Boris. Thank you for the input.
>>
>> I tried a slight refactoring of the identify_cpu() machinery to
>> eliminate the memset(x86_capability) from the boot cpu, and it fixes
>> the splat that I reported.
>>
>> identify_cpu() is split into two parts:
>>   * identify_cpu_scan() - the top part, up to and including the
>>     generic_identify() call
>>   * identify_cpu() proper with the rest, no memset here
>>
>> Then (gated on x86_64) identify_boot_cpu() *skips* the
>> identify_cpu_scan() call, only executing the bottom part of current
>> identify_cpu(). We can do that because boot cpu already did the _scan
>> part in early_identify_cpu(), when interrupts are still disabled.
>>
>> One caveat: get_model_name() is moved from generic_identify() into
>> identify_cpu(), otherwise it wouldn't be called on boot cpu. Same for
>> c->loops_per_jiffy = loops_per_jiffy; - it's moved to the "bottom"
>> identify_cpu().
>>
>> The behavior for secondary cpus is unchanged: identify_secondary_cpu()
>> calls identify_cpu_scan() then immediately identify_cpu().
>>
>> Please take a look at the diff below and let me know if this is a good
>> way to proceed. I don't know exactly what I'm doing, so concerns and
>> corrections are welcome.
>>
>>
>> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>> index a4268c47f2bc..4a655109cacf 100644
>> --- a/arch/x86/kernel/cpu/common.c
>> +++ b/arch/x86/kernel/cpu/common.c
>> @@ -1978,8 +1978,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
>>  
>>  	get_cpu_address_sizes(c);
>>  
>> -	get_model_name(c); /* Default name */
>> -
>>  	/*
>>  	 * ESPFIX is a strange bug.  All real CPUs have it.  Paravirt
>>  	 * systems that run Linux at CPL > 0 may or may not have the
>> @@ -1999,13 +1997,12 @@ static void generic_identify(struct cpuinfo_x86 *c)
>>  }
>>  
>>  /*
>> - * This does the hard work of actually picking apart the CPU stuff...
>> + * Rebuild capability set from CPUID and re-apply the forced-cap overrides
>> + * through generic_identify(). This *should* run with interrupts off, otherwise
>> + * an interrupt handler might see the capability bits cleared.
> 
> And yet it doesn't run with IRQs off. So that comment doesn't need to be here.
> The point of the whole exercise is to not disable ugly interrupts in that path
> at all.
> 
>>   */
>> -static void identify_cpu(struct cpuinfo_x86 *c)
>> +static void identify_cpu_scan(struct cpuinfo_x86 *c)
>>  {
>> -	int i;
>> -
>> -	c->loops_per_jiffy = loops_per_jiffy;
>>  	c->x86_cache_size = 0;
>>  	c->x86_vendor = X86_VENDOR_UNKNOWN;
>>  	c->x86_model = c->x86_stepping = 0;	/* So far unknown... */
>> @@ -2028,6 +2025,19 @@ static void identify_cpu(struct cpuinfo_x86 *c)
>>  #endif
>>  
>>  	generic_identify(c);
>> +}
>> +
>> +/*
>> + * This is safe to run with interrupts on.
>> + * The boot CPU can run just this from arch_cpu_finalize_init(), because
>> + * the scan part already happened in early_identify_cpu().
>> + */
>> +static void identify_cpu(struct cpuinfo_x86 *c)
>> +{
>> +	int i;
>> +
>> +	c->loops_per_jiffy = loops_per_jiffy;
>> +	get_model_name(c); /* Default name */
>>  
>>  	cpu_parse_topology(c);
>>  
>> @@ -2154,6 +2164,17 @@ void enable_sep_cpu(void)
>>  
>>  static __init void identify_boot_cpu(void)
>>  {
>> +	/*
>> +	 * The boot CPU's capabilities were already scanned by early_identify_cpu().
>> +	 * Here identify_cpu() only finalizes them.
>> +	 *
>> +	 * 32-bit still needs the full scan here: it sets X86_BUG_ESPFIX (via
>> +	 * generic_identify()) and the no-CPUID cpuid_level default, which the early
>> +	 * path does not.
>> +	 */
>> +#ifndef CONFIG_X86_64
>> +	identify_cpu_scan(&boot_cpu_data);
>> +#endif
>>  	identify_cpu(&boot_cpu_data);
>>  	if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
>>  		pr_info("CET detected: Indirect Branch Tracking enabled\n");
>> @@ -2178,6 +2199,7 @@ void identify_secondary_cpu(unsigned int cpu)
>>  		*c = boot_cpu_data;
>>  	c->cpu_index = cpu;
>>  
>> +	identify_cpu_scan(c);
>>  	identify_cpu(c);
>>  #ifdef CONFIG_X86_32
>>  	enable_sep_cpu();
>> -- 
> 
> Ok, thanks for that, that's a good first try. I did poke at it a bit and
> here's what I think should happen:
> 
> 1. The part which initializes struct cpuinfo_x86 up and including the memsets
>    should be one function called init_cpu_info() or so.
> 
> 2. Then, generic_identify() should be merged into identify_cpu() basically
>    turning the latter into a single function which does a generic CPU
>    identification both on the BSP and on the APs.
> 
> 3. #ifdef CONFIG_X86_32
>         enable_sep_cpu();
> #endif 
> 	that gunk should be stuck into a function called identify_cpu_32() and
> 	you'll have at the beginning of it
> 
> 	if (!IS_ENABLED(CONFIG_X86_32))
> 		return;
> 
>    and then you call that function both in identify_boot_cpu() and
>    identify_secondary_cpu(). This way you streamline the paths and drop all
>    that ugly ifdeffery.
> 
> 4. When you do 3. above, you should be able to move this gunk
> 
> 	#ifdef CONFIG_X86_32
>         set_cpu_bug(c, X86_BUG_ESPFIX);
> 	#endif
> 
> to it too. And now everything is nicely encapsulated and straight-forward.
> 
> 5. The above will allow you to have the init_cpu_info() only on 32-bit.
> 
> Oh and, looking at the ifdeffery on identify_cpu():
> 
> 	#ifdef CONFIG_X86_64
> 	        c->x86_clflush_size = 64;
> 	        c->x86_phys_bits = 36;
> 	        c->x86_virt_bits = 48;
> 	#else
> 
> you could probably add a identify_cpu_64() too. Or maybe the init parts should
> be called separately init_cpu_info_32() and init_cpu_info_64(). You probably
> need to see how it looks like when you write it.
> 
> Oh, and each 1., 2., ...step above is a single patch. This'll make the review
> very easy too.
> 
> How does that sound? Willing to give it a try? :-)

The instructions are quite detailed, thank you.

I'll try it and will send a separate series as soon as I can (likely next week).

> 
> If not, I could try to find some time and do it myself but I thought you might
> be willing to try it...
> 
> :-)
> 
> Thx.
> 


^ permalink raw reply

* Re: [PATCH v1] kasan: Fix false-positive wild-memory-access on x86 under 5-level paging
From: Borislav Petkov @ 2026-06-24 23:45 UTC (permalink / raw)
  To: Ihor Solodrai
  Cc: Dave Hansen, Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Andrey Ryabinin,
	Andrew Morton, bpf, kasan-dev, linux-mm, linux-kernel,
	Thomas Gleixner, Ingo Molnar, Dave Hansen, x86, H. Peter Anvin,
	Andrey Konovalov
In-Reply-To: <c49b1499-6aa0-4e17-8385-cd2bcd2220cc@linux.dev>

On Wed, Jun 24, 2026 at 03:45:15PM -0700, Ihor Solodrai wrote:
> The instructions are quite detailed, thank you.

I hope... the devil's in the detail as always so I might be talking a lot of
crap too, as usual. You'll know better when you start poking at the code.

> I'll try it and will send a separate series as soon as I can (likely next week).

No worries, take your time.

And ofc, if we decide we need a fix for stable, you could make a one-liner be
the first patch which maybe which disables KASAN inline  when 5level page
tables have been detected. Or something to that effect.

And then the last patch will remove that fix.

That is, iff there's even a need to have something in stable. I don't know how
widespread kasan inline is...

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply

* Re: [PATCH bpf-next v2 08/17] bpf: Report Register Type Safety errors
From: Eduard Zingerman @ 2026-06-24 23:55 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, bpf
  Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Emil Tsalapatis, kkd, kernel-team
In-Reply-To: <20260619205934.1312876-9-memxor@gmail.com>

On Fri, 2026-06-19 at 22:59 +0200, Kumar Kartikeya Dwivedi wrote:
> Augment selected register-state verifier failures with Register Type Safety
> reports. The existing verbose verifier messages remain in place; the new
> reports add reason, source context, causal path, and suggestions.
> 
> Cover invalid pointer dereferences, unreadable registers, missing outgoing
> stack arguments for bpf2bpf and kfunc calls, and rejected pointer arithmetic.
> Use scoped diagnostic history so reports start from the latest relevant value
> change and then show later branch outcomes.
> 
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---

Acked-by: Eduard Zingerman <eddyz87@gmail.com>

>  kernel/bpf/diagnostics.c | 234 ++++++++++++++++++++++++++++++++++++++-
>  kernel/bpf/diagnostics.h |  22 ++++
>  kernel/bpf/verifier.c    | 178 +++++++++++++++++++++++++++--
>  3 files changed, 423 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/bpf/diagnostics.c b/kernel/bpf/diagnostics.c
> index d6a9a2315f54..92a4aef3cc90 100644
> --- a/kernel/bpf/diagnostics.c
> +++ b/kernel/bpf/diagnostics.c
> @@ -837,6 +837,217 @@ void bpf_diag_report_source(struct bpf_verifier_env *env, u32 insn_idx,
>  	kfree(msg);
>  }
>  
> +static u32 bpf_diag_current_frameno(const struct bpf_verifier_env *env)
> +{
> +	return env->cur_state->frame[env->cur_state->curframe]->frameno;
> +}
> +
> +static int bpf_diag_stack_argno(u8 slot);

Nit: move the initial definition?

> +void bpf_diag_report_register_type(struct bpf_verifier_env *env,
> +				   u32 insn_idx, int regno,
> +				   const char *problem, const char *reason,
> +				   const char *suggestion)

[...]

> +static void bpf_diag_format_stack_arg(char *buf, size_t size, u8 slot)
> +{
> +	int argno = bpf_diag_stack_argno(slot);
> +	const char *ordinal = bpf_diag_arg_ordinal(argno);
> +
> +	if (ordinal)
> +		scnprintf(buf, size, "outgoing stack argument %u (%s argument)",
> +			  slot + 1, ordinal);
> +	else
> +		scnprintf(buf, size, "outgoing stack argument %u", slot + 1);

Nit: for regular arguments you also fetch the argument's name.

> +}

[...]

> @@ -1726,7 +1937,24 @@ void bpf_diag_print_history(struct bpf_verifier_env *env,
>  		}
>  	}
>  
> -	if (!printed)
> -		bpf_diag_write(env,
> -			       "  no retained diagnostic events on this path\n");
> +	if (!printed) {
> +		if (opts && opts->scope == BPF_DIAG_HISTORY_SCOPE_STACK_ARG &&
> +		    opts->stack_arg_slot >= 0) {

Nit: I'd skip this block entirely, the error message already says that
     the argument is not provided.

> +			const char *arg_buf;
> +
> +			arg_buf = bpf_diag_scratch_buf(env, 0, NULL);
> +			if (arg_buf)
> +				bpf_diag_format_stack_arg((char *)arg_buf,
> +							  BPF_DIAG_SCRATCH_STR_LEN,
> +							  opts->stack_arg_slot);
> +			else
> +				arg_buf = "this outgoing stack argument";
> +			bpf_diag_write(env,
> +				       "  no retained writes for %s on this path\n",
> +				       arg_buf);
> +		} else {
> +			bpf_diag_write(env,
> +				       "  no retained diagnostic events on this path\n");
> +		}
> +	}
>  }

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index e584dec04b34..2c5f24528071 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c

[...]

> @@ -4362,6 +4368,12 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
>  	if (!fixed_off_ok && reg->var_off.value != 0) {
>  		verbose(env, "dereference of modified %s ptr %s off=%lld disallowed\n",
>  			reg_type_str(env, reg->type), reg_arg_name(env, argno), reg->var_off.value);
> +		bpf_diag_report_invalid_deref(env, env->insn_idx,
> +					      reg_from_argno(argno),
> +					      reg_arg_name(env, argno),
> +					      reg_type_str(env, reg->type),

Should this use bpf_diag_reg_type_plain()?
Also, maybe pass reg->type to the bpf_diag_report_invalid_deref()
and convert it to string there?

> +					      BPF_DIAG_DEREF_MODIFIED_PTR,
> +					      reg->var_off.value);
>  		return -EACCES;
>  	}
>  

[...]

^ permalink raw reply

* Re: [PATCH bpf-next v2 16/17] bpf: Report Verifier Internal errors
From: Eduard Zingerman @ 2026-06-25  0:26 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, bpf
  Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Emil Tsalapatis, kkd, kernel-team
In-Reply-To: <20260619205934.1312876-17-memxor@gmail.com>

On Fri, 2026-06-19 at 22:59 +0200, Kumar Kartikeya Dwivedi wrote:

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index ec51e0c81053..5c60fe033271 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -8481,6 +8481,9 @@ static int check_reg_type(struct bpf_verifier_env *env, struct bpf_reg_state *re
>  				verbose(env, "verifier internal error:");
>  				verbose(env, "%s has non-overwritten BPF_PTR_POISON type\n",
>  					reg_arg_name(env, argno));
> +				bpf_diag_report_internal_error(env, env->insn_idx,
> +							       "poisoned helper argument type",
> +							       "helper type checking reached a non-overwritten BPF_PTR_POISON expected type.");

The cases below should be converted to use verifier_bug() utility function.
I'm not sure if we want elaborate error reporting in this case, but in case
we do, the verifier_bug() is the right place to inject the call to
bpf_diag_report_internal_error().

>  				return -EACCES;
>  			}
>  
> @@ -12560,12 +12563,19 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
>  	default:
>  		verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
>  			btf_field_type_name(node_field_type));
> +		bpf_diag_report_internal_error(env, env->insn_idx,
> +					       "unexpected graph node argument type",
> +					       "the kfunc graph-node checker saw an unsupported graph node field type.");
>  		return false;
>  	}
>  
> -	if (!ret)
> +	if (!ret) {
>  		verbose(env, "verifier internal error: %s node arg for unknown kfunc\n",
>  			btf_field_type_name(node_field_type));
> +		bpf_diag_report_internal_error(env, env->insn_idx,
> +					       "graph node argument for unknown kfunc",
> +					       "the kfunc graph-node checker could not map this graph node argument to the called kfunc.");
> +	}
>  	return ret;
>  }
>  

^ permalink raw reply

* Re: [PATCH bpf-next v2 14/17] bpf: Report Policy helper and kfunc errors
From: Eduard Zingerman @ 2026-06-25  0:36 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, bpf
  Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Emil Tsalapatis, kkd, kernel-team
In-Reply-To: <20260619205934.1312876-15-memxor@gmail.com>

On Fri, 2026-06-19 at 22:59 +0200, Kumar Kartikeya Dwivedi wrote:
> Augment selected helper and kfunc allowability failures with Policy reports.
> These reports explain which requested operation is forbidden and why, without
> adding path history for non-path-dependent policy checks.
> 
> Cover unprivileged bpf2bpf and kfunc use, helper program-type restrictions,
> GPL-only helpers, helper-specific allow callbacks, kfunc allowability, and
> destructive kfunc capability checks.
> 
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---

Acked-by: Eduard Zingerman <eddyz87@gmail.com>

[...]

^ permalink raw reply

* Re: [PATCH bpf-next v2 13/17] bpf: Report Program Structure CFG errors
From: Eduard Zingerman @ 2026-06-25  0:47 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, bpf
  Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Emil Tsalapatis, kkd, kernel-team
In-Reply-To: <20260619205934.1312876-14-memxor@gmail.com>

On Fri, 2026-06-19 at 22:59 +0200, Kumar Kartikeya Dwivedi wrote:
> Augment selected subprogram CFG validation failures with Program Structure
> reports. These errors are structural rather than path-dependent, so the report
> focuses on source and instruction context instead of causal history.
> 
> Cover jumps that leave the current subprogram, subprograms whose last
> instruction can fall through into the next subprogram, and recursive bpf2bpf
> call graph edges.
> 
> Format long jump-range reasons directly in diagnostics.c, and keep the
> fallthrough suggestion aligned with the verifier check by suggesting exit or
> explicit jumps.
> 
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---

Acked-by: Eduard Zingerman <eddyz87@gmail.com>

[...]

> diff --git a/kernel/bpf/cfg.c b/kernel/bpf/cfg.c
> index 26d37066465f..032b3dc56f2e 100644

[...]

> @@ -136,6 +143,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
>  		verbose_linfo(env, t, "%d: ", t);
>  		verbose_linfo(env, w, "%d: ", w);
>  		verbose(env, "back-edge from insn %d to %d\n", t, w);
> +		bpf_diag_report_program_structure(env, t,
> +						  "back-edge is not allowed",
> +						  "Rewrite the control flow as a verifier-supported bounded loop, or load with privileges that allow this back-edge.",

As far as I understand, this error fires if a control flow graph loop
is detected while loading a program in an unprivileged mode.
In such a context suggestion to "Rewrite ... as verifier-supported ... loop"
is not actionable, I'd rephrase it as
"Avoid loops for unprivileged programs or load with privileges ...".

> +						  "Instruction %d branches back to instruction %d. This program is being rejected without the privilege needed for this back-edge.",
> +						  t, w);
>  		return -EINVAL;
>  	} else if (insn_state[w] == EXPLORED) {
>  		/* forward- or cross-edge */
> @@ -316,6 +328,11 @@ static struct bpf_iarray *jt_from_subprog(struct bpf_verifier_env *env,
>  
>  	if (!jt) {
>  		verbose(env, "no jump tables found for subprog starting at %u\n", subprog_start);
> +		bpf_diag_report_program_structure(env, subprog_start,
> +						  "missing jump table",
> +						  "Provide a jump table whose entries target this subprogram.",

I'd rephrase it as "Make sure that subprograms containing gotox
instructions are accompanied by jump tables referencing these
subprograms.". (or "... derive jump target from a jump table
referencing the subprogram"?).

> +						  "No jump table was found for the subprogram that starts at instruction %u.",
> +						  subprog_start);
>  		return ERR_PTR(-EINVAL);
>  	}
>  

[...]

^ permalink raw reply

* Re: [PATCH net] net: mana: Fall back to standard MTU when PF reports adapter_mtu of 0
From: patchwork-bot+netdevbpf @ 2026-06-25  1:08 UTC (permalink / raw)
  To: Erni Sri Satya Vennela
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, dipayanroy, ssengar, jacob.e.keller,
	horms, gargaditya, kees, linux-hyperv, netdev, linux-kernel, bpf
In-Reply-To: <20260619055348.467224-1-ernis@linux.microsoft.com>

Hello:

This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Thu, 18 Jun 2026 22:53:38 -0700 you wrote:
> Commit d7709812e13d ("net: mana: hardening: Validate adapter_mtu from
> MANA_QUERY_DEV_CONFIG") rejected any adapter_mtu value smaller than
> ETH_MIN_MTU + ETH_HLEN, including 0, returning -EPROTO and failing
> mana_probe().
> 
> Some older PF firmware versions still in the field report
> adapter_mtu as 0 in the MANA_QUERY_DEV_CONFIG response. With the
> hardening check in place, the MANA VF driver now fails to load on
> those hosts, breaking networking entirely for guests.
> 
> [...]

Here is the summary with links:
  - [net] net: mana: Fall back to standard MTU when PF reports adapter_mtu of 0
    https://git.kernel.org/netdev/net/c/6bd81a5b4e0d

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH bpf-next v2 09/17] bpf: Report Memory Safety bounds errors
From: Eduard Zingerman @ 2026-06-25  1:30 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, bpf
  Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Emil Tsalapatis, kkd, kernel-team
In-Reply-To: <20260619205934.1312876-10-memxor@gmail.com>

On Fri, 2026-06-19 at 22:59 +0200, Kumar Kartikeya Dwivedi wrote:
> Augment selected memory-range verifier failures with Memory Safety reports
> while preserving the existing terse verifier messages for compatibility.
> 
> Cover stack spill corruption, uninitialized stack reads, variable stack helper
> accesses, and check_mem_region_access() range-proof failures. The bounds report
> spells out the required offset + access_size <= object_size proof with concrete
> values and uses scoped diagnostic history for causal context.
> 
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---

[...]

> @@ -1541,6 +1555,153 @@ static void bpf_diag_format_scalar_range(struct bpf_diag_reg_fmt *fmt,

[...]

> +static void bpf_diag_format_access_offset(struct bpf_verifier_env *env,
> +					  char *buf, size_t size, int off,
> +					  const struct bpf_reg_state *reg)
> +{
> +	struct bpf_diag_scratch *scratch = bpf_diag_scratch(env);
> +	struct bpf_diag_reg_fmt *fmt;
> +	char *start;
> +
> +	if (tnum_is_const(reg->var_off)) {
> +		start = bpf_diag_scratch_buf(env, 2,
> +					     NULL);
> +		if (!start) {

Nit: this is an impossible branch, bpf_diag_format_access_offset()
     is called from bpf_diag_report_mem_bounds() under condition
     that bpf_diag_enabled(env) is true.

> +			scnprintf(buf, size, "constant");
> +			return;
> +		}
> +		bpf_diag_format_s64_sum(start, BPF_DIAG_SCRATCH_STR_LEN,
> +					(s64)reg->var_off.value, off);
> +		scnprintf(buf, size, "constant %s", start);
> +		return;
> +	}
> +
> +	if (tnum_is_unknown(reg->var_off) &&
> +	    bpf_diag_cnum64_unknown(reg->r64)) {
> +		scnprintf(buf, size, "unknown");

Nit: I'd use "unbounded".

> +		return;
> +	}
> +
> +	fmt = &scratch->reg_fmt;
> +	memset(fmt, 0, sizeof(*fmt));
> +
> +	bpf_diag_format_scalar_range(fmt, fmt->range, sizeof(fmt->range),
> +				     reg->r64);
> +	if (off)
> +		scnprintf(buf, size,
> +			  "variable: known bits %#llx, unknown mask %#llx, plus fixed offset %d; %s",
> +			  (u64)reg->var_off.value, reg->var_off.mask, off,
> +			  fmt->range);
> +	else
> +		scnprintf(buf, size,
> +			  "variable: known bits %#llx, unknown mask %#llx; %s",
> +			  (u64)reg->var_off.value, reg->var_off.mask,
> +			  fmt->range);
> +}
> +
> +static u64 bpf_diag_mem_max_start(const struct bpf_reg_state *reg, int off)

Could you please elaborate a bit why is this function needed?
It is only called from

  > +	case BPF_DIAG_MEM_MAX_OUT_OF_RANGE:
  > +	default:
  > +		max_start = bpf_diag_mem_max_start(reg, off);
  > +		max_end = max_start + size;
  > +		scnprintf(proof, BPF_DIAG_SCRATCH_STR_LEN,
  > +			  "the largest possible access ends at %llu: start %llu + access_size %d, beyond object_size %u",
  > +			  max_end, max_start, size, mem_size);
  > +		break;

And there it would report "start 0 + access_size 10" in case of negative offset.
Which is not verifier actually checks for:

  __check_mem_access(env, reg, argno, reg_umax(reg) + off, ...)
                                      ^^^^^^^^^^^^^^^^^^^
                                  this parameter is declared as 'int'
> +{
> +	/* A negative fixed offset can clamp the maximum start to zero when
> +	 * the unsigned variable maximum is smaller than -off.
> +	 */
> +	if (off < 0 && reg_umax(reg) < (u64)-off)
> +		return 0;
> +	return reg_umax(reg) + off;
> +}
> +
> +void bpf_diag_report_mem_bounds(struct bpf_verifier_env *env, u32 insn_idx,
> +				int regno, const char *reg_name,
> +				const char *type_name,
> +				enum bpf_diag_mem_bounds_kind kind,
> +				int off, int size, u32 mem_size,
> +				const struct bpf_reg_state *reg)
> +{
> +	struct bpf_diag_history_opts opts = {
> +		.scope = BPF_DIAG_HISTORY_SCOPE_REG,
> +		.frameno = bpf_diag_current_frameno(env),
> +		.regno = regno,
> +	};
> +	char *offset_desc, *proof, *start;
> +	u64 max_start, max_end;
> +
> +	if (!bpf_diag_enabled(env))
> +		return;
> +
> +	offset_desc = bpf_diag_scratch_buf(env, 0, NULL);
> +	proof = bpf_diag_scratch_buf(env, 1, NULL);
> +	start = bpf_diag_scratch_buf(env, 2, NULL);
> +	if (!offset_desc || !proof || !start)
> +		return;
> +
> +	switch (kind) {
> +	case BPF_DIAG_MEM_NEGATIVE_MIN:
> +		bpf_diag_format_s64_sum(start, BPF_DIAG_SCRATCH_STR_LEN, reg_smin(reg),
> +					off);
> +		scnprintf(proof, BPF_DIAG_SCRATCH_STR_LEN,
> +			  "the smallest possible access starts at %s, below 0",

Nit: "the minimal bound for a memory access is a negative value: %s".

> +			  start);
> +		break;
> +	case BPF_DIAG_MEM_MIN_OUT_OF_RANGE:
> +		bpf_diag_format_s64_sum(start, BPF_DIAG_SCRATCH_STR_LEN, reg_smin(reg),
> +					off);
> +		scnprintf(proof, BPF_DIAG_SCRATCH_STR_LEN,
> +			  "the smallest possible access starts at %s, outside object_size %u",

Nit: "the minimal bound for a memory access is %s and is outside of the object of size %u"

> +			  start, mem_size);
> +		break;
> +	case BPF_DIAG_MEM_UNBOUNDED:
> +		scnprintf(proof, BPF_DIAG_SCRATCH_STR_LEN,
> +			  "%s has unsigned maximum %llu, which exceeds BPF_MAX_VAR_OFF %u",

Nit: "the maximal bound for a memory access is %llu and exceeds maximum allowed offset of %u".

> +			  reg_name, reg_umax(reg), BPF_MAX_VAR_OFF);
> +		break;
> +	case BPF_DIAG_MEM_MAX_OUT_OF_RANGE:
> +	default:

Nit: compounding this with 'default:' looks strange.

> +		max_start = bpf_diag_mem_max_start(reg, off);
> +		max_end = max_start + size;
> +		scnprintf(proof, BPF_DIAG_SCRATCH_STR_LEN,
> +			  "the largest possible access ends at %llu: start %llu + access_size %d, beyond object_size %u",
> +			  max_end, max_start, size, mem_size);
> +		break;
> +	}
> +

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2c5f24528071..af04709c5178 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3527,6 +3527,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
>  	    !bpf_is_spilled_scalar_reg(&state->stack[spi]) &&
>  	    size != BPF_REG_SIZE) {
>  		verbose(env, "attempt to corrupt spilled pointer on stack\n");
> +		bpf_diag_report_memory(env, insn_idx,
> +				       "stack spill corruption",
> +				       bpf_diag_scratch_printf(env,
> +							       0,
> +							       "This store writes %d bytes at stack offset %d into a stack slot that currently holds a spilled pointer. Partial writes to spilled pointers are rejected because they can corrupt pointer metadata and leak kernel pointers.",
> +							       size, off),
> +				       "Write the full 8-byte spilled pointer slot, or use a separate stack slot for scalar data before overwriting only part of it.");
>  		return -EACCES;
>  	}
>  
> @@ -3811,6 +3818,18 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env,
>  	}
>  }
>  
> +static void bpf_diag_report_stack_read_uninit(struct bpf_verifier_env *env,
> +					      int off, int i, int size)
> +{
> +	bpf_diag_report_memory(env, env->insn_idx,
> +			       "uninitialized stack read",
> +			       bpf_diag_scratch_printf(env,
> +						       0,
> +						       "This rejected read uses %d bytes at stack offset %d, but byte %d in that range is uninitialized on this path. Programs with sufficient privilege can be allowed to read uninitialized stack bytes, but this program is being rejected without that allowance.",

Nit: instead of "sufficient privilege" I'd refer to CAP_BPF (or which one is it?).

> +						       size, off, i),
> +			       "Initialize every byte in the stack range before reading it, adjust the offset and size so the read covers only initialized bytes, or load with the privilege needed for uninitialized stack reads.");
> +}
> +
>  /* Read the stack at 'off' and put the results into the register indicated by
>   * 'dst_regno'. It handles reg filling if the addressed stack slot is a
>   * spilled reg.
> @@ -3896,9 +3915,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
>  					if (type == STACK_POISON) {
>  						verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n",
>  							off, i, size);
> +						return -EFAULT;

sashiko is right to call this out as a behavioral change,
but it is a correct changer nonetheless.

>  					} else {
>  						verbose(env, "invalid read from stack off %d+%d size %d\n",
>  							off, i, size);
> +						bpf_diag_report_stack_read_uninit(env, off, i,
> +										  size);
>  					}
>  					return -EACCES;
>  				}
> @@ -3951,9 +3973,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
>  			if (type == STACK_POISON) {
>  				verbose(env, "reading from stack off %d+%d size %d, slot poisoned by dead code elimination\n",
>  					off, i, size);
> +				return -EFAULT;
>  			} else {
>  				verbose(env, "invalid read from stack off %d+%d size %d\n",
>  					off, i, size);
> +				bpf_diag_report_stack_read_uninit(env, off, i,
> +								  size);
>  			}
>  			return -EACCES;
>  		}
> @@ -4045,6 +4070,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
>  		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
>  		verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n",
>  			tn_buf, off, size);
> +		bpf_diag_report_memory(env, env->insn_idx,
> +				       "variable stack access",
> +				       bpf_diag_scratch_printf(env,
> +							       0,
> +							       "The helper would access the stack through variable offset %s plus fixed offset %d and size %d. Helper stack memory arguments require a constant stack offset and a precise initialized range.",
> +							       tn_buf, off, size),
> +				       "Use a fixed stack offset for helper memory arguments, or copy the needed bytes into a fixed stack slot first.");
>  		return -EACCES;
>  	}
>  	/* Variable offset is prohibited for unprivileged mode for simplicity
> @@ -4312,6 +4344,12 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_
>  	      reg_smin(reg) + off < 0)) {
>  		verbose(env, "%s min value is negative, either use unsigned index or do a if (index >=0) check.\n",
>  			reg_arg_name(env, argno));
> +		bpf_diag_report_mem_bounds(env, env->insn_idx,
> +					   reg_from_argno(argno),
> +					   reg_arg_name(env, argno),
> +					   reg_type_str(env, reg->type),
> +					   BPF_DIAG_MEM_NEGATIVE_MIN,
> +					   off, size, mem_size, reg);

Nit: avoid duplicating this 10 parameters call?

        err = -EACCESS;
        kind = BPF_DIAG_MEM_NEGATIVE_MIN;
        goto report_error;

  report_error:
        bpf_diag_report_mem_bounds(...);
        return err;

?

>  		return -EACCES;
>  	}
>  	err = __check_mem_access(env, reg, argno, reg_smin(reg) + off, size,
> @@ -4319,6 +4357,12 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_
>  	if (err) {
>  		verbose(env, "%s min value is outside of the allowed memory range\n",
>  			reg_arg_name(env, argno));
> +		bpf_diag_report_mem_bounds(env, env->insn_idx,
> +					   reg_from_argno(argno),
> +					   reg_arg_name(env, argno),
> +					   reg_type_str(env, reg->type),
> +					   BPF_DIAG_MEM_MIN_OUT_OF_RANGE,
> +					   off, size, mem_size, reg);
>  		return err;
>  	}
>  
> @@ -4329,6 +4373,12 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_
>  	if (reg_umax(reg) >= BPF_MAX_VAR_OFF) {
>  		verbose(env, "%s unbounded memory access, make sure to bounds check any such access\n",
>  			reg_arg_name(env, argno));
> +		bpf_diag_report_mem_bounds(env, env->insn_idx,
> +					   reg_from_argno(argno),
> +					   reg_arg_name(env, argno),
> +					   reg_type_str(env, reg->type),
> +					   BPF_DIAG_MEM_UNBOUNDED,
> +					   off, size, mem_size, reg);
>  		return -EACCES;
>  	}
>  	err = __check_mem_access(env, reg, argno, reg_umax(reg) + off, size,
> @@ -4336,6 +4386,12 @@ static int check_mem_region_access(struct bpf_verifier_env *env, struct bpf_reg_
>  	if (err) {
>  		verbose(env, "%s max value is outside of the allowed memory range\n",
>  			reg_arg_name(env, argno));
> +		bpf_diag_report_mem_bounds(env, env->insn_idx,
> +					   reg_from_argno(argno),
> +					   reg_arg_name(env, argno),
> +					   reg_type_str(env, reg->type),
> +					   BPF_DIAG_MEM_MAX_OUT_OF_RANGE,
> +					   off, size, mem_size, reg);
>  		return err;
>  	}
>  

^ permalink raw reply

* Re: [PATCH net 0/7] xsk: fix AF_XDP multi-buffer Tx descriptor reclaim
From: Jason Xing @ 2026-06-25  1:33 UTC (permalink / raw)
  To: Maciej Fijalkowski
  Cc: Stanislav Fomichev, netdev, bpf, magnus.karlsson, stfomichev,
	kuba, pabeni, horms, bjorn
In-Reply-To: <ajwHuCp82SazSwKv@boxer>

On Thu, Jun 25, 2026 at 12:37 AM Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> On Wed, Jun 24, 2026 at 08:38:20AM -0700, Stanislav Fomichev wrote:
> > On 06/23, Maciej Fijalkowski wrote:
> > > Hi,
> > >
> > > This series fixes several AF_XDP multi-buffer Tx paths where descriptors
> > > consumed from the Tx ring are not consistently returned to userspace
> > > through the completion ring when the packet is later dropped as invalid.
> > >
> > > The affected cases are invalid or oversized multi-buffer Tx packets in
> > > both the generic and zero-copy paths. In these cases, the kernel can
> > > consume one or more Tx descriptors while building or validating a
> > > multi-buffer packet, then drop the packet before it reaches the device.
> > > Userspace still owns the UMEM buffers only after the corresponding
> > > addresses are returned through the CQ. Missing completions therefore
> > > make userspace lose track of those buffers.
> > >
> > > The generic path fixes cover three related cases:
> > > * partially built multi-buffer skbs dropped by xsk_drop_skb();
> > >   continuation descriptors left in the Tx ring after xsk_build_skb()
> > >   reports overflow;
> > > * invalid descriptors encountered in the middle of a multi-buffer
> > >   packet, including the offending invalid descriptor itself.
> > >
> > > The zero-copy path is handled separately. The batched Tx parser now
> > > distinguishes descriptors that can be passed to the driver from
> > > descriptors that are consumed only because they belong to an invalid
> > > multi-buffer packet. Reclaim-only descriptors are written to the CQ
> > > address area and published in completion order, after any earlier
> > > driver-visible Tx descriptors.
> > >
> > > The ZC batching path can also retain drain state when userspace has not
> > > yet provided the end of an invalid multi-buffer packet. To keep this
> > > state local to the singular batched path, the series prevents a second
> > > Tx socket from joining the same pool while such drain state exists.
> > > During the singular-to-shared transition, Tx batching is gated,
> > > pre-existing readers are waited out, and bind fails with -EAGAIN if the
> > > existing socket still has pending drain state. This avoids adding
> > > multi-buffer drain handling to the shared-UMEM fallback path.
> > >
> > > The last two patches update xskxceiver so the tests account invalid
> > > multi-buffer Tx packets as descriptors that must be reclaimed, while
> > > still not expecting those invalid packets on the Rx side.
> > >
> > > This is a follow-up to Jason's changes [0] which were addressing generic
> > > xmit only and this set allows me to pass full xskxceiver test suite run
> > > against ice driver.
> >
> > There is a fair amount of feedback from sashiko already :-( So the meta
> > question from me is: is it time to scrap our current approach where
> > we parse descriptor by descriptor? (and maintain half-baked skb and
> > half-consumed descriptor queues)
> >
> > Should we:
> >
> > 1. do desc[MAX_SKB_FRAGS] and xskq_cons_peek_desc until we exhaust
> > PKT_CONT (if the last packet has PKT_CONT, return EOVERFLOW to userspace
> > and do a full stop here)
> > 2. now that we really know the number of valid descriptors -> reserve
> > the cq space (if not -> EAGAIN)
> > 3. pre-allocate everything here (if at any point we have ENOMEM -> cleanup
> > locally, don't ever create semi-initialized skb)
> > 4. construct the skb
> > 5. xmit
>
> Yeah generic xmit became utterly horrible, haven't gone through sashiko
> reviews yet, but bare in mind this set also aligns zc side to what was
> previously being addressed by Jason.
>
> I believe planned logistics were to get these fixes onto net and then
> Jason had an implementation of batching on generic xmit, directed towards
> -next and that's where we could address current flow.

Agreed. That's what I'm hoping for. There would be much more
discussion on how to do batch xmit in an elegant way, I believe.

Thanks,
Jason

>
> >
> > If at any point there is an issue, the cleanup is straightforward. That
> > whole xk->skb goes away, no state between syscalls. Thoughts?

^ permalink raw reply

* Re: [PATCH 1/2] bpf: preserve rx_queue_index across XDP redirects
From: Jakub Kicinski @ 2026-06-25  1:54 UTC (permalink / raw)
  To: Siddharth C; +Cc: ast, hawk, andrii, netdev, bpf, linux-kernel, linux-kselftest
In-Reply-To: <20260620121321.45227-2-siddharthcibi@icloud.com>

On Sat, 20 Jun 2026 12:13:13 +0000 Siddharth C wrote:
> diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
> index 5e59ab896f05..8f2d7013620f 100644
> --- a/kernel/bpf/cpumap.c
> +++ b/kernel/bpf/cpumap.c
> @@ -197,7 +197,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
>  
>  		rxq.dev = xdpf->dev_rx;
>  		rxq.mem.type = xdpf->mem_type;
> -		/* TODO: report queue_index to xdp_rxq_info */
> +		rxq.queue_index = xdpf->rx_queue_index;

Do you actually need this or you're just trying to address the TODO?
You can always store the info you need in the metadata prepend.

>  		xdp_convert_frame_to_buff(xdpf, &xdp);
>  
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index dc7b859e8bbf..f419fa0e53e5 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -339,7 +339,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>  				struct net_device *rx_dev)
>  {
>  	struct xdp_txq_info txq = { .dev = tx_dev };
> -	struct xdp_rxq_info rxq = { .dev = rx_dev };
> +	struct xdp_rxq_info rxq = { };
>  	struct xdp_buff xdp;
>  	int i, nframes = 0;
>  
> @@ -349,6 +349,9 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
>  		int err;
>  
>  		xdp_convert_frame_to_buff(xdpf, &xdp);
> +		rxq.dev = rx_dev;
> +		rxq.mem.type = xdpf->mem_type;

Why are you setting mem_type?

> +		rxq.queue_index = xdpf->rx_queue_index;
>  		xdp.txq = &txq;
>  		xdp.rxq = &rxq;

^ permalink raw reply

* Re: [PATCH bpf-next v2 10/17] bpf: Report Resource Lifetime reference leaks
From: Eduard Zingerman @ 2026-06-25  1:58 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi, bpf
  Cc: Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Emil Tsalapatis, kkd, kernel-team
In-Reply-To: <20260619205934.1312876-11-memxor@gmail.com>

On Fri, 2026-06-19 at 22:59 +0200, Kumar Kartikeya Dwivedi wrote:

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

[...]

> @@ -1120,6 +1124,15 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
>  
>  		verbose(env, "irq flag acquired by %s kfuncs cannot be restored with %s kfuncs\n",
>  			flag_kfunc, used_kfunc);
> +		bpf_diag_report_irq_resource_state(env, env->insn_idx,
> +						   "IRQ flag restore mismatch",
> +						   bpf_diag_scratch_printf(env,
> +									   0,
> +									   "This IRQ flag was saved by %s IRQ kfuncs, but the restore call belongs to the %s IRQ kfunc family. Save and restore operations must use the same family.",
> +									   flag_kfunc,
> +									   used_kfunc),
> +						   "Restore the flag with the matching IRQ restore kfunc for the save operation that created it.",
> +						   env->cur_state->active_irq_id ? 1 : 0);
>  		return -EINVAL;
>  	}
>  
> @@ -1137,6 +1150,11 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r
>  
>  		verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n",
>  			env->cur_state->active_irq_id, insn_idx);
> +		bpf_diag_report_irq_resource_state(env, env->insn_idx,
> +						   "IRQ flag restore out of order",
> +						   "IRQ-disabled regions must be restored in last-in, first-out order, but this restore does not match the currently active IRQ flag.",
> +						   "Restore nested IRQ flags in the reverse order they were saved.",
> +						   env->cur_state->active_irq_id ? 1 : 0);
>  		return err;
>  	}
>  
> @@ -7258,11 +7276,19 @@ static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state
>  			if (find_lock_state(env->cur_state, REF_TYPE_LOCK, 0, NULL)) {
>  				verbose(env,
>  					"Locking two bpf_spin_locks are not allowed\n");
> +				bpf_diag_report_resource_state(env, env->insn_idx,
> +							       "nested spin lock",
> +							       "This path already holds a bpf_spin_lock. The verifier allows only one regular BPF spin lock at a time.",
> +							       "Unlock the current bpf_spin_lock before taking another one.");

It might be helpful to report where the previous spin lock had been acquired,
however bpf_diag_report_resource_state() does not inspect the events history.

>  				return -EINVAL;
>  			}
>  		} else if (is_res_lock && cur->active_locks) {
>  			if (find_lock_state(env->cur_state, REF_TYPE_RES_LOCK | REF_TYPE_RES_LOCK_IRQ, reg->id, ptr)) {
>  				verbose(env, "Acquiring the same lock again, AA deadlock detected\n");
> +				bpf_diag_report_resource_state(env, env->insn_idx,
> +							       "recursive resource spin lock",
> +							       "This path already holds the same resource spin lock. Taking it again would deadlock.",
> +							       "Avoid reacquiring the same resource spin lock before it is unlocked.");

Same here.

>  				return -EINVAL;
>  			}
>  		}

[...]

> @@ -7300,14 +7330,26 @@ static int process_spin_lock(struct bpf_verifier_env *env, struct bpf_reg_state
>  			type = REF_TYPE_LOCK;
>  		if (!find_lock_state(cur, type, reg->id, ptr)) {
>  			verbose(env, "%s_unlock of different lock\n", lock_str);
> +			bpf_diag_report_resource_state(env, env->insn_idx,
> +						       "unlock of different lock",
                                                               ^^^^^
                                                           nit: "of a different"
> +						       "This unlock does not match any active lock with the same tracked identity on the current path.",
> +						       "Unlock the same lock object that was most recently acquired.");

Would it be helpful to report which object exactly would verifier
expect to be unlocked?

>  			return -EINVAL;
>  		}
>  		if (reg->id != cur->active_lock_id || ptr != cur->active_lock_ptr) {
>  			verbose(env, "%s_unlock cannot be out of order\n", lock_str);
> +			bpf_diag_report_resource_state(env, env->insn_idx,
> +						       "unlock out of order",
> +						       "Locks must be released in last-in, first-out order, but this unlock does not match the currently active lock.",
> +						       "Release nested locks in the reverse order they were acquired.");
>  			return -EINVAL;
>  		}
>  		if (release_lock_state(env, type, reg->id, ptr)) {
>  			verbose(env, "%s_unlock of different lock\n", lock_str);
> +			bpf_diag_report_resource_state(env, env->insn_idx,
> +						       "unlock of different lock",
> +						       "The verifier could not release a lock state matching this unlock operation.",
> +						       "Pass the same lock object and lock kind that were used for the matching lock operation.");
>  			return -EINVAL;
>  		}
>  
> @@ -7473,6 +7515,10 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat
>  		verbose(env,
>  			"%s expected pointer to stack or const struct bpf_dynptr\n",
>  			reg_arg_name(env, argno));
> +		bpf_diag_report_resource_state(env, insn_idx,
> +					       "invalid dynptr argument",
> +					       "A dynptr argument must be a pointer to a dynptr stack slot or a verifier-provided const struct bpf_dynptr.",
> +					       "Pass the address of a stack dynptr object, or use a const dynptr pointer returned by the verifier-supported path.");

Should the error message point out what's in the register instead of
expected type?

>  		return -EINVAL;
>  	}
>  

[...]

> @@ -7526,6 +7584,10 @@ static int process_dynptr_func(struct bpf_verifier_env *env, struct bpf_reg_stat
>  				"Expected a dynptr of type %s as %s\n",
>  				dynptr_type_str(arg_to_dynptr_type(arg_type)),
>  				reg_arg_name(env, argno));
> +			bpf_diag_report_resource_state(env, insn_idx,
> +						       "wrong dynptr type",
> +						       "The dynptr is initialized, but it was created for a different backing object type than this operation accepts.",
> +						       "Use a dynptr constructor that matches this operation, or call an operation that accepts the dynptr's current type.");

Nit: elaborate on which type is expected and which type the actual dynptr has?

>  			return -EINVAL;
>  		}
>  
> @@ -7594,6 +7656,10 @@ static int process_iter_arg(struct bpf_verifier_env *env, struct bpf_reg_state *
>  	if (reg->type != PTR_TO_STACK) {
>  		verbose(env, "%s expected pointer to an iterator on stack\n",
>  			reg_arg_name(env, argno));
> +		bpf_diag_report_resource_state(env, insn_idx,
> +					       "invalid iterator argument",
> +					       "Iterator state must live in verifier-tracked stack memory, but this argument is not a stack pointer.",
> +					       "Pass the address of a stack iterator object for iterator new, next, and destroy calls.");

Similarly here (and in several places below), report what's actually in the register?

>  		return -EINVAL;
>  	}
>  

[...]

^ permalink raw reply

* [PATCH dwarves v10 0/5] pahole: Encode true signatures in kernel BTF
From: Yonghong Song @ 2026-06-25  2:01 UTC (permalink / raw)
  To: Alan Maguire, Arnaldo Carvalho de Melo, dwarves
  Cc: Alexei Starovoitov, Andrii Nakryiko, bpf, kernel-team

Current vmlinux BTF encoding is based on the source level signatures.
But the compiler may do some optimization and changed the signature.
If the user tried with source level signature, their initial implementation
may have wrong results and then the user need to check what is the
problem and work around it, e.g. through kprobe since kprobe does not
need vmlinux BTF.

Majority of changed signatures are due to dead argument elimination.
The following is a more complex one. The original source signature:
  typedef struct {
        union {
                void            *kernel;
                void __user     *user;
        };
        bool            is_kernel : 1;
  } sockptr_t;
  typedef sockptr_t bpfptr_t;
  static int map_create(union bpf_attr *attr, bpfptr_t uattr) { ... }
After compiler optimization, the signature becomes:
  static int map_create(union bpf_attr *attr, bool uattr__is_kernel) { ... }
In the above, uattr__is_kernel corresponds to 'is_kernel' field in sockptr_t.
This makes it easier for developers to understand what changed.

The new signature needs to properly follow ABI specification based on
locations. Otherwise, that signature should be discarded. For example,

    0x0242f1f7:   DW_TAG_subprogram
                    DW_AT_name      ("memblock_find_in_range")
                    DW_AT_calling_convention        (DW_CC_nocall)
                    DW_AT_type      (0x0242decc "phys_addr_t")
                    ...
    0x0242f22e:     DW_TAG_formal_parameter
                      DW_AT_location        (indexed (0x14a) loclist = 0x005595bc:
                         [0xffffffff87a000f9, 0xffffffff87a00178): DW_OP_reg5 RDI
                         [0xffffffff87a00178, 0xffffffff87a001be): DW_OP_reg14 R14
                         [0xffffffff87a001be, 0xffffffff87a001c7): DW_OP_entry_value(DW_OP_reg5 RDI), DW_OP_stack_value
                         [0xffffffff87a001c7, 0xffffffff87a00214): DW_OP_reg14 R14)
                      DW_AT_name    ("start")
                      DW_AT_type    (0x0242decc "phys_addr_t")
                      ...
    0x0242f239:     DW_TAG_formal_parameter
                      DW_AT_location        (indexed (0x14b) loclist = 0x005595e6:
                         [0xffffffff87a000f9, 0xffffffff87a00175): DW_OP_reg4 RSI
                         [0xffffffff87a00175, 0xffffffff87a001b8): DW_OP_reg3 RBX
                         [0xffffffff87a001b8, 0xffffffff87a001c7): DW_OP_entry_value(DW_OP_reg4 RSI), DW_OP_stack_value
                         [0xffffffff87a001c7, 0xffffffff87a00214): DW_OP_reg3 RBX)
                      DW_AT_name    ("end")
                      DW_AT_type    (0x0242decc "phys_addr_t")
                      ...
    0x0242f245:     DW_TAG_formal_parameter
                      DW_AT_location        (indexed (0x14c) loclist = 0x00559610:
                         [0xffffffff87a001e3, 0xffffffff87a001ef): DW_OP_breg4 RSI+0)
                      DW_AT_name    ("size")
                      DW_AT_type    (0x0242decc "phys_addr_t")
                      ...
    0x0242f250:     DW_TAG_formal_parameter
                      DW_AT_const_value     (4096)
                      DW_AT_name    ("align")
                      DW_AT_type    (0x0242decc "phys_addr_t")
                      ...

The third argument should correspond to RDX for x86_64. But the location suggests that
the parameter value is stored in the address with 'RSI + 0'. It is not clear whether
the parameter value is stored in RDX or not. So we have to discard this funciton in
vmlinux BTF to avoid incorrect true signatures.

For llvm, any function having
  DW_AT_calling_convention        (DW_CC_nocall)
in dwarf DW_TAG_subprogram will indicate that this function has signature changed.
But for non DW_CC_nocall functions, it is possible that true signature still not
available due to locations. So every functions will be checked.

I did experiment with latest bpf-next. For x86_64, there are 69103 kernel functions
and 875 kernel functions having signature changed. A series of patches are intended
to ensure true signatures are properly represented. Eventually, only 20 functions
cannot have true signatures due to locations.

For arm64, there are 863 kernel functions having signature changed, and
108 functions cannot have true signatures due to locations. I checked those
functions and look like llvm arm64 backend more relaxed to compute parameter
values.

For full testing, I enabled true signature support in kernel scripts/Makefile.btf like below:
  -pahole-flags-$(call test-ge, $(pahole-ver), 131) += --btf_features=attributes
  +pahole-flags-$(call test-ge, $(pahole-ver), 131) += --btf_features=attributes --btf_features=+true_signature

See individual patches for details.

Changelog:
  v9 -> v10:
    - v9: https://lore.kernel.org/bpf/20260624052553.3139112-1-yonghong.song@linux.dev/
    - Remove patch 'btf_encoder: Avoid comparing inlined and out-of-line function prototypes'
      as it caused btf_functions.sh failure.
  v8 -> v9:
    - v8: https://lore.kernel.org/bpf/20260623222850.3290612-1-yonghong.song@linux.dev/
    - v8 made a mistake where all functions go through main loop in
      function__analyze_parameter_locations(). This is not correct as it includes functions
      whose signature is not changed. See function
      function__match_clang_parameter_locations().
    - Add a change in btf_encoder to avoid comparing inlined and out-of-line function prototypes.
      This allows some functions to be in btf.
    - Add one more test, clang_parm_optimized_stack_2, where a global function, with
      many unused parameters, gets properly handling in location checking.
  v7 -> v8:
    - v7: https://lore.kernel.org/bpf/20260623040704.2732530-1-yonghong.song@linux.dev/
    - Both Check and Analyze phases go through all functions.
    - Require true_signature in btf_encoder to have name like <paramName>__<field>.
    - Remove signature_changed (from nocall). It is possible that function signatures
      are not changed but the location reigsters do not match.
  v6 -> v7:
    - v6: https://lore.kernel.org/bpf/20260618011358.632394-1-yonghong.song@linux.dev/
    - Ensure that 'collect' and 'analyze' have the same location checking.
    - In 'analyze' stage, undo true_sig_member_name if the next expected register
      matches the previous source type although the previous parameter may
      only use half of the value.
  v5 -> v6:
    - v5: https://lore.kernel.org/bpf/20260523165712.1225231-1-yonghong.song@linux.dev/
    - The previous change relies on parameter__new() to collect and analyze each
      parameter to decide true signatures. The new one separates collecting and
      analyzing phase from Alan. This two-phase makes logic easy to understand.
    - In btf_encoder.c, remove usage of skip_idx to simplify the code.
  v4 -> v5:
    - v4: https://lore.kernel.org/bpf/20260326013144.2901265-1-yonghong.song@linux.dev/
    - Check info.signature_changed only under clang.
    - Fix an uninitialized varable issue (var reg_dix) for gcc.
  v3 -> v4:
    - v3: https://lore.kernel.org/bpf/20260320190917.1970524-1-yonghong.song@linux.dev/
    - Add simple prescan of parameter registers in order to get true signatures
      for those functions where optimization could happen but compiler didn't do it.
    - Do not create a new name (e.g. "uattr__is_kernel") with malloc at parameter_reg()
      stage. Instead remember both "uattr" and "is_kernel" and later generate the
      name "uattr_is_kernel" in btf encoder.
    - Add comments to explain how to handle parameters which may take two registers.
    - Fix some test failures on aarch64.
  v2 -> v3:
    - v2: https://lore.kernel.org/bpf/20260309153215.1917033-1-yonghong.song@linux.dev/
    - Change tests by using newly added test_lib.sh.
    - Simplify to get bool variable producer_clang.
    - Try to avoid producer_clang appearance in dwarf_loader.c in order to avoid
      clear separation between clang and gcc.
  v1 -> v2:
    - v1: https://lore.kernel.org/bpf/20260305225455.1151066-1-yonghong.song@linux.dev/
    - Added producer_clang guarding in btf_encoder. Otherwise, gcc kernel build
      will crash pahole.
    - Fix an early return in parameter__reg() which didn't do pthread_mutex_unlock()
      which caused the deadlock for arm64.
    - Add a few more places to guard with producer_clang and conf->true_signature
      to maintain the previous behavior if not clang or conf->true_signature is false.

Yonghong Song (5):
  dwarf_loader: Detect aggregate ABI register usage and signature
    changes
  dwarf_loader: Collect per-parameter information
  dwarf_loader: Analyze per-parameter information for true signatures
  btf_encoder: Emit true function signatures
  tests: Add BTF true_signature encoding tests

 btf_encoder.c                         |  26 +-
 dwarf_loader.c                        | 603 +++++++++++++++++++++++---
 dwarves.h                             |  14 +
 tests/clang_parm_aggregate_1.sh       |  85 ++++
 tests/clang_parm_aggregate_2.sh       |  88 ++++
 tests/clang_parm_memory.sh            |  77 ++++
 tests/clang_parm_optimized.sh         |  63 +++
 tests/clang_parm_optimized_stack_1.sh |  63 +++
 tests/clang_parm_optimized_stack_2.sh |  63 +++
 9 files changed, 1017 insertions(+), 65 deletions(-)
 create mode 100755 tests/clang_parm_aggregate_1.sh
 create mode 100755 tests/clang_parm_aggregate_2.sh
 create mode 100755 tests/clang_parm_memory.sh
 create mode 100755 tests/clang_parm_optimized.sh
 create mode 100755 tests/clang_parm_optimized_stack_1.sh
 create mode 100755 tests/clang_parm_optimized_stack_2.sh

-- 
2.53.0-Meta


^ permalink raw reply

* [PATCH dwarves v10 1/5] dwarf_loader: Detect aggregate ABI register usage and signature changes
From: Yonghong Song @ 2026-06-25  2:01 UTC (permalink / raw)
  To: Alan Maguire, Arnaldo Carvalho de Melo, dwarves
  Cc: Alexei Starovoitov, Andrii Nakryiko, bpf, kernel-team
In-Reply-To: <20260625020148.1883082-1-yonghong.song@linux.dev>

Aggregate ABI register usage applies for both clang and gcc.
The signature change detection is clang only.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 dwarf_loader.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++----
 dwarves.h      |  3 +++
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/dwarf_loader.c b/dwarf_loader.c
index 8ce34cb..b967d31 100644
--- a/dwarf_loader.c
+++ b/dwarf_loader.c
@@ -1137,6 +1137,16 @@ static void arch__set_register_params(const GElf_Ehdr *ehdr, struct cu *cu)
 	}
 }
 
+static bool arch__agg_use_two_regs(const GElf_Ehdr *ehdr)
+{
+	switch (ehdr->e_machine) {
+	case EM_S390:
+		return false;
+	default:
+		return true;
+	}
+}
+
 static struct template_type_param *template_type_param__new(Dwarf_Die *die, struct cu *cu, struct conf_load *conf)
 {
 	struct template_type_param *ttparm = tag__alloc(cu, sizeof(*ttparm));
@@ -1539,6 +1549,29 @@ static struct ftype *ftype__new(Dwarf_Die *die, struct cu *cu)
 	return ftype;
 }
 
+static bool function__signature_changed(struct function *func, Dwarf_Die *die)
+{
+	/* The inlined DW_TAG_subprogram typically has the original source type for
+	 * abstract origin of a concrete function with address range, inlined subroutine,
+	 * or call site.
+	 */
+	if (func->inlined)
+		return false;
+
+	if (!func->abstract_origin)
+		return attr_numeric(die, DW_AT_calling_convention) == DW_CC_nocall;
+
+	Dwarf_Attribute attr;
+	if (dwarf_attr(die, DW_AT_abstract_origin, &attr)) {
+		Dwarf_Die origin;
+		if (dwarf_formref_die(&attr, &origin))
+			return attr_numeric(&origin, DW_AT_calling_convention) == DW_CC_nocall;
+	}
+
+	/* This should not happen */
+	return false;
+}
+
 static struct function *function__new(Dwarf_Die *die, struct cu *cu, struct conf_load *conf)
 {
 	struct function *func = tag__alloc(cu, sizeof(*func));
@@ -2491,10 +2524,17 @@ static struct tag *die__create_new_function(Dwarf_Die *die, struct cu *cu, struc
 {
 	struct function *function = function__new(die, cu, conf);
 
-	if (function != NULL &&
-	    die__process_function(die, &function->proto, &function->lexblock, cu, conf) != 0) {
-		function__delete(function, cu);
-		function = NULL;
+	if (function != NULL) {
+		/* For clang, we determine if function signature changes via DW_AT_calling_convention
+		 * set to DW_CC_nocall.
+		 */
+		if (cu->producer_clang)
+			function->proto.signature_changed = function__signature_changed(function, die);
+
+		if (die__process_function(die, &function->proto, &function->lexblock, cu, conf) != 0) {
+			function__delete(function, cu);
+			function = NULL;
+		}
 	}
 
 	if (function != NULL &&
@@ -3154,6 +3194,17 @@ static unsigned long long dwarf_tag__orig_id(const struct tag *tag,
 	return cu->extra_dbg_info ? dtag->id : 0;
 }
 
+static bool attr_producer_clang(Dwarf_Die *die)
+{
+	const char *producer;
+
+	producer = attr_string(die, DW_AT_producer, NULL);
+	if (!producer)
+		return false;
+
+	return !!strstr(producer, "clang");
+}
+
 struct debug_fmt_ops dwarf__ops;
 
 static int die__process(Dwarf_Die *die, struct cu *cu, struct conf_load *conf)
@@ -3191,6 +3242,7 @@ static int die__process(Dwarf_Die *die, struct cu *cu, struct conf_load *conf)
 	}
 
 	cu->language = attr_numeric(die, DW_AT_language);
+	cu->producer_clang = attr_producer_clang(die);
 
 	if (conf->early_cu_filter)
 		cu = conf->early_cu_filter(cu);
@@ -3409,6 +3461,7 @@ static int cu__set_common(struct cu *cu, struct conf_load *conf,
 
 	cu->little_endian = ehdr.e_ident[EI_DATA] == ELFDATA2LSB;
 	cu->nr_register_params = arch__nr_register_params(&ehdr);
+	cu->agg_use_two_regs = arch__agg_use_two_regs(&ehdr);
 	arch__set_register_params(&ehdr, cu);
 	return 0;
 }
@@ -3950,6 +4003,7 @@ static int cus__merge_and_process_cu(struct cus *cus, struct conf_load *conf,
 			cu->priv = dcu;
 			cu->dfops = &dwarf__ops;
 			cu->language = attr_numeric(cu_die, DW_AT_language);
+			cu->producer_clang = attr_producer_clang(cu_die);
 			cus__add(cus, cu);
 		}
 
diff --git a/dwarves.h b/dwarves.h
index 75c311a..ac559c3 100644
--- a/dwarves.h
+++ b/dwarves.h
@@ -306,6 +306,8 @@ struct cu {
 	uint8_t		 has_addr_info:1;
 	uint8_t		 uses_global_strings:1;
 	uint8_t		 little_endian:1;
+	uint8_t		 producer_clang:1;
+	uint8_t		 agg_use_two_regs:1;	/* An aggregate like {long a; long b;} */
 	uint8_t		 nr_register_params;
 	int		 register_params[ARCH_MAX_REGISTER_PARAMS];
 	int		 functions_saved;
@@ -1030,6 +1032,7 @@ struct ftype {
 	uint8_t		 inconsistent_proto:1;
 	uint8_t		 uncertain_parm_loc:1;
 	uint8_t		 reordered_parm:1;
+	uint8_t		 signature_changed:1;
 	struct list_head template_type_params;
 	struct list_head template_value_params;
 	struct template_parameter_pack *template_parameter_pack;
-- 
2.53.0-Meta


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox