Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [RFC PATCH 2/2] selftests/landlock: Add test for TCP fast open
From: Mickaël Salaün @ 2026-06-26 21:19 UTC (permalink / raw)
  To: Matthieu Buffet
  Cc: Bryam Vargas, Günther Noack, linux-security-module,
	Mikhail Ivanov, Paul Moore, Eric Dumazet, Neal Cardwell,
	linux-kernel, netdev
In-Reply-To: <20260617180526.15627-3-matthieu@buffet.re>

On Wed, Jun 17, 2026 at 08:05:24PM +0200, Matthieu Buffet wrote:
> Enforce that TCP Fast Open is controlled by
> LANDLOCK_ACCESS_NET_CONNECT_TCP. Semantics of connect() and
> sendmsg(MSG_FASTOPEN) should be identical from Landlock's perspective.
> Also enforce error code consistency, since UDP sockets ignore
> the MSG_FASTOPEN flag while Unix sockets reject it.
> 
> Signed-off-by: Matthieu Buffet <matthieu@buffet.re>
> ---
>  tools/testing/selftests/landlock/net_test.c | 155 ++++++++++++++++++++
>  1 file changed, 155 insertions(+)
> 
> diff --git a/tools/testing/selftests/landlock/net_test.c b/tools/testing/selftests/landlock/net_test.c
> index 0c256e7c8675..177ed28e70f6 100644
> --- a/tools/testing/selftests/landlock/net_test.c
> +++ b/tools/testing/selftests/landlock/net_test.c
> @@ -258,6 +258,64 @@ static int connect_variant(const int sock_fd,
>  	return connect_variant_addrlen(sock_fd, srv, get_addrlen(srv, false));
>  }
>  
> +static int sendto_variant_addrlen(const int sock_fd,
> +				  const struct service_fixture *const srv,
> +				  const socklen_t addrlen, void *buf,
> +				  size_t len, size_t flags)
> +{
> +	const struct sockaddr *dst = NULL;
> +	ssize_t ret;
> +
> +	/*
> +        * We never want our processes to be killed by SIGPIPE: we check return
> +        * codes and errno, so that we have actual error messages.
> +        */

There are some extra spaces above.

> +	flags |= MSG_NOSIGNAL;
> +
> +	if (srv != NULL) {

Just `if (srv) {`

> +		switch (srv->protocol.domain) {
> +		case AF_UNSPEC:
> +		case AF_INET:
> +			dst = (const struct sockaddr *)&srv->ipv4_addr;
> +			break;
> +
> +		case AF_INET6:
> +			dst = (const struct sockaddr *)&srv->ipv6_addr;
> +			break;
> +
> +		case AF_UNIX:
> +			dst = (const struct sockaddr *)&srv->unix_addr;
> +			break;
> +
> +		default:
> +			errno = EAFNOSUPPORT;
> +			return -errno;
> +		}
> +	}
> +
> +	ret = sendto(sock_fd, buf, len, flags, dst, addrlen);
> +	if (ret < 0)
> +		return -errno;
> +
> +	/* errno is not set in cases of partial writes. */
> +	if (ret != len)
> +		return -EINTR;
> +
> +	return 0;
> +}
> +
> +static int sendto_variant(const int sock_fd,
> +			  const struct service_fixture *const srv, void *buf,
> +			  size_t len, size_t flags)
> +{
> +	socklen_t addrlen = 0;
> +
> +	if (srv != NULL)

ditto

> +		addrlen = get_addrlen(srv, false);
> +
> +	return sendto_variant_addrlen(sock_fd, srv, addrlen, buf, len, flags);
> +}
> +
>  FIXTURE(protocol)
>  {
>  	struct service_fixture srv0, srv1, srv2, unspec_any0, unspec_srv0;
> @@ -950,6 +1008,103 @@ TEST_F(protocol, connect_unspec)
>  	EXPECT_EQ(0, close(bind_fd));
>  }
>  
> +TEST_F(protocol, tcp_fastopen)
> +{
> +	const bool restricted = variant->sandbox == TCP_SANDBOX &&
> +				variant->prot.type == SOCK_STREAM &&
> +				(variant->prot.protocol == IPPROTO_TCP ||
> +				 variant->prot.protocol == IPPROTO_IP) &&
> +				(variant->prot.domain == AF_INET ||
> +				 variant->prot.domain == AF_INET6);
> +	const struct landlock_ruleset_attr ruleset_attr = {
> +		.handled_access_net = LANDLOCK_ACCESS_NET_CONNECT_TCP,
> +	};
> +	int bind_fd, client_fd, status;
> +	char buf;
> +	pid_t child;
> +
> +	bind_fd = socket_variant(&self->srv0);
> +	ASSERT_LE(0, bind_fd);
> +	EXPECT_EQ(0, bind_variant(bind_fd, &self->srv0));
> +	if (self->srv0.protocol.type == SOCK_STREAM)
> +		EXPECT_EQ(0, listen(bind_fd, backlog));
> +
> +	child = fork();
> +	ASSERT_LE(0, child);
> +	if (child == 0) {
> +		int connect_fd, ret;
> +
> +		/* Closes listening socket for the child. */
> +		EXPECT_EQ(0, close(bind_fd));
> +
> +		connect_fd = socket_variant(&self->srv0);
> +		ASSERT_LE(0, connect_fd);
> +
> +		if (variant->sandbox == TCP_SANDBOX) {
> +			const int ruleset_fd = landlock_create_ruleset(
> +				&ruleset_attr, sizeof(ruleset_attr), 0);
> +			ASSERT_LE(0, ruleset_fd);
> +
> +			enforce_ruleset(_metadata, ruleset_fd);
> +			EXPECT_EQ(0, close(ruleset_fd));
> +		}
> +
> +		/* Fast Open with no address. */
> +		ret = sendto_variant(connect_fd, NULL, NULL, 0, MSG_FASTOPEN);


> +		if (self->srv0.protocol.domain == AF_UNIX) {
> +			ASSERT_EQ(-ENOTCONN, ret);
> +		} else if (self->srv0.protocol.type == SOCK_DGRAM) {
> +			ASSERT_EQ(-EDESTADDRREQ, ret);
> +		} else {
> +			ASSERT_EQ(-EINVAL, ret);
> +		}
> +
> +		/* Fast Open to a denied address. */
> +		ret = sendto_variant(connect_fd, &self->srv0, "A", 1,
> +				     MSG_FASTOPEN);
> +		if (restricted) {
> +			ASSERT_EQ(-EACCES, ret);
> +		} else if (self->srv0.protocol.domain == AF_UNIX &&
> +			   self->srv0.protocol.type == SOCK_STREAM) {
> +			ASSERT_EQ(-EOPNOTSUPP, ret);
> +		} else {
> +			ASSERT_EQ(0, ret);
> +		}

All these ret checks should be done with EXPECT_EQ because they don't
block the test itself and we can get more info by checking more after
that.

> +
> +		EXPECT_EQ(0, close(connect_fd));
> +		_exit(_metadata->exit_code);
> +		return;
> +	}
> +
> +	client_fd = bind_fd;
> +	if (!restricted && self->srv0.protocol.type == SOCK_STREAM &&
> +	    self->srv0.protocol.domain != AF_UNIX) {
> +		client_fd = accept(bind_fd, NULL, 0);
> +		ASSERT_LE(0, client_fd);
> +	}
> +
> +	if (restricted) {
> +		EXPECT_EQ(-1, read(client_fd, &buf, 1));
> +		EXPECT_EQ(ENOTCONN, errno);
> +	} else if (self->srv0.protocol.domain == AF_UNIX &&
> +		   self->srv0.protocol.type == SOCK_STREAM) {
> +		EXPECT_EQ(-1, read(client_fd, &buf, 1));
> +		EXPECT_EQ(EINVAL, errno);
> +	} else {
> +		EXPECT_EQ(1, read(client_fd, &buf, 1));
> +		EXPECT_EQ('A', buf);
> +	}
> +
> +	EXPECT_EQ(child, waitpid(child, &status, 0));
> +	EXPECT_EQ(1, WIFEXITED(status));
> +	EXPECT_EQ(EXIT_SUCCESS, WEXITSTATUS(status));
> +
> +	if (client_fd != bind_fd)
> +		EXPECT_LE(0, close(client_fd));
> +
> +	EXPECT_EQ(0, close(bind_fd));
> +}
> +
>  FIXTURE(ipv4)
>  {
>  	struct service_fixture srv0, srv1;
> -- 
> 2.47.3
> 
> 

^ permalink raw reply

* Re: [PATCH bpf v2 3/4] selftests/bpf: Adapt sockmap update error handling
From: bot+bpf-ci @ 2026-06-26 21:14 UTC (permalink / raw)
  To: mhal, edumazet, kuniyu, pabeni, willemb, john.fastabend, jakub,
	jiayuan.chen, davem, kuba, horms, ast, cong.wang, daniel, andrii,
	eddyz87, memxor, martin.lau, song, yonghong.song, jolsa, emil,
	shuah
  Cc: netdev, bpf, linux-kernel, linux-kselftest, mhal, ast, andrii,
	daniel, martin.lau, eddyz87, yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-3-7e7e201c951a@rbox.co>

[-- Attachment #1: Type: text/plain, Size: 1189 bytes --]

>     selftests/bpf: Adapt sockmap update error handling
>
>     Update sockmap_listen to accommodate the recent change in sockmap that
>     rejects unbound UDP sockets.
>
>     TCP: Reject unbound and bound (unless established or listening).
>     UDP: Accept only bound sockets.
>
>     Signed-off-by: Michal Luczaj <mhal@rbox.co>

This adjusts the test expectations that commit 0c48eefae712 ("sock_map:
Lift socket state restriction for datagram sockets") had introduced:
the per-sotype branching in test_insert_opened(), along with the matching
changes in test_insert_bound() and test_ops().

The companion kernel fix 77a34848a723 ("bpf, sockmap: Reject unhashed UDP
sockets on sockmap update") already carries:

  Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")

Should this selftest commit carry the same Fixes: tag?

  Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/28264556257

^ permalink raw reply

* Re: [PATCH bpf v2 4/4] selftests/bpf: Fail unbound UDP on sockmap update
From: Kuniyuki Iwashima @ 2026-06-26 21:03 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: Eric Dumazet, Paolo Abeni, Willem de Bruijn, John Fastabend,
	Jakub Sitnicki, Jiayuan Chen, David S. Miller, Jakub Kicinski,
	Simon Horman, Alexei Starovoitov, Cong Wang, Daniel Borkmann,
	Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Emil Tsalapatis, Shuah Khan, netdev, bpf, linux-kernel,
	linux-kselftest
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-4-7e7e201c951a@rbox.co>

On Fri, Jun 26, 2026 at 1:37 PM Michal Luczaj <mhal@rbox.co> wrote:
>
> sockmap now rejects unbound UDP sockets. Adjust test_maps.
>
> This effectively reverts commit c39aa2159974 ("bpf, selftests: Fix
> test_maps now that sockmap supports UDP").
>
> Signed-off-by: Michal Luczaj <mhal@rbox.co>
> ---
>  tools/testing/selftests/bpf/test_maps.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
> index c32da7bd8be2..81cd5d0d69c1 100644
> --- a/tools/testing/selftests/bpf/test_maps.c
> +++ b/tools/testing/selftests/bpf/test_maps.c
> @@ -759,12 +759,12 @@ static void test_sockmap(unsigned int tasks, void *data)
>                 goto out_sockmap;
>         }
>
> -       /* Test update with unsupported UDP socket */
> +       /* Test update with unsupported unbound UDP socket */
>         udp = socket(AF_INET, SOCK_DGRAM, 0);
>         i = 0;
>         err = bpf_map_update_elem(fd, &i, &udp, BPF_ANY);
> -       if (err) {
> -               printf("Failed socket update SOCK_DGRAM '%i:%i'\n",
> +       if (!err) {
> +               printf("Failed allowed unbound SOCK_DGRAM socket update '%i:%i'\n",

nit: Maybe s/Failed/Unexpectedly succeeded/ ?

If we want to avoid breakage, this patch needs to be squashed into
the fix patch, but that's discouraged in netdev; not sure about the bpf tree.

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

^ permalink raw reply

* Re: [PATCH bpf v2 3/4] selftests/bpf: Adapt sockmap update error handling
From: Kuniyuki Iwashima @ 2026-06-26 20:58 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: Eric Dumazet, Paolo Abeni, Willem de Bruijn, John Fastabend,
	Jakub Sitnicki, Jiayuan Chen, David S. Miller, Jakub Kicinski,
	Simon Horman, Alexei Starovoitov, Cong Wang, Daniel Borkmann,
	Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Emil Tsalapatis, Shuah Khan, netdev, bpf, linux-kernel,
	linux-kselftest
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-3-7e7e201c951a@rbox.co>

On Fri, Jun 26, 2026 at 1:37 PM Michal Luczaj <mhal@rbox.co> wrote:
>
> Update sockmap_listen to accommodate the recent change in sockmap that
> rejects unbound UDP sockets.
>
> TCP: Reject unbound and bound (unless established or listening).
> UDP: Accept only bound sockets.
>
> Signed-off-by: Michal Luczaj <mhal@rbox.co>
> ---
>  tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 17 +++++++++--------
>  1 file changed, 9 insertions(+), 8 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
> index cc0c68bab907..6ee1bc6b3b23 100644
> --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
> +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
> @@ -63,11 +63,8 @@ static void test_insert_opened(struct test_sockmap_listen *skel __always_unused,
>         errno = 0;
>         value = s;
>         err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
> -       if (sotype == SOCK_STREAM) {
> -               if (!err || errno != EOPNOTSUPP)
> -                       FAIL_ERRNO("map_update: expected EOPNOTSUPP");
> -       } else if (err)
> -               FAIL_ERRNO("map_update: expected success");

Initially I thought AF_UNIX still exercised this path but it was removed
in f3de1cf621f7.  The leftover in family_str() was a bit confusing, so please
follow up on bpf-next.

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

^ permalink raw reply

* Re: [PATCH bpf v2 2/4] selftests/bpf: Ensure UDP sockets are bound
From: Kuniyuki Iwashima @ 2026-06-26 20:47 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: Eric Dumazet, Paolo Abeni, Willem de Bruijn, John Fastabend,
	Jakub Sitnicki, Jiayuan Chen, David S. Miller, Jakub Kicinski,
	Simon Horman, Alexei Starovoitov, Cong Wang, Daniel Borkmann,
	Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Emil Tsalapatis, Shuah Khan, netdev, bpf, linux-kernel,
	linux-kselftest
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-2-7e7e201c951a@rbox.co>

On Fri, Jun 26, 2026 at 1:37 PM Michal Luczaj <mhal@rbox.co> wrote:
>
> Update sockmap_basic tests to bind sockets before they are used. This
> accommodates the recent change in sockmap that rejects unbound UDP sockets.
>
> Signed-off-by: Michal Luczaj <mhal@rbox.co>

nit: this should be patch 1.

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

^ permalink raw reply

* Re: [PATCH bpf v2 1/4] bpf, sockmap: Reject unhashed UDP sockets on sockmap update
From: Kuniyuki Iwashima @ 2026-06-26 20:45 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: Eric Dumazet, Paolo Abeni, Willem de Bruijn, John Fastabend,
	Jakub Sitnicki, Jiayuan Chen, David S. Miller, Jakub Kicinski,
	Simon Horman, Alexei Starovoitov, Cong Wang, Daniel Borkmann,
	Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Emil Tsalapatis, Shuah Khan, netdev, bpf, linux-kernel,
	linux-kselftest
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-1-7e7e201c951a@rbox.co>

On Fri, Jun 26, 2026 at 1:37 PM Michal Luczaj <mhal@rbox.co> wrote:
>
> UDP sockets get SOCK_RCU_FREE set when (auto-)bound. This means
> sk_is_refcounted(unbound) = true, while sk_is_refcounted(bound) = false.
>
> Because sockmap accepts unbound UDP sockets, a BPF program can increment a
> socket's refcount via lookup. If the socket is subsequently bound, the
> transition from unbound to bound causes bpf_sk_release() to skip the
> decrement of the refcount, causing a memory leak.
>
> unreferenced object 0xffff88810bc2eb40 (size 1984):
>   comm "test_progs", pid 2451, jiffies 4295320596
>   hex dump (first 32 bytes):
>     7f 00 00 01 7f 00 00 01 d2 04 1b b7 04 d2 00 00  ................
>     02 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
>   backtrace (crc bdee079d):
>     kmem_cache_alloc_noprof+0x557/0x660
>     sk_prot_alloc+0x69/0x240
>     sk_alloc+0x30/0x460
>     inet_create+0x2ce/0xf80
>     __sock_create+0x25b/0x5c0
>     __sys_socket+0x119/0x1d0
>     __x64_sys_socket+0x72/0xd0
>     do_syscall_64+0xa1/0x5f0
>     entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> Instead of special-casing for refcounted sockets, reject unhashed UDP
> sockets during sockmap updates, as there is no benefit to supporting those.
> This effectively reverts the commit under Fixes, with two exceptions:
>
> 1. sock_map_sk_state_allowed() maintains a fall-through `return true`.
> 2. In the spirit of commit b8b8315e39ff ("bpf, sockmap: Remove unhash
>    handler for BPF sockmap usage"), the proto::unhash BPF handler is not
>    reintroduced.
>
> Historical note: this issue is related to commit 67312adc96b5 ("bpf: reject
> unhashed sockets in bpf_sk_assign").
>
> Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")
> Suggested-by: Kuniyuki Iwashima <kuniyu@google.com>
> Signed-off-by: Michal Luczaj <mhal@rbox.co>

Looks good, thanks !

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>

^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf, sockmap: Don't leak UDP socks on lookup-bind-release
From: Michal Luczaj @ 2026-06-26 20:42 UTC (permalink / raw)
  To: Jakub Sitnicki, Kuniyuki Iwashima
  Cc: Willem de Bruijn, John Fastabend, Jiayuan Chen, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Alexei Starovoitov, Cong Wang, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau,
	Song Liu, Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan,
	netdev, bpf, linux-kernel, linux-kselftest
In-Reply-To: <87o6gyyjxk.fsf@cloudflare.com>

On 6/25/26 12:48, Jakub Sitnicki wrote:
> On Wed, Jun 24, 2026 at 02:39 PM -07, Kuniyuki Iwashima wrote:
>> On Wed, Jun 24, 2026 at 2:33 PM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>>> ...
>>> Setting SOCK_RCU_FREE itself should not cause a problem, but I think
>>> we should take a step back.
>>>
>>> AFAIU, 0c48eefae712 was to allow putting AF_UNIX SOCK_DGRAM sockets
>>> into sockmap, not to allow using unconnected UDP sockets in sk_lookup etc.
>>>
>>> Actually, v4 of the patch was implemented as such but did not get any feedback,
>>> https://lore.kernel.org/bpf/20210508220835.53801-9-xiyou.wangcong@gmail.com/#t
>>>
>>> ... and v5 (the final commit) somehow removed the restriction for unconnected
>>> UDP socket as well.
>>> https://lore.kernel.org/bpf/20210704190252.11866-3-xiyou.wangcong@gmail.com/
>>>
>>> Given the initial use case, sockmap redirect, is still blocked by
>>> TCP_ESTABLISHED
>>> check in sock_map_redirect_allowed(), I feel there is no point in supporting
>>> unconnected UDP sockets in sockmap.  It cannot get any skb from anywhere
>>> (without buggy sk_lookup).
>>
>> s/unconnected/unhashed/g :)
> 
> Rejecting unhashed UDP sockets on insert to sockmap SGTM.
> It is also in line with disable-problematic-cases strategy.

OK, here's v2 with the sock_map_sk_state_allowed() check reintroduced:
https://lore.kernel.org/bpf/20260626-sockmap-lookup-udp-leak-v2-0-7e7e201c951a@rbox.co/

Thanks,
Michal


^ permalink raw reply

* Re: [RFC PATCH 1/2] landlock: fix TCP Fast Open connection bypass
From: Mickaël Salaün @ 2026-06-26 20:40 UTC (permalink / raw)
  To: Matthieu Buffet
  Cc: Bryam Vargas, Günther Noack, linux-security-module,
	Mikhail Ivanov, Paul Moore, Eric Dumazet, Neal Cardwell,
	linux-kernel, netdev
In-Reply-To: <20260617180526.15627-2-matthieu@buffet.re>

Thanks Matthieu, could you please rebase this series on the master
branch (especially on top of your UDP changes)?

This patch will be useful for backports though.

On Wed, Jun 17, 2026 at 08:05:23PM +0200, Matthieu Buffet wrote:
> The documentation of the socket_connect() LSM hook states that it
> controls connecting a socket to a remote address. It has not been the
> case since the addition of TCP Fast Open (RFC 7413) support, which allows
> opening a TCP connection (thus, setting a socket's destination address)
> via the MSG_FASTOPEN flag passed to sendto()/sendmsg()/sendmmsg(). The
> problem then got duplicated into MPTCP.
> 
> Landlock did not take it into account when its TCP support was added,
> leaving a bypass of TCP connect policy.
> 
> Ideally a call to the LSM hook would be added in the fastopen code path,
> in order to fix this generically. But connect() hooks are designed to run
> with the socket locked, unlike sendmsg() hooks.
> 
> Closes: https://github.com/landlock-lsm/linux/issues/41
> Fixes: fff69fb03dde ("landlock: Support network rules with TCP bind and connect")
> Signed-off-by: Matthieu Buffet <matthieu@buffet.re>
> ---
>  security/landlock/net.c | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/security/landlock/net.c b/security/landlock/net.c
> index 4ee4002a8f56..a2375762c18b 100644
> --- a/security/landlock/net.c
> +++ b/security/landlock/net.c
> @@ -246,9 +246,26 @@ static int hook_socket_connect(struct socket *const sock,
>  					   access_request);
>  }
>  
> +static int hook_socket_sendmsg(struct socket *const sock,
> +			       struct msghdr *const msg, const int size)
> +{
> +	struct sockaddr *const address = msg->msg_name;
> +	const int addrlen = msg->msg_namelen;
> +
> +	if (sk_is_tcp(sock->sk) && address != NULL &&
> +	    (msg->msg_flags & MSG_FASTOPEN) != 0) {

This might be a bit better:

  if ((msg->msg_flags & MSG_FASTOPEN) && address && sk_is_tcp(sock->sk))

> +		return current_check_access_socket(
> +			sock, address, addrlen,
> +			LANDLOCK_ACCESS_NET_CONNECT_TCP);
> +	}
> +
> +	return 0;
> +}
> +
>  static struct security_hook_list landlock_hooks[] __ro_after_init = {
>  	LSM_HOOK_INIT(socket_bind, hook_socket_bind),
>  	LSM_HOOK_INIT(socket_connect, hook_socket_connect),
> +	LSM_HOOK_INIT(socket_sendmsg, hook_socket_sendmsg),
>  };
>  
>  __init void landlock_add_net_hooks(void)
> -- 
> 2.47.3
> 
> 

^ permalink raw reply

* [PATCH bpf v2 2/4] selftests/bpf: Ensure UDP sockets are bound
From: Michal Luczaj @ 2026-06-26 20:36 UTC (permalink / raw)
  To: Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	John Fastabend, Jakub Sitnicki, Jiayuan Chen, David S. Miller,
	Jakub Kicinski, Simon Horman, Alexei Starovoitov, Cong Wang,
	Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Martin KaFai Lau, Song Liu,
	Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan
  Cc: netdev, bpf, linux-kernel, linux-kselftest, Michal Luczaj
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-0-7e7e201c951a@rbox.co>

Update sockmap_basic tests to bind sockets before they are used. This
accommodates the recent change in sockmap that rejects unbound UDP sockets.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
 tools/testing/selftests/bpf/prog_tests/sockmap_basic.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index cb3229711f93..2d22a9058a8e 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -853,7 +853,7 @@ static void test_sockmap_many_socket(void)
 		return;
 	}
 
-	udp = xsocket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0);
+	udp = socket_loopback(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK);
 	if (udp < 0) {
 		close(dgram);
 		close(tcp);
@@ -922,7 +922,7 @@ static void test_sockmap_many_maps(void)
 		return;
 	}
 
-	udp = xsocket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0);
+	udp = socket_loopback(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK);
 	if (udp < 0) {
 		close(dgram);
 		close(tcp);
@@ -993,7 +993,7 @@ static void test_sockmap_same_sock(void)
 		return;
 	}
 
-	udp = xsocket(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, 0);
+	udp = socket_loopback(AF_INET, SOCK_DGRAM | SOCK_NONBLOCK);
 	if (udp < 0) {
 		close(dgram);
 		close(tcp);

-- 
2.54.0


^ permalink raw reply related

* [PATCH bpf v2 0/4] bpf, sockmap: Fix sockmap leaking UDP socks
From: Michal Luczaj @ 2026-06-26 20:36 UTC (permalink / raw)
  To: Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	John Fastabend, Jakub Sitnicki, Jiayuan Chen, David S. Miller,
	Jakub Kicinski, Simon Horman, Alexei Starovoitov, Cong Wang,
	Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Martin KaFai Lau, Song Liu,
	Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan
  Cc: netdev, bpf, linux-kernel, linux-kselftest, Michal Luczaj

Fix for UDP sockets getting leaked during sockmap lookup/release.
Accompanied by selftests updates.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
Changes in v2:
- selftest: drop the original, adapt old tests
- fix: change approach to rejecting unbound UDP [Kuniyuki]
- Link to v1: https://patch.msgid.link/20260623-sockmap-lookup-udp-leak-v1-0-05804f9308e4@rbox.co

---
Michal Luczaj (4):
      bpf, sockmap: Reject unhashed UDP sockets on sockmap update
      selftests/bpf: Ensure UDP sockets are bound
      selftests/bpf: Adapt sockmap update error handling
      selftests/bpf: Fail unbound UDP on sockmap update

 net/core/sock_map.c                                     |  2 ++
 tools/testing/selftests/bpf/prog_tests/sockmap_basic.c  |  6 +++---
 tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 17 +++++++++--------
 tools/testing/selftests/bpf/test_maps.c                 |  6 +++---
 4 files changed, 17 insertions(+), 14 deletions(-)
---
base-commit: 26490a375cb9be9bac96b5171610fd85ca6c2305
change-id: 20260617-sockmap-lookup-udp-leak-bc4e5c5481d7

Best regards,
--  
Michal Luczaj <mhal@rbox.co>


^ permalink raw reply

* [PATCH bpf v2 4/4] selftests/bpf: Fail unbound UDP on sockmap update
From: Michal Luczaj @ 2026-06-26 20:36 UTC (permalink / raw)
  To: Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	John Fastabend, Jakub Sitnicki, Jiayuan Chen, David S. Miller,
	Jakub Kicinski, Simon Horman, Alexei Starovoitov, Cong Wang,
	Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Martin KaFai Lau, Song Liu,
	Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan
  Cc: netdev, bpf, linux-kernel, linux-kselftest, Michal Luczaj
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-0-7e7e201c951a@rbox.co>

sockmap now rejects unbound UDP sockets. Adjust test_maps.

This effectively reverts commit c39aa2159974 ("bpf, selftests: Fix
test_maps now that sockmap supports UDP").

Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
 tools/testing/selftests/bpf/test_maps.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index c32da7bd8be2..81cd5d0d69c1 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -759,12 +759,12 @@ static void test_sockmap(unsigned int tasks, void *data)
 		goto out_sockmap;
 	}
 
-	/* Test update with unsupported UDP socket */
+	/* Test update with unsupported unbound UDP socket */
 	udp = socket(AF_INET, SOCK_DGRAM, 0);
 	i = 0;
 	err = bpf_map_update_elem(fd, &i, &udp, BPF_ANY);
-	if (err) {
-		printf("Failed socket update SOCK_DGRAM '%i:%i'\n",
+	if (!err) {
+		printf("Failed allowed unbound SOCK_DGRAM socket update '%i:%i'\n",
 		       i, udp);
 		goto out_sockmap;
 	}

-- 
2.54.0


^ permalink raw reply related

* [PATCH bpf v2 1/4] bpf, sockmap: Reject unhashed UDP sockets on sockmap update
From: Michal Luczaj @ 2026-06-26 20:36 UTC (permalink / raw)
  To: Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	John Fastabend, Jakub Sitnicki, Jiayuan Chen, David S. Miller,
	Jakub Kicinski, Simon Horman, Alexei Starovoitov, Cong Wang,
	Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Martin KaFai Lau, Song Liu,
	Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan
  Cc: netdev, bpf, linux-kernel, linux-kselftest, Michal Luczaj
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-0-7e7e201c951a@rbox.co>

UDP sockets get SOCK_RCU_FREE set when (auto-)bound. This means
sk_is_refcounted(unbound) = true, while sk_is_refcounted(bound) = false.

Because sockmap accepts unbound UDP sockets, a BPF program can increment a
socket's refcount via lookup. If the socket is subsequently bound, the
transition from unbound to bound causes bpf_sk_release() to skip the
decrement of the refcount, causing a memory leak.

unreferenced object 0xffff88810bc2eb40 (size 1984):
  comm "test_progs", pid 2451, jiffies 4295320596
  hex dump (first 32 bytes):
    7f 00 00 01 7f 00 00 01 d2 04 1b b7 04 d2 00 00  ................
    02 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
  backtrace (crc bdee079d):
    kmem_cache_alloc_noprof+0x557/0x660
    sk_prot_alloc+0x69/0x240
    sk_alloc+0x30/0x460
    inet_create+0x2ce/0xf80
    __sock_create+0x25b/0x5c0
    __sys_socket+0x119/0x1d0
    __x64_sys_socket+0x72/0xd0
    do_syscall_64+0xa1/0x5f0
    entry_SYSCALL_64_after_hwframe+0x76/0x7e

Instead of special-casing for refcounted sockets, reject unhashed UDP
sockets during sockmap updates, as there is no benefit to supporting those.
This effectively reverts the commit under Fixes, with two exceptions:

1. sock_map_sk_state_allowed() maintains a fall-through `return true`.
2. In the spirit of commit b8b8315e39ff ("bpf, sockmap: Remove unhash
   handler for BPF sockmap usage"), the proto::unhash BPF handler is not
   reintroduced.

Historical note: this issue is related to commit 67312adc96b5 ("bpf: reject
unhashed sockets in bpf_sk_assign").

Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")
Suggested-by: Kuniyuki Iwashima <kuniyu@google.com>
Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
 net/core/sock_map.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index c60ba6d292f9..9efbd8ca7db8 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -542,6 +542,8 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
 {
 	if (sk_is_tcp(sk))
 		return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
+	if (sk_is_udp(sk))
+		return sk_hashed(sk);
 	if (sk_is_stream_unix(sk))
 		return (1 << READ_ONCE(sk->sk_state)) & TCPF_ESTABLISHED;
 	if (sk_is_vsock(sk) &&

-- 
2.54.0

^ permalink raw reply related

* [PATCH bpf v2 3/4] selftests/bpf: Adapt sockmap update error handling
From: Michal Luczaj @ 2026-06-26 20:36 UTC (permalink / raw)
  To: Eric Dumazet, Kuniyuki Iwashima, Paolo Abeni, Willem de Bruijn,
	John Fastabend, Jakub Sitnicki, Jiayuan Chen, David S. Miller,
	Jakub Kicinski, Simon Horman, Alexei Starovoitov, Cong Wang,
	Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Martin KaFai Lau, Song Liu,
	Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan
  Cc: netdev, bpf, linux-kernel, linux-kselftest, Michal Luczaj
In-Reply-To: <20260626-sockmap-lookup-udp-leak-v2-0-7e7e201c951a@rbox.co>

Update sockmap_listen to accommodate the recent change in sockmap that
rejects unbound UDP sockets.

TCP: Reject unbound and bound (unless established or listening).
UDP: Accept only bound sockets.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
 tools/testing/selftests/bpf/prog_tests/sockmap_listen.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
index cc0c68bab907..6ee1bc6b3b23 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -63,11 +63,8 @@ static void test_insert_opened(struct test_sockmap_listen *skel __always_unused,
 	errno = 0;
 	value = s;
 	err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
-	if (sotype == SOCK_STREAM) {
-		if (!err || errno != EOPNOTSUPP)
-			FAIL_ERRNO("map_update: expected EOPNOTSUPP");
-	} else if (err)
-		FAIL_ERRNO("map_update: expected success");
+	if (!err || errno != EOPNOTSUPP)
+		FAIL_ERRNO("map_update: expected EOPNOTSUPP");
 	xclose(s);
 }
 
@@ -93,8 +90,12 @@ static void test_insert_bound(struct test_sockmap_listen *skel __always_unused,
 	errno = 0;
 	value = s;
 	err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
-	if (!err || errno != EOPNOTSUPP)
-		FAIL_ERRNO("map_update: expected EOPNOTSUPP");
+	if (sotype == SOCK_STREAM) {
+		if (!err || errno != EOPNOTSUPP)
+			FAIL_ERRNO("map_update: expected EOPNOTSUPP");
+	} else if (err) {
+		FAIL_ERRNO("map_update: expected success");
+	}
 close:
 	xclose(s);
 }
@@ -1289,7 +1290,7 @@ static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map,
 		/* insert */
 		TEST(test_insert_invalid),
 		TEST(test_insert_opened),
-		TEST(test_insert_bound, SOCK_STREAM),
+		TEST(test_insert_bound),
 		TEST(test_insert),
 		/* delete */
 		TEST(test_delete_after_insert),

-- 
2.54.0


^ permalink raw reply related

* Re: [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
From: Ahmed, Aaron @ 2026-06-26 20:26 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: stable@vger.kernel.org, netdev@vger.kernel.org,
	ncardwell@google.com, edumazet@google.com, aws-binance-tam
In-Reply-To: <34F462A1-CEB9-4812-8E98-239E38585F14@amazon.com>

+CC: aws-binance-tam@amazon.com

On 6/19/26, 3:58 PM, "Ahmed, Aaron" <aarnahmd@amazon.com <mailto:aarnahmd@amazon.com>> wrote:

>Hi Kuniyuki,
>
>Sorry to keep asking, were you able to take a look at the updated reproducer? I've still been able to repro with the latest 6.18 LTS.
>
> Thanks,
> Aaron 








^ permalink raw reply

* RE: [PATCH] net: lan743x: Initialize eth_syslock spinlock before use
From: David Thompson @ 2026-06-26 19:50 UTC (permalink / raw)
  To: Andrea Righi, Bryan Whitehead, UNGLinuxDriver@microchip.com
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Raju Lakkaraju, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20260626163218.3591486-1-arighi@nvidia.com>

> -----Original Message-----
> From: Andrea Righi <arighi@nvidia.com>
> Sent: Friday, June 26, 2026 12:32 PM
> To: Bryan Whitehead <bryan.whitehead@microchip.com>;
> UNGLinuxDriver@microchip.com
> Cc: Andrew Lunn <andrew+netdev@lunn.ch>; David S . Miller
> <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>; Raju
> Lakkaraju <Raju.Lakkaraju@microchip.com>; David Thompson
> <davthompson@nvidia.com>; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org
> Subject: [PATCH] net: lan743x: Initialize eth_syslock spinlock before use
> 
> lan743x_hardware_init() calls pci11x1x_strap_get_status() during the PCI11x1x
> probe sequence. That helper acquires the Ethernet subsystem hardware lock
> via lan743x_hs_syslock_acquire(), which relies on
> adapter->eth_syslock_spinlock to serialize access.
> 
> The spinlock is currently initialized only after the strap status is read. With
> CONFIG_DEBUG_SPINLOCK enabled, taking the zero-initialized spinlock can
> trip the spinlock debug check.
> 
> Fix by initializing adapter->eth_syslock_spinlock before reading the strap status
> so the probe path never attempts to lock an uninitialized spinlock.
> 
> Fixes: 46b777ad9a8c ("net: lan743x: Add support to SGMII 1G and 2.5G")
> Cc: stable@vger.kernel.org # v6.0+
> Signed-off-by: Andrea Righi <arighi@nvidia.com>
> ---
>  drivers/net/ethernet/microchip/lan743x_main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/microchip/lan743x_main.c
> b/drivers/net/ethernet/microchip/lan743x_main.c
> index 1cdce35e14239..e759171bfd766 100644
> --- a/drivers/net/ethernet/microchip/lan743x_main.c
> +++ b/drivers/net/ethernet/microchip/lan743x_main.c
> @@ -3541,8 +3541,8 @@ static int lan743x_hardware_init(struct
> lan743x_adapter *adapter,
>  		adapter->max_tx_channels = PCI11X1X_MAX_TX_CHANNELS;
>  		adapter->used_tx_channels = PCI11X1X_USED_TX_CHANNELS;
>  		adapter->max_vector_count =
> PCI11X1X_MAX_VECTOR_COUNT;
> -		pci11x1x_strap_get_status(adapter);
>  		spin_lock_init(&adapter->eth_syslock_spinlock);
> +		pci11x1x_strap_get_status(adapter);
>  		mutex_init(&adapter->sgmii_rw_lock);
>  		pci11x1x_set_rfe_rd_fifo_threshold(adapter);
>  		sgmii_ctl = lan743x_csr_read(adapter, SGMII_CTL);
> --
> 2.54.0

Reviewed-by: David Thompson <davthompson@nvidia.com>


^ permalink raw reply

* RE: the confusing 10000base_CR. Shouldn't it be 10000_SFI_DA?
From: D H, Siddaraju @ 2026-06-26 19:19 UTC (permalink / raw)
  To: Andrew Lunn, Maxime Chevallier
  Cc: netdev@vger.kernel.org, Das, Shubham, Chintalapalle, Balaji,
	Srinivasan, Vijay
In-Reply-To: <2baa5a77-3305-4ed9-b56d-ba27cecc1911@lunn.ch>

Sure, thanks for pointing them out, Andrew; will follow.
Now I realized what you meant there, thank you for the quick feedback.

About options,
Ok, got it: "option-(a): renaming *10000baseCR*" is out.
  Sure, will support this from uAPI backward-compatibility point-of-view.

  Just to highlight Maxime, yes during exploration, we too came across
  those few vendor products. But when we looked further to understand
  which standard those 10GBaseCR cables were following, we found they all
  explicitly call out that it's SFP+ DA conforming to SFF-8431.

What about
"option-(b): create a new enum ETHTOOL_LINK_MODE_10G_SFI_DA_Full_BIT"?
  Idea is just to create a new enum, with same enum value of 10000baseCR.
  This will NOT consume a bit position in "ethtool_link_mode_bit_indices".
  It just helps those tech-savvy people, who do not accept 10000baseCR
  and prefer 10000sfiDA for being explicit.

At worst case, hope we agree for
"option-(c): ethtool.8 man page help strings to indicate 10G_SFI_DA"
  Something like
    "10000baseCR (10G_SFI_DA    SFF-8431 SFP+ DA)"
  under "advertise" mask values.

- Thank you,
Siddaraju D H

^ permalink raw reply

* Re: [PATCH bpf-next v2 01/15] bpf: Remove __rcu tagging in st_link->map
From: Eduard Zingerman @ 2026-06-26 18:52 UTC (permalink / raw)
  To: Amery Hung, bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, memxor, martin.lau,
	shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	kernel-team
In-Reply-To: <20260623175006.3136053-2-ameryhung@gmail.com>

On Tue, 2026-06-23 at 10:49 -0700, Amery Hung wrote:
> From: Martin KaFai Lau <martin.lau@kernel.org>
> 
> st_link->map is always written under update_mutex. The paths that read
> st_link->map with rcu_read_lock() are not in the fast path, so they can
> simply take update_mutex instead. Remove the __rcu annotation and replace
> all RCU accessors with direct pointer reads under update_mutex. Use
> READ_ONCE() in bpf_struct_ops_map_link_poll() which reads the pointer
> without holding update_mutex.
> 
> It is a simplification change.
> 
> Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
> Signed-off-by: Amery Hung <ameryhung@gmail.com>
> ---

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>

[...]

^ permalink raw reply

* Re: the confusing 10000base_CR. Shouldn't it be 10000_SFI_DA?
From: Andrew Lunn @ 2026-06-26 18:18 UTC (permalink / raw)
  To: D H, Siddaraju
  Cc: netdev@vger.kernel.org, Das, Shubham, Chintalapalle, Balaji,
	Srinivasan, Vijay
In-Reply-To: <MW4PR11MB6912BE7FEDE5C509B6C5D5DF9AEB2@MW4PR11MB6912.namprd11.prod.outlook.com>

On Fri, Jun 26, 2026 at 06:12:15PM +0000, D H, Siddaraju wrote:
> Sorry Andrew, carry-forwarded IEEE's PMD naming convention of separating media "-CR" :D. Will stick to Ethtool :).
> I was referring to "ETHTOOL_LINK_MODE_10000baseCR_Full_BIT" and its associated documentation, Andrew.

Please don't top post. And set your mail client to wrap to around 75
characters. All standard netiquette things.

It is a good idea to be specific when asking questions. Asking about
something which does not exist in the kernel just makes it a guessing
game.

As Maxime pointed out, this is uAPI, so cannot be changed.

	Andrew

^ permalink raw reply

* RE: the confusing 10000base_CR. Shouldn't it be 10000_SFI_DA?
From: D H, Siddaraju @ 2026-06-26 18:12 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: netdev@vger.kernel.org, Das, Shubham, Chintalapalle, Balaji,
	Srinivasan, Vijay
In-Reply-To: <2994edfc-0894-49c9-abac-6daea5e12075@lunn.ch>

Sorry Andrew, carry-forwarded IEEE's PMD naming convention of separating media "-CR" :D. Will stick to Ethtool :).
I was referring to "ETHTOOL_LINK_MODE_10000baseCR_Full_BIT" and its associated documentation, Andrew.

- Thank you,
Siddaraju D H

-----Original Message-----
From: Andrew Lunn <andrew@lunn.ch> 
Sent: Friday, June 26, 2026 8:02 PM
To: D H, Siddaraju <siddaraju.dh@intel.com>
Cc: netdev@vger.kernel.org; Das, Shubham <shubham.das@intel.com>; Chintalapalle, Balaji <balaji.chintalapalle@intel.com>; Srinivasan, Vijay <vijay.srinivasan@intel.com>
Subject: Re: the confusing 10000base_CR. Shouldn't it be 10000_SFI_DA?

On Fri, Jun 26, 2026 at 02:15:27PM +0000, D H, Siddaraju wrote:
> Hi Linux Ethernet Team,
> 
> We explored Ethtool's "10000base_CR" PMD PHY type and we think it 
> might be just a wrong name.

~/linux$ grep -r 10000Base_CR
~/linux$ 

~/ethtool$ grep -r 10000Base_CR
~/ethtool$ 

What exactly do you mean by 10000Base_CR?

     Andrew

^ permalink raw reply

* Re: [PATCH bpf-next v2] bpf, unix: Guard sk_msg-dependent code behind CONFIG_NET_SOCK_MSG
From: John Fastabend @ 2026-06-26 18:04 UTC (permalink / raw)
  To: Jakub Sitnicki
  Cc: Alexei Starovoitov, Jiayuan Chen, Amery Hung, Kuniyuki Iwashima,
	bpf, Alexei Starovoitov, Daniel Borkmann, Jakub Kicinski,
	Network Development, kernel-team
In-Reply-To: <87cxxey09d.fsf@cloudflare.com>

On Thu, Jun 25, 2026 at 07:53:50PM +0200, Jakub Sitnicki wrote:
>On Wed, Jun 24, 2026 at 01:57 PM -07, Alexei Starovoitov wrote:
>> On Tue Jun 23, 2026 at 6:32 PM PDT, Jiayuan Chen wrote:
>>>
>>> Hi Alexei and Jakub,
>>>
>>> skmsg is actually still pretty useful for gateways.
>>> I started with bpf by integrating skmsg into nginx as a module and envoy
>>> has something similar.
>>> The usual setup is cgroup/sk for L4 bypass (reject SYN), and skmsg for
>>> L7, redirecting
>>> between local apps by looking at the payload. So there are real users.

Interesting.

>>
>> ...
>>
>>> Agree, just like we remove skmsg from KTLS which is rarely used.
>>
>> ...
>>
>>> Hope not have skmsg disabled by default.
>>
>> I wasn't suggesting to delete the whole skmsg,
>> but to disable combinations that are causing issues.
>> Like what was done for skmsg and ktls.
>> I'd allow plain tcp and udp sockets only.
>> Allowing unix sockets was fishy. I think we should reject it too.
>
>For unix & vsock we know Bytedance built a proxy using it.
>We've been showcasing it as one of sockmap use cases [1].
>That said, I don't know if it's still being used or not.
>
>If we don't want to go through the config-knob-then-deprecate process,
>then I guess the only option is to kill it and see if anyone complains.
>
>[1] Slide 117, https://github.com/sockmap-project/sockmap-project/blob/810d259af6e7a5793922af3991c9dc7ff502fe19/talks/2024-09%20-%20NDC%20TechTown%20-%20Splicing%20Sockets%20with%20SOCKMAP.pdf

Most of the bugs I'm seeing are combinations of push/pop/pull/tail/head
calls over sk_msg scatter gather list no one ever considered. Or at
least I never considered. Add in additional socket combinations that
came later and we get bugs.

Do the nginx/envoy offloads manipulate the scatter gather list as
well?

Do we need all these helpers? At some point I thought I was going to
build a real kernel proxy with this, but never did it. Another option
would be a better test framework.

.John

^ permalink raw reply

* Re: [mellanox/mlx5-next RFC 1/1] net/mlx5: RX, Fix refcount warning on frag page release
From: Nabil S. Alramli @ 2026-06-26 18:02 UTC (permalink / raw)
  To: Dragos Tatulea, saeedm, tariqt, mbloch
  Cc: nalramli, leon, andrew+netdev, davem, edumazet, kuba, pabeni,
	netdev, linux-rdma, linux-kernel
In-Reply-To: <9f150145-d95c-4a90-a358-5b33ab78a8ef@nvidia.com>

On 6/26/26 09:12, Dragos Tatulea wrote:
> 
> 
> On 25.06.26 19:40, Nabil S. Alramli wrote:
>> Under memory pressure, mlx5 driver has WARNING during fragmented page
>> release. This happens because there is a discrepancy between what mlx5
>> thinks the page fragment counter is vs what the page_pool actually says it
>> is.
>>
> The mlx5 frag counter is not the same as pp_ref_count. The page gets
> split into 64 parts during page allocation. The frag counter tracks how
> many of those frags have been used.
> 

Thank you for explaining that to me. I thought it was the same because in
mlx5e_page_release_fragmented, drain_count is computed from frag_page->frags
and then passed into page_pool_unref_page as the nr parameter which is then
compared to the refcount in the page_pool.

>> The cause of the issue is page allocations on concurrent cpus, which
>> increment the non-atomic u16 page counter mlx5e_frag_page.frags, while at
>> the same time the page reference counter net_iov.pp_ref_count is atomically
>> incremented. That sometimes leads to a difference in the counts and
>> therefore triggers the warning in page_pool_unref_netmem:
>>
> page_pool page allocations must not happen in parallel on different CPUs.
> Each queue has its own page_pool and allocation happens within the NAPI of
> that queue which sticks to a single CPU. The release path does support
> releasing on another CPU (release to ring).
> 
> How did you encounter this scenario of having parallel allocations on
> different CPUs from the same page_pool?
> 

Perhaps we didn't, I had assumed that the numbers must match. What we did
encounter is these WARNINGs, and they seemed to go away with this patch but
maybe it is a coincidence.

>> ```
>> 	ret = atomic_long_sub_return(nr, pp_ref_count);
>> 	WARN_ON(ret < 0);
>> ```
>>
>> The actual stack trace looks like this:
>>
>> ```
>> WARNING: CPU: 37 PID: 447795 at include/net/page_pool/helpers.h:277 mlx5e_page_release_fragmented.isra.0+0x51/0x60 [mlx5_core]
>> Tainted: [S]=CPU_OUT_OF_SPEC, [O]=OOT_MODULE
>> Hardware name: *
>> RIP: 0010:mlx5e_page_release_fragmented.isra.0+0x51/0x60 [mlx5_core]
>> RSP: 0018:ffffc90019814d98 EFLAGS: 00010293
>> RAX: 000000000000003f RBX: ffff88c0993d0a10 RCX: ffffea02424592c0
>> RDX: 0000000000000001 RSI: ffffea02424592c0 RDI: ffff88c090e20000
>> RBP: 000000000000000a R08: 0000000000001409 R09: 0000000000000006
>> R10: 0000000000000000 R11: ffff88c095fbc040 R12: 000000000000141f
>> R13: 0000000000000009 R14: ffff88c090e20000 R15: 0000000000000001
>> FS:  00007f34149fa6c0(0000) GS:ffff89200fa40000(0000) knlGS:0000000000000000
>> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> CR2: 00007ed0265eb000 CR3: 0000005091cbe000 CR4: 0000000000350ef0
>> Call Trace:
>>  <IRQ>
>>  mlx5e_free_rx_wqes+0x7b/0xa0 [mlx5_core]
>>  mlx5e_post_rx_wqes+0x1ac/0x5a0 [mlx5_core]
>>  mlx5e_napi_poll+0x5e5/0x6f0 [mlx5_core]
>>  __napi_poll+0x2b/0x1a0
>>  net_rx_action+0x30e/0x370
>>  ? sched_clock+0x9/0x10
>>  ? sched_clock_cpu+0xf/0x170
>>  handle_softirqs+0xe2/0x2a0
>>  common_interrupt+0x85/0xa0
>>  </IRQ>
>>  <TASK>
>>  asm_common_interrupt+0x26/0x40
>> RIP: 0010:page_counter_uncharge+0x34/0x90
>> RSP: 0018:ffffc900e728bb00 EFLAGS: 00000213
>> RAX: ffff88aff4762000 RBX: ffff88aff4762100 RCX: 0000000000000304
>> RDX: 0000000000000001 RSI: 00000000004e9e1a RDI: ffff88aff4762100
>> RBP: 0000000000000001 R08: ffff891ea0560048 R09: 00007ffffffff000
>> R10: 0000000000001000 R11: ffff891ae8061b00 R12: ffffffffffffffff
>> R13: ffff89107fcfd4c0 R14: ffff891ae8061b00 R15: ffff892002fe1400
>>  uncharge_batch+0x40/0xd0
>> ```
>>
> Can you provide more data on how you reproduced this? This helps to
> narrow down the bug. Reproduction steps would be ideal.
> 

I don't have clear steps to reproduce it, we just have seen it randomly on
some servers that were under memory pressure. I will try to look into it more
and find a way to reliably reproduce it. I agree that would be ideal to find a
proper fix.

>> The fix is to use an atomic page fragment counter, so it will always match
>> the number of references held in the page_pool.
>>
> This is not the right fix. The mlx5 page frag counter is not atomic
> on purpose because all changes to it happen only within the NAPI
> context.
> 

That was a question that I had, is it ever possible for frag_page->frags to be
incremented / set outside of NAPI context? I tried to answer that by looking
at code and by tracing it but could not get a clear picture. If it's not
possible then I agree, this is not the right fix.

> Thanks,
> Dragos

Thanks again for your guidance.

Nabil S. Alramli


^ permalink raw reply

* Re: [PATCH 2/2] net/sched: sch_multiq: Replace direct dequeue call with peek and qdisc_dequeue_peeked
From: Jamal Hadi Salim @ 2026-06-26 17:47 UTC (permalink / raw)
  To: Victor Nogueira
  Cc: hexlabsecurity, Vinicius Costa Gomes, Paolo Abeni, Jiri Pirko,
	Jakub Kicinski, David S. Miller, Eric Dumazet, Simon Horman,
	netdev, Jarek Poplawski, Vladimir Oltean, linux-kernel
In-Reply-To: <fc1b9963-2f69-4e47-bf59-6b668faedc0e@mojatatu.com>

On Fri, Jun 26, 2026 at 1:18 PM Victor Nogueira <victor@mojatatu.com> wrote:
>
> On 25/06/2026 06:51, Bryam Vargas via B4 Relay wrote:
> > From: Bryam Vargas <hexlabsecurity@proton.me>
> >
> > multiq_dequeue() takes a packet from a band's child with a direct
> > ->dequeue() call after multiq_peek() peeked it. When the child is
> > non-work-conserving the peek stashes the skb in the child's gso_skb, so
> > the direct dequeue returns a different skb and orphans the stash,
> > desyncing the child's qlen/backlog. With a qfq child reached through a
> > peeking parent (e.g. tbf) this re-enters the child on an emptied list and
> > dereferences NULL, panicking the kernel from softirq on ordinary egress.
> >
> > Take the packet through qdisc_dequeue_peeked(), as sch_prio already does
> > and as sch_red and sch_sfb were just fixed to do. The helper is a no-op
> > when the child has no stash, so a work-conserving child is unaffected.
> >
> > Fixes: 77be155cba4e ("pkt_sched: Add peek emulation for non-work-conserving qdiscs.")
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Bryam Vargas <hexlabsecurity@proton.me>
>
> Reviewed-by: Victor Nogueira <victor@mojatatu.com>

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>

cheers,
jamal

^ permalink raw reply

* Re: [PATCH 1/2] net/sched: sch_taprio: Replace direct dequeue call with peek and qdisc_dequeue_peeked
From: Jamal Hadi Salim @ 2026-06-26 17:46 UTC (permalink / raw)
  To: Victor Nogueira
  Cc: hexlabsecurity, Vinicius Costa Gomes, Paolo Abeni, Jiri Pirko,
	Jakub Kicinski, David S. Miller, Eric Dumazet, Simon Horman,
	netdev, Jarek Poplawski, Vladimir Oltean, linux-kernel
In-Reply-To: <5d57e4ce-0460-4ad2-9891-1bc7d24020d8@mojatatu.com>

On Fri, Jun 26, 2026 at 1:16 PM Victor Nogueira <victor@mojatatu.com> wrote:
>
> On 25/06/2026 06:51, Bryam Vargas via B4 Relay wrote:
> > From: Bryam Vargas <hexlabsecurity@proton.me>
> >
> > When taprio's software path peeks a non-work-conserving child qdisc, the
> > child stashes the peeked skb in its gso_skb; taprio_dequeue_from_txq()
> > then takes the packet with a direct child ->dequeue() call, which ignores
> > that stash, orphans the peeked skb and desyncs the child's qlen/backlog.
> > With a qfq child this re-enters the child on an emptied list and
> > dereferences NULL, panicking the kernel from softirq on ordinary egress.
> >
> > Take the packet through qdisc_dequeue_peeked(), as sch_red and sch_sfb
> > now do. The helper returns the child's stashed skb first and is a no-op
> > when there is none, so a work-conserving child is unaffected and the
> > gated path now consumes the skb whose length was charged to the budget.
> >
> > Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler")
> > Cc: stable@vger.kernel.org
> > Cc: Vladimir Oltean <vladimir.oltean@nxp.com>
> > Signed-off-by: Bryam Vargas <hexlabsecurity@proton.me>
>
> Reviewed-by: Victor Nogueira <victor@mojatatu.com>

Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>

cheers,
jamal

^ permalink raw reply

* Re: [BUG] netdevsim: KASAN slab-use-after-free in ref_tracker_free
From: Shuangpeng @ 2026-06-26 17:41 UTC (permalink / raw)
  To: saeed bishara
  Cc: netdev, Jakub Kicinski, Andrew Lunn, David S. Miller,
	Eric Dumazet, Paolo Abeni, Simon Horman, linux-kernel
In-Reply-To: <CAHfVqdXqZkrp5Dm0a8BJU0Tbc_T09A=e02PenwxrZFDMQrEmyQ@mail.gmail.com>



> On Jun 16, 2026, at 05:34, saeed bishara <saeed.bishara.os@gmail.com> wrote:
> 
> I tried gemini, here its analysis and a fix suggestion:
> 
> This is a brilliant, subtle bug. Let's break this down with a rigorous
> audit of the Linux network device refcounting architecture to see
> exactly why this Use-After-Free is occurring.
> The root cause is a classic "Reverse-Teardown Fallacy" colliding with
> the kernel's dual-refcount lifecycle for network devices (pcpu_refcnt
> vs. kobject refcount).
> 
> The Dual-Lifecycle Trap
> In modern kernel networking, a struct net_device is kept alive by two
> distinct mechanisms that must be carefully orchestrated:
> dev->pcpu_refcnt (Operational Lifecycle): Managed by dev_hold() and
> dev_put(). This tracks active operational references. The
> unregister_netdevice sequence blocks in netdev_run_todo() waiting for
> this to hit zero before it invokes free_netdev().
> dev->dev.kobj (Memory Lifecycle): The underlying memory structure
> manager. free_netdev() drops a reference to this. When the kobject
> refcount hits 0, device_release() is synchronously triggered, and the
> kmalloc-cg-8k slab is actually freed.
> 
> The Exact Race Sequence
> Look at the teardown order in the error path of sysfs_rtnl_lock():
> 
> unbreak:
>    sysfs_unbreak_active_protection(kn);  // LINE 121
>    dev_put(ndev);                        // LINE 122
> 
> Normally, developers write error paths in the exact reverse order of
> initialization. However, in this specific case, adhering to that
> convention is fatal.
> The Setup: sysfs_rtnl_lock starts by calling dev_hold(ndev),
> incrementing the pcpu_refcnt. It then calls
> sysfs_break_active_protection(), which explicitly calls kobject_get(),
> elevating the kobj memory refcount.
> The Unregister Race: In parallel, another CPU initiates device
> teardown (e.g., echo 1 > del_device on netdevsim). The sysfs entries
> are deleted, and device_del() drops its primary kobj reference. The
> teardown thread hits netdev_run_todo() and blocks, waiting for
> sysfs_rtnl_lock to release its dev_hold.
> The Lock Fails: sysfs_rtnl_lock fails to get the rtnl lock and jumps to unbreak.
> The Fatal Drop (Line 121): sysfs_unbreak_active_protection(kn)
> executes. It calls kobject_put(). Because the unregistration path
> already dropped the standard kobject references, this is the absolute
> last kobject reference. device_release() is synchronously invoked, and
> the net_device memory is instantly freed to the slab.
> The KASAN Trigger (Line 122): dev_put(ndev) executes. Under the hood,
> it attempts to call netdev_tracker_free(&ndev->ref_tracker, ...). It
> reads from the ndev structure that was just destroyed microseconds
> earlier. KASAN screams.
> 
> To fix this, the teardown convention must be deliberately violated to
> respect the underlying memory dependencies.
> 

Thanks for the analysis and patch suggestion.

I tested the patch together with the local delay/debug patch that was
used to trigger the original report.  I rebuilt the kernel, booted a
fresh VM, and reran the same netdevsim reproducer.

With the patched kernel, the original KASAN UAF was not reproduced in
this run:

  BUG: KASAN: slab-use-after-free in ref_tracker_free

However, the same rx-0 create_dir failure path was still reached, and
the run still produced a ref tracker warning:

  kobject_add_internal create_dir failed for rx-0 ...
  WARNING: lib/ref_tracker.c:295 at ref_tracker_free
  sysfs_rtnl_lock
  phys_port_name_show


> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index a1b2c3d4e5f6..7f8e9d0c1b2a 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -118,8 +118,8 @@ static int sysfs_rtnl_lock(struct kobject *kobj,
> struct attribute *attr,
>  return 0;
> 
> unbreak:
> - sysfs_unbreak_active_protection(kn);
>  dev_put(ndev);
> + sysfs_unbreak_active_protection(kn);
>  return ret;
> }
> 
> 
> On Mon, Jun 15, 2026 at 4:18 AM Shuangpeng Bai
> <shuangpeng.kernel@gmail.com> wrote:
>> 
>> Hi netdev maintainers,
>> 
>> I hit the following KASAN report while testing an upstream kernel.
>> 
>> The issue was reproduced with netdevsim. I have not confirmed whether this is
>> specific to netdevsim or whether other net devices can trigger a similar issue.
>> 
>> The KASAN report shows a slab-use-after-free in ref_tracker_free(), reached from
>> sysfs_rtnl_lock() while reading phys_port_name.
>> 
>> I reproduced this on commit: e8c2f9fdadee7cbc75134dc463c1e0d856d6e5c7 (May 25 2026)
>> 
>> To help trigger the bug more reliably, we applied a minimal diagnostic patch
>> that only adds delays and print statements.
>> 
>> The reproducer and .config files are here.
>> https://gist.github.com/shuangpengbai/b49765d646ec4610917015371aa1c3ca
>> 
>> I'm happy to test debug patches or provide additional information.
>> 
>> Reported-by: Shuangpeng Bai <shuangpeng.kernel@gmail.com>
>> 
>> [ 3145.449971][T17497] BUG: KASAN: slab-use-after-free in ref_tracker_free (lib/ref_tracker.c:295)
>> [ 3145.452089][T17497] Read of size 1 at addr ffff888107678598 by task cat/17497
>> [ 3145.454439][T17497]
>> [ 3145.454977][T17497] Tainted: [W]=WARN
>> [ 3145.454980][T17497] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
>> [ 3145.454985][T17497] Call Trace:
>> [ 3145.454991][T17497]  <TASK>
>> [ 3145.454994][T17497]  dump_stack_lvl (lib/dump_stack.c:94 lib/dump_stack.c:120)
>> [ 3145.455002][T17497]  print_report (mm/kasan/report.c:378 mm/kasan/report.c:482)
>> [ 3145.455028][T17497]  kasan_report (mm/kasan/report.c:595)
>> [ 3145.455046][T17497]  ref_tracker_free (lib/ref_tracker.c:295)
>> [ 3145.455083][T17497]  sysfs_rtnl_lock (include/linux/netdevice.h:4491 include/linux/netdevice.h:4508 include/linux/netdevice.h:4534 net/core/net-sysfs.c:122)
>> [ 3145.455091][T17497]  phys_port_name_show (net/core/net-sysfs.c:665)
>> [ 3145.455118][T17497]  dev_attr_show (drivers/base/core.c:2421)
>> [ 3145.455128][T17497]  sysfs_kf_seq_show (fs/sysfs/file.c:65)
>> [ 3145.455135][T17497]  seq_read_iter (fs/seq_file.c:231)
>> [ 3145.455144][T17497]  vfs_read (fs/read_write.c:493 fs/read_write.c:574)
>> [ 3145.455169][T17497]  ksys_read (fs/read_write.c:717)
>> [ 3145.455181][T17497]  do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94)
>> [ 3145.455188][T17497]  entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:121)
>> [ 3145.455193][T17497] RIP: 0033:0x7fcf098c43ce
>> [ 3145.455200][T17497] Code: c0 e9 b6 fe ff ff 50 48 8d 3d 6e 08 0b 00 e8 69 01 02 00 66 0f 1f 84 00 00 00 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 5a c3 66 0f 1f 84 00 00 00 00 00 48 83 ec 28
>> [ 3145.455204][T17497] RSP: 002b:00007ffd05e76b98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
>> [ 3145.455211][T17497] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fcf098c43ce
>> [ 3145.455214][T17497] RDX: 0000000000020000 RSI: 00007fcf095e4000 RDI: 0000000000000003
>> [ 3145.455217][T17497] RBP: 00007fcf095e4000 R08: 00007fcf095e3010 R09: 0000000000000000
>> [ 3145.455219][T17497] R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000
>> [ 3145.455222][T17497] R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000
>> [ 3145.455227][T17497]  </TASK>
>> [ 3145.455229][T17497]
>> [ 3145.479014][T17497] Freed by task 17497 on cpu 0 at 3145.447575s:
>> [ 3145.479559][T17497]  kasan_save_track (mm/kasan/common.c:57 mm/kasan/common.c:78)
>> [ 3145.479963][T17497]  kasan_save_free_info (mm/kasan/generic.c:584)
>> [ 3145.480411][T17497]  __kasan_slab_free (mm/kasan/common.c:253 mm/kasan/common.c:285)
>> [ 3145.480813][T17497]  kfree (include/linux/kasan.h:235 mm/slub.c:2689 mm/slub.c:6251 mm/slub.c:6566)
>> [ 3145.481148][T17497]  device_release (drivers/base/core.c:2542)
>> [ 3145.481567][T17497]  kobject_put (lib/kobject.c:689 lib/kobject.c:720 include/linux/kref.h:65 lib/kobject.c:737)
>> [ 3145.481951][T17497]  sysfs_rtnl_lock (net/core/net-sysfs.c:121)
>> [ 3145.482351][T17497]  phys_port_name_show (net/core/net-sysfs.c:665)
>> [ 3145.482782][T17497]  dev_attr_show (drivers/base/core.c:2421)
>> [ 3145.483154][T17497]  sysfs_kf_seq_show (fs/sysfs/file.c:65)
>> [ 3145.483586][T17497]  seq_read_iter (fs/seq_file.c:231)
>> [ 3145.483975][T17497]  vfs_read (fs/read_write.c:493 fs/read_write.c:574)
>> [ 3145.484334][T17497]  ksys_read (fs/read_write.c:717)
>> [ 3145.484701][T17497]  do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94)
>> [ 3145.485092][T17497]  entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:121)
>> [ 3145.485592][T17497]
>> [ 3145.485794][T17497] The buggy address belongs to the object at ffff888107678000
>> [ 3145.485794][T17497]  which belongs to the cache kmalloc-cg-8k of size 8192
>> [ 3145.486991][T17497] The buggy address is located 1432 bytes inside of
>> [ 3145.486991][T17497]  freed 8192-byte region [ffff888107678000, ffff88810767a000)
>> [ 3145.488159][T17497]
>> [ 3145.488367][T17497] The buggy address belongs to the physical page:
>> 
>> 
>> Best,
>> Shuangpeng
>> 


^ permalink raw reply

* Re: [PATCH net v2 1/1] net: sched: ets: avoid deficit wrap and bound empty dequeue rounds
From: Yuan Tan @ 2026-06-26 17:34 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Ren Wei, netdev, jiri, davem, petrm, yifanwucs, tomapufckgml,
	zcliangcn, bird, bronzed_45_vested
In-Reply-To: <CAM0EoM=FvNKemH4Z5dL_pFQZcGWxLb4c_AMahOePLY7s6Pkhcg@mail.gmail.com>

On Fri, Jun 26, 2026 at 2:54 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> On Fri, Jun 26, 2026 at 4:32 AM Ren Wei <n05ec@lzu.edu.cn> wrote:
> >
> > From: Wyatt Feng <bronzed_45_vested@icloud.com>
> >
> > ETS keeps each DRR-style deficit in a u32 and replenishes it with
> > the configured quantum whenever the head packet is too large. Both
> > the quantum and qdisc_pkt_len() are user-controlled inputs: a large
> > quantum can wrap the deficit counter, while a tiny quantum combined
> > with an inflated qdisc_pkt_len() can force billions of iterations in
> > softirq context before any packet becomes eligible.
> >
> > Store the deficit in u64 so replenishment cannot wrap the counter.
> > This keeps the existing dequeue logic unchanged while fixing the
> > overflow condition.
> >
> > Bound one dequeue attempt to at most nbands * 2 ETS rotations, as
> > suggested in review. This avoids the livelock without adding heavier
> > logic to the fast path.
> >
> > Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc")
> > Cc: stable@vger.kernel.org
> > Reported-by: Yuan Tan <yuantan098@gmail.com>
> > Reported-by: Yifan Wu <yifanwucs@gmail.com>
> > Reported-by: Juefei Pu <tomapufckgml@gmail.com>
> > Reported-by: Zhengchuan Liang <zcliangcn@gmail.com>
> > Reported-by: Xin Liu <bird@lzu.edu.cn>
> > Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com>
> > Assisted-by: Codex:GPT-5.4
> > Signed-off-by: Wyatt Feng <bronzed_45_vested@icloud.com>
> > Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
>
> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
>
> Note, you did not Cc many maintainers. Next time, or if you have to
> resend this patch make sure you Cc the stakeholders (see
> scripts/get_maintainer.pl)

Thank you very much for your advice. We were previously concerned
about bothering too many maintainers, so we used the pattern-depth=1
option when running scripts/get_maintainer.pl. We’ll relax this
parameter a bit in future submissions.

>
> cheers,
> jamal
>
> > ---
> > changes in v2:
> >   - Instead of doing a div() in the fast path, simply bound the loop per
> >     dequeue
> >   - v1 Link: https://lore.kernel.org/all/20260615103759.2404228-2-n05ec@lzu.edu.cn/
> >
> >
> >  net/sched/sch_ets.c | 6 +++++-
> >  1 file changed, 5 insertions(+), 1 deletion(-)
> >
> > diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c
> > index cb8cf437ce87..12a156ccb0a6 100644
> > --- a/net/sched/sch_ets.c
> > +++ b/net/sched/sch_ets.c
> > @@ -40,7 +40,7 @@ struct ets_class {
> >         struct list_head alist; /* In struct ets_sched.active. */
> >         struct Qdisc *qdisc;
> >         u32 quantum;
> > -       u32 deficit;
> > +       u64 deficit;
> >         struct gnet_stats_basic_sync bstats;
> >         struct gnet_stats_queue qstats;
> >  };
> > @@ -463,6 +463,8 @@ ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb)
> >  static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
> >  {
> >         struct ets_sched *q = qdisc_priv(sch);
> > +       unsigned int max_loops = READ_ONCE(q->nbands) * 2;
> > +       unsigned int loops = 0;
> >         struct ets_class *cl;
> >         struct sk_buff *skb;
> >         unsigned int band;
> > @@ -499,6 +501,8 @@ static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch)
> >
> >                 cl->deficit += READ_ONCE(cl->quantum);
> >                 list_move_tail(&cl->alist, &q->active);
> > +               if (++loops > max_loops)
> > +                       goto out;
> >         }
> >  out:
> >         return NULL;
> > --
> > 2.47.3
> >

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox