Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next v4 2/2] selftests/bpf: Add test to verify the fix for bpf_setsockopt() helper
From: Leon Hwang @ 2026-06-13 16:24 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau,
	Song Liu, Yonghong Song, Jiri Olsa, Emil Tsalapatis,
	John Fastabend, Stanislav Fomichev, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Shuah Khan, Leon Hwang, Ihor Solodrai, netdev, linux-kernel,
	linux-kselftest, kernel-patches-bot
In-Reply-To: <20260613162443.60515-1-leon.hwang@linux.dev>

Verify the fix by:

1. Attach a cgroup sockops prog.
2. Establish a TCP connection to an IPv4 address through an IPv6 socket.
3. Verify the return value of the bpf_setsockopt() helper.

Assisted-by: Codex:gpt-5.5-xhigh
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 .../selftests/bpf/prog_tests/setget_sockopt.c | 78 +++++++++++++++++++
 .../selftests/bpf/progs/setget_sockopt.c      | 23 ++++++
 2 files changed, 101 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
index 77fe1bfb7504..4e91d9b615ce 100644
--- a/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
+++ b/tools/testing/selftests/bpf/prog_tests/setget_sockopt.c
@@ -199,6 +199,83 @@ static void test_nonstandard_opt(int family)
 	bpf_link__destroy(getsockopt_link);
 }
 
+static int connect_to_v4mapped_v6_fd(int server_fd)
+{
+	struct sockaddr_storage addr;
+	struct sockaddr_in *addr4 = (void *)&addr;
+	socklen_t addrlen = sizeof(addr);
+	struct sockaddr_in6 addr6 = {};
+	int fd = -1, v6only = 0, err;
+
+	err = getsockname(server_fd, (struct sockaddr *)&addr, &addrlen);
+	if (!ASSERT_OK(err, "getsockname"))
+		return -1;
+
+	fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (!ASSERT_GE(fd, 0, "socket"))
+		return -1;
+
+	err = settimeo(fd, 0);
+	if (!ASSERT_OK(err, "settimeo"))
+		goto err_out;
+
+	err = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof(v6only));
+	if (!ASSERT_OK(err, "clear_v6only"))
+		goto err_out;
+
+	addr6.sin6_family = AF_INET6;
+	addr6.sin6_port = addr4->sin_port;
+	addr6.sin6_addr.s6_addr[10] = 0xff;
+	addr6.sin6_addr.s6_addr[11] = 0xff;
+	memcpy(&addr6.sin6_addr.s6_addr[12], &addr4->sin_addr, sizeof(addr4->sin_addr));
+
+	err = connect(fd, (struct sockaddr *)&addr6, sizeof(addr6));
+	if (!ASSERT_OK(err, "connect"))
+		goto err_out;
+
+	return fd;
+
+err_out:
+	close(fd);
+	return -1;
+}
+
+static void test_v4mapped_v6_ip_tos(void)
+{
+	struct setget_sockopt__bss *bss = skel->bss;
+	int sfd = -1, fd = -1, got = 0, exp = 0x1c;
+	socklen_t optlen;
+
+	memset(bss, 0, sizeof(*bss));
+	bss->v4mapped_v6_ip_tos_enable = 1;
+	bss->v4mapped_v6_ip_tos_ret = -1;
+	bss->v4mapped_v6_ip_tos_val = exp;
+
+	sfd = start_server(AF_INET, SOCK_STREAM, addr4_str, 0, 0);
+	if (!ASSERT_GE(sfd, 0, "start_server"))
+		goto err_out;
+
+	fd = connect_to_v4mapped_v6_fd(sfd);
+	if (!ASSERT_GE(fd, 0, "connect_to_v4mapped_v6_fd"))
+		goto err_out;
+
+	ASSERT_GT(bss->v4mapped_v6_ip_tos_cnt, 0, "v4mapped_v6_ip_tos_cnt");
+	ASSERT_EQ(bss->v4mapped_v6_ip_tos_ret, 0, "v4mapped_v6_ip_tos_ret");
+
+	optlen = sizeof(got);
+	if (!ASSERT_OK(getsockopt(fd, SOL_IP, IP_TOS, &got, &optlen), "getsockopt_ip_tos"))
+		goto err_out;
+
+	ASSERT_EQ(got, exp, "ip_tos");
+
+err_out:
+	bss->v4mapped_v6_ip_tos_enable = 0;
+	if (fd >= 0)
+		close(fd);
+	if (sfd >= 0)
+		close(sfd);
+}
+
 void test_setget_sockopt(void)
 {
 	cg_fd = test__join_cgroup(CG_NAME);
@@ -238,6 +315,7 @@ void test_setget_sockopt(void)
 	test_ktls(AF_INET);
 	test_nonstandard_opt(AF_INET);
 	test_nonstandard_opt(AF_INET6);
+	test_v4mapped_v6_ip_tos();
 
 done:
 	setget_sockopt__destroy(skel);
diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c
index d330b1511979..636a7cd8e2fa 100644
--- a/tools/testing/selftests/bpf/progs/setget_sockopt.c
+++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c
@@ -387,6 +387,24 @@ int _getsockopt(struct bpf_sockopt *ctx)
 	return 1;
 }
 
+int v4mapped_v6_ip_tos_enable;
+int v4mapped_v6_ip_tos_ret;
+int v4mapped_v6_ip_tos_cnt;
+int v4mapped_v6_ip_tos_val;
+
+static void test_v4mapped_v6_ip_tos(struct bpf_sock_ops *skops)
+{
+	int tos = v4mapped_v6_ip_tos_val;
+
+	if (!v4mapped_v6_ip_tos_enable || skops->op != BPF_SOCK_OPS_TCP_CONNECT_CB)
+		return;
+	if (skops->family != AF_INET6)
+		return;
+
+	v4mapped_v6_ip_tos_cnt++;
+	v4mapped_v6_ip_tos_ret = bpf_setsockopt(skops, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
+}
+
 SEC("sockops")
 int skops_sockopt(struct bpf_sock_ops *skops)
 {
@@ -401,6 +419,11 @@ int skops_sockopt(struct bpf_sock_ops *skops)
 	if (!sk)
 		return 1;
 
+	if (v4mapped_v6_ip_tos_enable) {
+		test_v4mapped_v6_ip_tos(skops);
+		return 1;
+	}
+
 	switch (skops->op) {
 	case BPF_SOCK_OPS_TCP_LISTEN_CB:
 		nr_listen += !(bpf_test_sockopt(skops, sk) ||
-- 
2.54.0


^ permalink raw reply related

* [PATCH bpf-next v4 0/2] bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket
From: Leon Hwang @ 2026-06-13 16:24 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau,
	Song Liu, Yonghong Song, Jiri Olsa, Emil Tsalapatis,
	John Fastabend, Stanislav Fomichev, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Shuah Khan, Leon Hwang, Ihor Solodrai, netdev, linux-kernel,
	linux-kselftest, kernel-patches-bot

When TCP runs over IPv4 via the INET6 API, sk->sk_family is AF_INET6, but
the packets are IPv4. inet_csk(sk)->icsk_af_ops is ipv6_mapped and uses
ip_queue_xmit. The tos sockopt does not work for the
bpf_[get,set]sockopt() helpers in this case.

Changelog:
v3 -> v4:
* Add 'sk->sk_type != SOCK_RAW && !ipv6_only_sock(sk)' check.
* Re-implement test with LLM assistance.
* v3: https://lore.kernel.org/all/20240914103226.71109-1-zhoufeng.zf@bytedance.com/

v2->v3:
* Use sk_is_inet() helper. (Eric Dumazet)
* https://lore.kernel.org/bpf/CANn89i+9GmBLCdgsfH=WWe-tyFYpiO27wONyxaxiU6aOBC6G8g@mail.gmail.com/T/

v1->v2:
* Fix compilation error. (kernel test robot)
* https://lore.kernel.org/bpf/202408152058.YXAnhLgZ-lkp@intel.com/T/

Leon Hwang (2):
  bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket
  selftests/bpf: Add test to verify the fix for bpf_setsockopt() helper

 net/core/filter.c                             | 15 +++-
 .../selftests/bpf/prog_tests/setget_sockopt.c | 78 +++++++++++++++++++
 .../selftests/bpf/progs/setget_sockopt.c      | 23 ++++++
 3 files changed, 115 insertions(+), 1 deletion(-)

-- 
2.54.0


^ permalink raw reply

* [PATCH bpf-next v4 1/2] bpf: Fix bpf_get/setsockopt to tos for ipv4-mapped ipv6 socket
From: Leon Hwang @ 2026-06-13 16:24 UTC (permalink / raw)
  To: bpf
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau,
	Song Liu, Yonghong Song, Jiri Olsa, Emil Tsalapatis,
	John Fastabend, Stanislav Fomichev, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Shuah Khan, Leon Hwang, Ihor Solodrai, netdev, linux-kernel,
	linux-kselftest, kernel-patches-bot, Feng Zhou
In-Reply-To: <20260613162443.60515-1-leon.hwang@linux.dev>

When TCP runs over IPv4 via the INET6 API, bpf_get/setsockopt with ipv4
will fail, because sk->sk_family is AF_INET6. With ipv6 it will succeed
but not take effect, because inet_csk(sk)->icsk_af_ops is ipv6_mapped
and uses ip_queue_xmit, which takes the tos from inet_sk(sk)->tos.

To relax this restriction, allow getting/setting tos for those possible
ipv4-mapped ipv6 sockets.

Fixes: ee7f1e1302f5 ("bpf: Change bpf_setsockopt(SOL_IP) to reuse do_ip_setsockopt()")
Signed-off-by: Feng Zhou <zhoufeng.zf@bytedance.com>
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 net/core/filter.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 9590877b0714..57b00c6cc8cc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5544,11 +5544,24 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 				 KERNEL_SOCKPTR(optval), *optlen);
 }
 
+static bool sk_allows_sol_ip_sockopt(struct sock *sk)
+{
+	switch (sk->sk_family) {
+	case AF_INET:
+		return true;
+	case AF_INET6:
+		/* Allow getting/setting sockopt for possible ipv4-mapped ipv6 socket. */
+		return sk->sk_type != SOCK_RAW && !ipv6_only_sock(sk);
+	default:
+		return false;
+	}
+}
+
 static int sol_ip_sockopt(struct sock *sk, int optname,
 			  char *optval, int *optlen,
 			  bool getopt)
 {
-	if (sk->sk_family != AF_INET)
+	if (!sk_allows_sol_ip_sockopt(sk))
 		return -EINVAL;
 
 	switch (optname) {
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH net v2 0/2] ip_tunnel: fix PMTU ICMP reply routing
From: Jakub Kicinski @ 2026-06-13 16:23 UTC (permalink / raw)
  To: Laika Price
  Cc: David Ahern, Ido Schimmel, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Shuah Khan, netdev, linux-kernel,
	linux-kselftest
In-Reply-To: <CAL=tPgjhj0+8voK40ZPdsKyQ0Pn4vwnSg-JVqRK3qRSXLLB4Kw@mail.gmail.com>

On Sat, 13 Jun 2026 16:38:27 +0100 Laika Price wrote:
> Disregard v2 of this series.
> 
> Apologies, I'm new to kernel development and did not realise that I should
> squash commits that would cause the kernel to not build / fail tests. I am
> sending in a v3 with these squashed.
> 
> Sorry for the noise.

I'm not sure what build failure you're talking about.
Please observe the 24h cooldown between submitting new versions 
of a patch.

^ permalink raw reply

* Re: [PATCH net-next V2] selftests: drv-net: Test queue stall upon reconfig
From: Jakub Kicinski @ 2026-06-13 16:20 UTC (permalink / raw)
  To: Mohsin Bashir
  Cc: netdev, andrew+netdev, davem, edumazet, pabeni, shuah,
	linux-kselftest
In-Reply-To: <20260613014855.1717712-1-mohsin.bashr@gmail.com>

On Fri, 12 Jun 2026 18:48:54 -0700 Mohsin Bashir wrote:
> From: Mohsin Bashir <hmohsin@meta.com>
> 
> Add a reconfig_tx_stall test that detects the possibility of a TX stall
> after ring reconfiguration. The key observation is that drivers using
> netif_tx_start_all_queues() are prone to experiencing a stall when
> reconfiguration completes compared to drivers using
> netif_tx_wake_all_queues(). start_all_queues only clears DRV_XOFF, while
> wake_all_queues also calls __netif_schedule() to kick the qdisc. Without
> the kick, qdisc backlog present at reconfig time can stay stuck until a
> new trigger is issued.
> 
> The test caps the TX ring at 64 entries so it fills quickly, then
> installs FQ on a target TX queue and sends UDP packets with SO_TXTIME
> scheduled in the future. With napi_defer_hard_irqs slowing completions,
> the small ring can fill when FQ releases the burst, leaving requeued
> qdisc backlog with no FQ timer to rescue it. A subsequent ring reconfig
> must wake the queues to drain the backlog. Simply starting the queues can
> leave it stuck.
> 
> On host with problematic driver:
>  Sent 128 SO_TXTIME packets (+100ms)
>  Sent 128 SO_TXTIME packets (+200ms)
>  Backlog before reconfig: 52632 bytes
>  Check| At /root/ksft-net-drv/./drivers/net/ring_reconfig.py, ...
>  Check|     ksft_eq(0, backlog,
>  Check failed 0 != 52632 qdisc backlog stuck on queue 1 after ring reconfig
> not ok 3 ring_reconfig.reconfig_tx_stall
> 
> On host with fixed driver:
>  Sent 128 SO_TXTIME packets (+100ms)
>  Sent 128 SO_TXTIME packets (+200ms)
>  Backlog before reconfig: 76024 bytes
> ok 3 ring_reconfig.reconfig_tx_stall
> 
> Signed-off-by: Mohsin Bashir <hmohsin@meta.com>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>


pylint is not on board:

+tools/testing/selftests/drivers/net/ring_reconfig.py:169:37: W1514: Using open without explicitly specifying an encoding (unspecified-encoding)
+tools/testing/selftests/drivers/net/ring_reconfig.py:162:17: W0613: Unused argument 'cfg' (unused-argument)
+tools/testing/selftests/drivers/net/ring_reconfig.py:253:0: C0116: Missing function or method docstring (missing-function-docstring)

> diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config
> index 617de8aaf551..1ef07fae74c1 100644
> --- a/tools/testing/selftests/drivers/net/config
> +++ b/tools/testing/selftests/drivers/net/config
> @@ -4,6 +4,10 @@ CONFIG_DEBUG_INFO_BTF_MODULES=n
>  CONFIG_INET_PSP=y
>  CONFIG_IPV6=y
>  CONFIG_MACSEC=m
> +CONFIG_NET_ACT_SKBEDIT=m
> +CONFIG_NET_CLS_ACT=y
> +CONFIG_NET_CLS_FLOWER=m
> +CONFIG_NET_CLS_MATCHALL=m
>  CONFIG_NETCONSOLE=m
>  CONFIG_NETCONSOLE_DYNAMIC=y
>  CONFIG_NETCONSOLE_EXTENDED_LOG=y
> diff --git a/tools/testing/selftests/drivers/net/ring_reconfig.py b/tools/testing/selftests/drivers/net/ring_reconfig.py
> index f9530a8b0856..11491a0b7013 100755
> --- a/tools/testing/selftests/drivers/net/ring_reconfig.py
> +++ b/tools/testing/selftests/drivers/net/ring_reconfig.py
> @@ -5,10 +5,18 @@
>  Test channel and ring size configuration via ethtool (-L / -G).
>  """
>  
> +import socket
> +import struct
> +import time
> +
>  from lib.py import ksft_run, ksft_exit, ksft_pr
>  from lib.py import ksft_eq
> +from lib.py import KsftSkipEx
>  from lib.py import NetDrvEpEnv, EthtoolFamily, GenerateTraffic
> -from lib.py import defer, NlError
> +from lib.py import cmd, defer, rand_port, tc, NlError
> +
> +# Added in Python 3.13; fallback to 61 for x86/ARM/MIPS
> +SO_TXTIME = getattr(socket, "SO_TXTIME", 61)
>  
>  
>  def channels(cfg) -> None:
> @@ -151,6 +159,169 @@ def ringparam(cfg) -> None:
>          GenerateTraffic(cfg).wait_pkts_and_stop(10000)
>  
>  
> +def _write_sysfs(cfg, path, val):
> +    with open(path, "r", encoding="utf-8") as fp:
> +        orig_val = fp.read().strip()
> +    if str(val) == orig_val:
> +        return
> +    with open(path, "w", encoding="utf-8") as fp:
> +        fp.write(str(val))
> +    defer(lambda p=path, v=orig_val: open(p, "w").write(v))
> +
> +
> +def _get_mq_handle(cfg):
> +    qdiscs = tc(f"qdisc show dev {cfg.ifname}", json=True)
> +    for q in qdiscs:
> +        if q.get("kind") == "mq":
> +            return q["handle"]
> +    raise KsftSkipEx(f"no mq qdisc found on {cfg.ifname}")
> +
> +
> +def _get_qdisc_backlog(cfg, queue, mq_handle):
> +    qdiscs = tc(f"-s qdisc show dev {cfg.ifname}", json=True)
> +    target_parent = f"{mq_handle}{queue + 1:x}"
> +    for q in qdiscs:
> +        if q.get("parent", "") == target_parent:
> +            return q.get("backlog")
> +    return None
> +
> +
> +def _setup_fq_qdisc(cfg, mq_handle, port, target_queue, other_queue):
> +    mq_child_parent = f"{mq_handle}{target_queue + 1:x}"
> +
> +    # Save the original child qdisc to restore after test
> +    qdiscs = tc(f"qdisc show dev {cfg.ifname}", json=True)
> +    default_qdisc = cmd("sysctl -n net.core.default_qdisc").stdout.strip()
> +    orig_kind = default_qdisc
> +    for q in qdiscs:
> +        if q.get("parent", "") == mq_child_parent:
> +            orig_kind = q.get("kind", default_qdisc)
> +            break
> +    try:
> +        tc(f"qdisc replace dev {cfg.ifname} parent {mq_child_parent} fq")
> +    except Exception as exc:
> +        raise KsftSkipEx("fq not available (CONFIG_NET_SCH_FQ)") from exc
> +    defer(tc,
> +          f"qdisc replace dev {cfg.ifname} parent {mq_child_parent} {orig_kind}")
> +
> +    qdisc_j = tc(f"qdisc show dev {cfg.ifname}", json=True)
> +    has_clsact = any(q['kind'] == 'clsact' for q in qdisc_j)
> +    if not has_clsact:
> +        tc(f"qdisc add dev {cfg.ifname} clsact")
> +        defer(tc, f"qdisc del dev {cfg.ifname} clsact")
> +
> +    proto = "ipv6" if int(cfg.addr_ipver) == 6 else "ip"
> +    try:
> +        tc(f"filter add dev {cfg.ifname} egress protocol {proto} "
> +           f"pref 1 flower ip_proto udp dst_port {port} "
> +           f"action skbedit queue_mapping {target_queue}")
> +    except Exception as exc:
> +        raise KsftSkipEx("tc flower/act_skbedit not available") from exc
> +    defer(tc, f"filter del dev {cfg.ifname} egress pref 1")
> +
> +    tc(f"filter add dev {cfg.ifname} egress pref 100 "
> +       f"matchall action skbedit queue_mapping {other_queue}")
> +    defer(tc, f"filter del dev {cfg.ifname} egress pref 100")
> +
> +
> +def _create_sotxtime_socket(cfg):
> +    sock = socket.socket(socket.AF_INET6 if cfg.addr_ipver == "6"
> +                         else socket.AF_INET, socket.SOCK_DGRAM)
> +    try:
> +        sock.setsockopt(socket.SOL_SOCKET, SO_TXTIME, struct.pack("Ii", 1, 0))
> +    except OSError as exc:
> +        sock.close()
> +        raise KsftSkipEx("SO_TXTIME not supported") from exc
> +    sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE,
> +                    cfg.ifname.encode())
> +    return sock
> +
> +
> +def _send_sotxtime_burst(sock, addr, port, count, delay_ns, ipver):
> +    payload = b'\x00' * 1400
> +    txtime_ns = time.clock_gettime_ns(time.CLOCK_MONOTONIC) + delay_ns
> +
> +    ancdata = [(socket.SOL_SOCKET, SO_TXTIME, struct.pack("Q", txtime_ns))]
> +    if int(ipver) == 6:
> +        dest = (addr, port, 0, 0)
> +    else:
> +        dest = (addr, port)
> +    for _ in range(count):
> +        sock.sendmsg([payload], ancdata, 0, dest)
> +
> +
> +def reconfig_tx_stall(cfg) -> None:
> +    target_queue = 1
> +    other_queue = 0
> +
> +    ehdr = {'header': {'dev-index': cfg.ifindex}}
> +    chans = cfg.eth.channels_get(ehdr)
> +
> +    if 'combined-max' not in chans:
> +        raise KsftSkipEx("device does not support combined channels")
> +    if chans['combined-count'] < 2:
> +        raise KsftSkipEx("need at least 2 combined channels")
> +
> +    rings = cfg.eth.rings_get(ehdr)
> +    if 'rx' not in rings or 'tx' not in rings:
> +        raise KsftSkipEx("device does not expose rx/tx ring params")
> +    tx_cur = rings['tx']
> +    if tx_cur <= 64:
> +        raise KsftSkipEx("tx ring size already at minimum")
> +    defer(cfg.eth.rings_set, ehdr | {'tx': tx_cur})
> +
> +    tx_min = 64
> +    cfg.eth.rings_set(ehdr | {'tx': tx_min})
> +
> +    # Slow completions so the ring stays full after FQ releases packets
> +    napi_defer = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs"
> +    gro_timeout = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout"
> +    _write_sysfs(cfg, napi_defer, 100)
> +    _write_sysfs(cfg, gro_timeout, 1000000000)
> +
> +    mq_handle = _get_mq_handle(cfg)
> +    port = rand_port()
> +    _setup_fq_qdisc(cfg, mq_handle, port, target_queue, other_queue)
> +
> +    sock = _create_sotxtime_socket(cfg)
> +    defer(sock.close)
> +
> +    pkt_count = tx_min * 2
> +
> +    for delay_ms in [100, 200, 500]:
> +        delay_ns = delay_ms * 1_000_000
> +        _send_sotxtime_burst(sock, cfg.remote_addr, port, pkt_count,
> +                             delay_ns, cfg.addr_ipver)
> +        ksft_pr(f"Sent {pkt_count} SO_TXTIME packets (+{delay_ms}ms)")
> +        time.sleep(delay_ms / 1000 + 0.3)
> +
> +        backlog = _get_qdisc_backlog(cfg, target_queue, mq_handle)
> +        if backlog:
> +            break
> +    else:
> +        raise KsftSkipEx("failed to build qdisc backlog")
> +
> +    ksft_pr(f"Backlog before reconfig: {backlog} bytes")
> +
> +    # Trigger ring reconfig — driver should call wake, not just start
> +    cfg.eth.rings_set(ehdr | {'tx': tx_cur})
> +
> +    # Let completions proceed normally
> +    _write_sysfs(cfg, napi_defer, 0)
> +    _write_sysfs(cfg, gro_timeout, 0)
> +
> +    # Poll for backlog to drain
> +    for _ in range(100):
> +        backlog = _get_qdisc_backlog(cfg, target_queue, mq_handle)
> +        if not backlog:
> +            break
> +        time.sleep(0.1)
> +
> +    ksft_eq(0, backlog,
> +            comment=f"qdisc backlog stuck on queue {target_queue} "
> +                    f"after ring reconfig")
> +
> +
>  def main() -> None:
>      """ Ksft boiler plate main """
>  
> @@ -158,7 +329,8 @@ def main() -> None:

the NetDrvEpEnv() setup needs to ask for 2+ queues, otherwise this
fails in netdevsim mode:

# ok 3 ring_reconfig.reconfig_tx_stall # SKIP need at least 2 combined channels

>          cfg.eth = EthtoolFamily()
>  
>          ksft_run([channels,
> -                  ringparam],
> +                  ringparam,
> +                  reconfig_tx_stall],
>                   args=(cfg, ))
>      ksft_exit()
>  


^ permalink raw reply

* Re: [PATCH net-next v2 8/8] net: dsa: mt7530: implement port_change_conduit op
From: Daniel Golle @ 2026-06-13 16:09 UTC (permalink / raw)
  To: Chester A. Unal, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Matthias Brugger,
	AngeloGioacchino Del Regno, Russell King, netdev, linux-kernel,
	linux-arm-kernel, linux-mediatek
In-Reply-To: <8dd8cfe32bc8e38b92c49e30a6255090fb0998fb.1781312667.git.daniel@makrotopia.org>

On Sat, Jun 13, 2026 at 02:11:45AM +0100, Daniel Golle wrote:
> Allow changing the CPU port affinity of user ports at runtime via the
> IFLA_DSA_CONDUIT netlink attribute. This updates the port matrix to
> forward to the new CPU port instead of the old one.
> 
> Limit the operation to MT7531. There, trapped link-local frames follow
> the per-port affinity, as the MT7531_CPU_PMAP destination mask is
> further restricted by the port matrix. A conduit change is hence fully
> honoured by the hardware, for regular traffic as well as for trapped
> frames.
> 
> The MT7530 switch, including the variant embedded in the MT7621 SoC,
> instead traps frames to the single CPU port set in the CPU_PORT field
> of the MFC register, regardless of the affinity of the inbound user
> port. With user ports affine to different CPU ports there is no
> correct value for that field, so per-port CPU affinity cannot be fully
> implemented for trapped frames. Routing a WAN port via the second SoC
> GMAC is conventionally covered by the PHY muxing feature on these
> switches, which bypasses the switch fabric and does not involve a CPU
> port at all.
> 
> The switches on the MT7988, EN7581 and AN7583 SoCs only have a
> single CPU port, leaving no other conduit to change to.
> 
> Signed-off-by: Daniel Golle <daniel@makrotopia.org>

I forgot to include the previously received

Acked-by: Chester A. Unal <chester.a.unal@arinc9.com>

See also:

https://patchwork.kernel.org/comment/27003848/

https://lore.kernel.org/all/02ad5de0-ea6a-4267-8686-72e3f98fce4e@arinc9.com/

^ permalink raw reply

* [PATCH] net/mlx5: Fix wrong register access in mlx5_query_mtppse()
From: lirongqing @ 2026-06-13 15:36 UTC (permalink / raw)
  To: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, netdev, linux-rdma, linux-kernel
  Cc: Li RongQing

From: Li RongQing <lirongqing@baidu.com>

In mlx5_query_mtppse(), the result of mtppse_reg query should be read
from the output buffer 'out', not the input buffer 'in'. The function
currently reads event_arm and event_generation_mode from 'in', which
contains the uninitialized query parameters rather than the actual
register values.

Fix by reading from the correct buffer 'out'.

Fixes: f9a1ef720e9e ("net/mlx5: Add MTPPS and MTPPSE registers infrastructure")
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/port.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index ee8b976..2ab6a6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -921,8 +921,8 @@ int mlx5_query_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 *arm, u8 *mode)
 	if (err)
 		return err;
 
-	*arm = MLX5_GET(mtppse_reg, in, event_arm);
-	*mode = MLX5_GET(mtppse_reg, in, event_generation_mode);
+	*arm = MLX5_GET(mtppse_reg, out, event_arm);
+	*mode = MLX5_GET(mtppse_reg, out, event_generation_mode);
 
 	return err;
 }
-- 
2.9.4


^ permalink raw reply related

* [PATCH] net/mlx5: Free steering tag data on release
From: lirongqing @ 2026-06-13 15:37 UTC (permalink / raw)
  To: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, netdev, linux-rdma, linux-kernel
  Cc: Li RongQing

From: Li RongQing <lirongqing@baidu.com>

mlx5_st_alloc_index() allocates an mlx5_st_idx_data object for
each new steering tag table index and stores it in the xarray.
When the last user releases the index, mlx5_st_dealloc_index()
removes the entry from the xarray but does not free the backing
object, leaking memory.

Free idx_data after erasing the xarray entry once the refcount
reaches zero.

Fixes: 888a7776f4fb0 ("net/mlx5: Add support for device steering tag")
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lib/st.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
index 997be91..7cedc34 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c
@@ -175,6 +175,7 @@ int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
 
 	if (refcount_dec_and_test(&idx_data->usecount)) {
 		xa_erase(&st->idx_xa, st_index);
+		kfree(idx_data);
 		/* We leave PCI config space as was before, no mkey will refer to it */
 	}
 
-- 
2.9.4


^ permalink raw reply related

* [PATCH] net/mlx5: Fix L3 tunnel entropy refcount leak
From: lirongqing @ 2026-06-13 15:36 UTC (permalink / raw)
  To: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, netdev, linux-rdma, linux-kernel
  Cc: Li RongQing

From: Li RongQing <lirongqing@baidu.com>

mlx5_tun_entropy_refcount_inc() counts both VXLAN and L2-to-L3
tunnel reformat entries as entropy-enabling users. The matching
decrement path only handled VXLAN, leaving L2-to-L3 tunnel entries
counted after release.

Handle MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL in
mlx5_tun_entropy_refcount_dec() as well so the enabling entry
refcount remains balanced.

Fixes: f828ca6a2fb6 ("net/mlx5e: Add support for hw encapsulation of MPLS over UDP")
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c
index 4571c56..97f6097 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/port_tun.c
@@ -176,7 +176,8 @@ void mlx5_tun_entropy_refcount_dec(struct mlx5_tun_entropy *tun_entropy,
 				   int reformat_type)
 {
 	mutex_lock(&tun_entropy->lock);
-	if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN)
+	if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_VXLAN ||
+	    reformat_type == MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL)
 		tun_entropy->num_enabling_entries--;
 	else if (reformat_type == MLX5_REFORMAT_TYPE_L2_TO_NVGRE &&
 		 --tun_entropy->num_disabling_entries == 0)
-- 
2.9.4


^ permalink raw reply related

* [PATCH iproute2-next v2] ipaddress: add support for showing IPv4 devconf attributes
From: Fernando Fernandez Mancera @ 2026-06-13  6:57 UTC (permalink / raw)
  To: netdev
  Cc: dsahern, stephen, davem, edumazet, kuba, pabeni, horms,
	Fernando Fernandez Mancera

This patch introduces support for showing IPv4 devconf attributes in the
detailed output of an interface, e.g. "ip -d link show dev enp1s0".

Additionally, this refactors 'print_af_spec()' to sequentially process
both AF_INET and AF_INET6 attributes rather than returning early if
AF_INET6 is missing.

Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
---
v2: changed print_string to print_bool for boolean attributes
---
 ip/ipaddress.c | 239 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 199 insertions(+), 40 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 6017bc83..0dd2aa87 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -23,6 +23,7 @@
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
 #include <linux/if_infiniband.h>
+#include <linux/ip.h>
 #include <linux/sockios.h>
 #include <linux/net_namespace.h>
 
@@ -294,53 +295,211 @@ static void print_linktype(FILE *fp, struct rtattr *tb)
 	close_json_object();
 }
 
+static void print_inet(FILE *fp, struct rtattr *inet_attr)
+{
+	struct rtattr *tb[IFLA_INET_MAX + 1];
+
+	parse_rtattr_nested(tb, IFLA_INET_MAX, inet_attr);
+
+	if (tb[IFLA_INET_CONF]) {
+		int *conf = RTA_DATA(tb[IFLA_INET_CONF]);
+		int max_elements = RTA_PAYLOAD(tb[IFLA_INET_CONF]) / sizeof(int);
+
+		if (max_elements >= IPV4_DEVCONF_FORWARDING)
+			print_bool(PRINT_ANY, "forwarding", "forwarding %s ",
+				   conf[IPV4_DEVCONF_FORWARDING - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_MC_FORWARDING)
+			print_bool(PRINT_ANY, "mc_forwarding", "mc_forwarding %s ",
+				   conf[IPV4_DEVCONF_MC_FORWARDING - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_PROXY_ARP)
+			print_bool(PRINT_ANY, "proxy_arp", "proxy_arp %s ",
+				   conf[IPV4_DEVCONF_PROXY_ARP - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ACCEPT_REDIRECTS)
+			print_bool(PRINT_ANY, "accept_redirects",
+				   "accept_redirects %s ",
+				   conf[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_SECURE_REDIRECTS)
+			print_bool(PRINT_ANY, "secure_redirects",
+				   "secure_redirects %s ",
+				   conf[IPV4_DEVCONF_SECURE_REDIRECTS - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_SEND_REDIRECTS)
+			print_bool(PRINT_ANY, "send_redirects", "send_redirects %s ",
+				   conf[IPV4_DEVCONF_SEND_REDIRECTS - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_SHARED_MEDIA)
+			print_bool(PRINT_ANY, "shared_media", "shared_media %s ",
+				   conf[IPV4_DEVCONF_SHARED_MEDIA - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_RP_FILTER)
+			print_int(PRINT_ANY, "rp_filter", "rp_filter %d ",
+				  conf[IPV4_DEVCONF_RP_FILTER - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE)
+			print_bool(PRINT_ANY, "accept_source_route",
+				   "accept_source_route %s ",
+				   conf[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_BOOTP_RELAY)
+			print_bool(PRINT_ANY, "bootp_relay", "bootp_relay %s ",
+				   conf[IPV4_DEVCONF_BOOTP_RELAY - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_LOG_MARTIANS)
+			print_bool(PRINT_ANY, "log_martians", "log_martians %s ",
+				   conf[IPV4_DEVCONF_LOG_MARTIANS - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_TAG)
+			print_int(PRINT_ANY, "tag", "tag %d ",
+				  conf[IPV4_DEVCONF_TAG - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ARPFILTER)
+			print_bool(PRINT_ANY, "arpfilter", "arpfilter %s ",
+				   conf[IPV4_DEVCONF_ARPFILTER - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_MEDIUM_ID)
+			print_int(PRINT_ANY, "medium_id", "medium_id %d ",
+				  conf[IPV4_DEVCONF_MEDIUM_ID - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_NOXFRM)
+			print_bool(PRINT_ANY, "noxfrm", "noxfrm %s ",
+				   conf[IPV4_DEVCONF_NOXFRM - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_NOPOLICY)
+			print_bool(PRINT_ANY, "nopolicy", "nopolicy %s ",
+				   conf[IPV4_DEVCONF_NOPOLICY - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_FORCE_IGMP_VERSION)
+			print_int(PRINT_ANY, "force_igmp_version", "force_igmp_version %d ",
+				  conf[IPV4_DEVCONF_FORCE_IGMP_VERSION - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ARP_ANNOUNCE)
+			print_int(PRINT_ANY, "arp_announce", "arp_announce %d ",
+				  conf[IPV4_DEVCONF_ARP_ANNOUNCE - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ARP_IGNORE)
+			print_int(PRINT_ANY, "arp_ignore", "arp_ignore %d ",
+				  conf[IPV4_DEVCONF_ARP_IGNORE - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_PROMOTE_SECONDARIES)
+			print_bool(PRINT_ANY, "promote_secondaries",
+				   "promote_secondaries %s ",
+				   conf[IPV4_DEVCONF_PROMOTE_SECONDARIES - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ARP_ACCEPT)
+			print_int(PRINT_ANY, "arp_accept", "arp_accept %d ",
+				  conf[IPV4_DEVCONF_ARP_ACCEPT - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ARP_NOTIFY)
+			print_bool(PRINT_ANY, "arp_notify", "arp_notify %s ",
+				   conf[IPV4_DEVCONF_ARP_NOTIFY - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ACCEPT_LOCAL)
+			print_bool(PRINT_ANY, "accept_local", "accept_local %s ",
+				   conf[IPV4_DEVCONF_ACCEPT_LOCAL - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_SRC_VMARK)
+			print_bool(PRINT_ANY, "src_vmark", "src_vmark %s ",
+				   conf[IPV4_DEVCONF_SRC_VMARK - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_PROXY_ARP_PVLAN)
+			print_bool(PRINT_ANY, "proxy_arp_pvlan", "proxy_arp_pvlan %s ",
+				   conf[IPV4_DEVCONF_PROXY_ARP_PVLAN - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ROUTE_LOCALNET)
+			print_bool(PRINT_ANY, "route_localnet", "route_localnet %s ",
+				   conf[IPV4_DEVCONF_ROUTE_LOCALNET - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_BC_FORWARDING)
+			print_bool(PRINT_ANY, "bc_forwarding", "bc_forwarding %s ",
+				   conf[IPV4_DEVCONF_BC_FORWARDING - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL)
+			print_int(PRINT_ANY, "igmpv2_unsolicited_report_interval",
+				  "igmpv2_unsolicited_report_interval %d ",
+				  conf[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL)
+			print_int(PRINT_ANY, "igmpv3_unsolicited_report_interval",
+				  "igmpv3_unsolicited_report_interval %d ",
+				  conf[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN)
+			print_bool(PRINT_ANY, "ignore_routes_with_linkdown",
+				   "ignore_routes_with_linkdown %s ",
+				   conf[IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST)
+			print_bool(PRINT_ANY, "drop_unicast_in_l2_multicast",
+				   "drop_unicast_in_l2_multicast %s ",
+				   conf[IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_DROP_GRATUITOUS_ARP)
+			print_bool(PRINT_ANY, "drop_gratuitous_arp",
+				   "drop_gratuitous_arp %s ",
+				   conf[IPV4_DEVCONF_DROP_GRATUITOUS_ARP - 1]);
+
+		if (max_elements >= IPV4_DEVCONF_ARP_EVICT_NOCARRIER)
+			print_bool(PRINT_ANY, "arp_evict_nocarrier",
+				   "arp_evict_nocarrier %s ",
+				   conf[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1]);
+	}
+}
+
 static void print_af_spec(FILE *fp, struct rtattr *af_spec_attr)
 {
-	struct rtattr *inet6_attr;
 	struct rtattr *tb[IFLA_INET6_MAX + 1];
+	struct rtattr *inet6_attr;
+	struct rtattr *inet_attr;
 
-	inet6_attr = parse_rtattr_one_nested(AF_INET6, af_spec_attr);
-	if (!inet6_attr)
-		return;
+	inet_attr = parse_rtattr_one_nested(AF_INET, af_spec_attr);
+	if (inet_attr)
+		print_inet(fp, inet_attr);
 
-	parse_rtattr_nested(tb, IFLA_INET6_MAX, inet6_attr);
+	inet6_attr = parse_rtattr_one_nested(AF_INET6, af_spec_attr);
+	if (inet6_attr) {
+		parse_rtattr_nested(tb, IFLA_INET6_MAX, inet6_attr);
 
-	if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
-		__u8 mode = rta_getattr_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
-		SPRINT_BUF(b1);
+		if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
+			__u8 mode = rta_getattr_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
 
-		switch (mode) {
-		case IN6_ADDR_GEN_MODE_EUI64:
-			print_string(PRINT_ANY,
-				     "inet6_addr_gen_mode",
-				     "addrgenmode %s ",
-				     "eui64");
-			break;
-		case IN6_ADDR_GEN_MODE_NONE:
-			print_string(PRINT_ANY,
-				     "inet6_addr_gen_mode",
-				     "addrgenmode %s ",
-				     "none");
-			break;
-		case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
-			print_string(PRINT_ANY,
-				     "inet6_addr_gen_mode",
-				     "addrgenmode %s ",
-				     "stable_secret");
-			break;
-		case IN6_ADDR_GEN_MODE_RANDOM:
-			print_string(PRINT_ANY,
-				     "inet6_addr_gen_mode",
-				     "addrgenmode %s ",
-				     "random");
-			break;
-		default:
-			snprintf(b1, sizeof(b1), "%#.2hhx", mode);
-			print_string(PRINT_ANY,
-				     "inet6_addr_gen_mode",
-				     "addrgenmode %s ",
-				     b1);
-			break;
+			SPRINT_BUF(b1);
+			switch (mode) {
+			case IN6_ADDR_GEN_MODE_EUI64:
+				print_string(PRINT_ANY,
+					     "inet6_addr_gen_mode",
+					     "addrgenmode %s ",
+					     "eui64");
+				break;
+			case IN6_ADDR_GEN_MODE_NONE:
+				print_string(PRINT_ANY,
+					     "inet6_addr_gen_mode",
+					     "addrgenmode %s ",
+					     "none");
+				break;
+			case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
+				print_string(PRINT_ANY,
+					     "inet6_addr_gen_mode",
+					     "addrgenmode %s ",
+					     "stable_secret");
+				break;
+			case IN6_ADDR_GEN_MODE_RANDOM:
+				print_string(PRINT_ANY,
+					     "inet6_addr_gen_mode",
+					     "addrgenmode %s ",
+					     "random");
+				break;
+			default:
+				snprintf(b1, sizeof(b1), "%#.2hhx", mode);
+				print_string(PRINT_ANY,
+					     "inet6_addr_gen_mode",
+					     "addrgenmode %s ",
+					     b1);
+				break;
+			}
 		}
 	}
 }
-- 
2.54.0


^ permalink raw reply related

* [PATCH net v2 2/2] selftests: pmtu: fix incorrect PMTU exception generation
From: Laika Price via B4 Relay @ 2026-06-13 15:12 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan
  Cc: netdev, linux-kernel, linux-kselftest, Laika Price
In-Reply-To: <20260613-master-v2-0-061b70fd45dd@gmail.com>

From: Laika Price <laikabcprice@gmail.com>

pmtu_ipv4_br_vxlan4_exception generates PMTU exceptions by pinging an IP
on the other side of a tunnel. This was incorrect as it would return upon
the first ICMP Fragmentation Needed due to the -w flag being used in
conjunction with || return 1.

This patch updates pmtu_ipv4_br_vxlan4_exception to be in line with how
PMTU exceptions are generated in other tests, such as test_pmtu_ipvX:

    run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst1}
    run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst2}

Signed-off-by: Laika Price <laikabcprice@gmail.com>
---
 tools/testing/selftests/net/pmtu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index a3323c21f..9498d9f53 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -1456,8 +1456,8 @@ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() {
 	mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000))
 	mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
 
-	run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -c 10 -s $((${ll_mtu} + 500)) ${dst} || return 1
-	run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1  -s $((${ll_mtu} + 500)) ${dst} || return 1
+	run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst}
+	run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst}
 
 	# Check that exceptions were created
 	pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"

-- 
2.54.0



^ permalink raw reply related

* [PATCH net v2 1/2] ip_tunnel: drop stale dst from generated PMTU ICMP replies
From: Laika Price via B4 Relay @ 2026-06-13 15:12 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan
  Cc: netdev, linux-kernel, linux-kselftest, Laika Price
In-Reply-To: <20260613-master-v2-0-061b70fd45dd@gmail.com>

From: Laika Price <laikabcprice@gmail.com>

iptunnel_pmtud_build_icmp(...) and iptunnel_pmtud_build_icmpv6(...) take
in an sk_buff, modify it to create a PMTU ICMP error reply, and return it.
As part of these modifications, the source/destination ethernet and IP
addresses are swapped around which makes the sk_buff's current dst invalid.

If the stale dst is left, the packet can skip input routing and be
forwarded using the original output device. This was observed when sending
packets to a VXLAN over a WireGuard tunnel - the ICMP reply was generated
but it was sent over the VXLAN instead of to the WireGuard tunnel.

Drop the stale dst after building the PMTU reply so that the packet is
routed using its new headers when it is reinjected.

Signed-off-by: Laika Price <laikabcprice@gmail.com>
---
 net/ipv4/ip_tunnel_core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index d3c677e9b..949150e43 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -267,6 +267,7 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
 
 	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
 	skb_reset_mac_header(skb);
+	skb_dst_drop(skb);
 
 	return skb->len;
 }
@@ -370,6 +371,7 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
 
 	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
 	skb_reset_mac_header(skb);
+	skb_dst_drop(skb);
 
 	return skb->len;
 }

-- 
2.54.0



^ permalink raw reply related

* [PATCH net v2 0/2] ip_tunnel: fix PMTU ICMP reply routing
From: Laika Price via B4 Relay @ 2026-06-13 15:12 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Shuah Khan
  Cc: netdev, linux-kernel, linux-kselftest, Laika Price



---
Changes in v2:
- Fix incorrect PMTU exceptions test
- Link to v1: https://patch.msgid.link/20260613-master-v1-1-df796e8e2d74@gmail.com

To: David Ahern <dsahern@kernel.org>
To: Ido Schimmel <idosch@nvidia.com>
To: "David S. Miller" <davem@davemloft.net>
To: Eric Dumazet <edumazet@google.com>
To: Jakub Kicinski <kuba@kernel.org>
To: Paolo Abeni <pabeni@redhat.com>
To: Simon Horman <horms@kernel.org>
To: Shuah Khan <shuah@kernel.org>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-kselftest@vger.kernel.org

---
Laika Price (2):
      [net] ip_tunnel: drop stale dst from generated PMTU ICMP replies
      [net] selftests: pmtu: fix incorrect PMTU exception generation

 net/ipv4/ip_tunnel_core.c           | 2 ++
 tools/testing/selftests/net/pmtu.sh | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)
---
base-commit: 2a2974b5145cdf2f4db134be1a2157e9ca4a1cf0
change-id: 20260613-master-a299166b9069

Best regards,
--  
Laika Price <laikabcprice@gmail.com>



^ permalink raw reply

* [PATCH net] appletalk: aarp: fix proxy probe conflict lookup
From: Yizhou Zhao @ 2026-06-13 15:00 UTC (permalink / raw)
  To: netdev
  Cc: Yizhou Zhao, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Kito Xu (veritas501), Kees Cook,
	linux-kernel, Yuxiang Yang, Ao Wang, Xuewei Feng, Qi Li, Ke Xu,
	stable

aarp_rcv() computes hash from the packet source node and later uses it
for the normal AARP reply lookup against the unresolved table. The same
hash is also reused earlier for the proxy probe conflict check, but that
check builds its lookup key from the packet destination address.

Proxy AARP entries are inserted into the proxy table using the proxied
address node as the hash key. AARP packets are not required to have the
same source and destination node numbers, so the proxy probe conflict
check can search the wrong bucket and miss an entry that is still in
ATIF_PROBE state.

If that happens, SIOCSARP can accept a proxy address even though a
conflicting AARP packet was observed on the wire. This can create
duplicate AppleTalk address ownership. Depending on the network setup,
traffic for that address may then be misdirected, or the address may
become intermittently unreachable.

Look up the proxy probe entry using a hash derived from da.s_node, which
matches how proxy entries are inserted and removed. Leave the source-node
hash unchanged for the later unresolved-entry reply handling.

In a veth/SNAP/AARP reproducer on a KASAN-enabled kernel, a conflicting
AARP packet with different source and destination nodes allowed SIOCSARP
to succeed before this change. With this change, the same conflict
returns EADDRINUSE, while a no-conflict proxy add still succeeds.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable@vger.kernel.org
Reported-by: Yizhou Zhao <zhaoyz24@mails.tsinghua.edu.cn>
Reported-by: Yuxiang Yang <yangyx22@mails.tsinghua.edu.cn>
Reported-by: Ao Wang <wangao@seu.edu.cn>
Reported-by: Xuewei Feng <fengxw06@126.com>
Reported-by: Qi Li <qli01@tsinghua.edu.cn>
Reported-by: Ke Xu <xuke@tsinghua.edu.cn>
Assisted-by: GLM:GLM-5.1
Signed-off-by: Yizhou Zhao <zhaoyz24@mails.tsinghua.edu.cn>
---
 net/appletalk/aarp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 078fb7a6efa5..1352ede79668 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -755,7 +755,8 @@ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
 	da.s_net  = ea->pa_dst_net;
 
 	write_lock_bh(&aarp_lock);
-	a = __aarp_find_entry(proxies[hash], dev, &da);
+	a = __aarp_find_entry(proxies[da.s_node % (AARP_HASH_SIZE - 1)],
+			      dev, &da);
 
 	if (a && a->status & ATIF_PROBE) {
 		a->status |= ATIF_PROBE_FAIL;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH] r8152: add vendor/device ID for CoreChips SR9900
From: Nicolai Buchwitz @ 2026-06-13 14:57 UTC (permalink / raw)
  To: zjzhao; +Cc: hayeswang, andrew+netdev, linux-usb, netdev, linux-kernel
In-Reply-To: <20260613090154.1975753-1-zjzhao@edatec.cn>

On 13.6.2026 11:01, zjzhao@edatec.cn wrote:
> From: zjzhao-eda <zjzhao@edatec.cn>
> 
> The CoreChips SR9900 (0x0fe6:0x9900) is a USB 2.0 10/100
> Ethernet adapter. Testing shows it works correctly with the
> r8152 driver, reaching wire speed (94 Mbps) with zero packet
> loss on both TCP and UDP.

Do you know how it differs from the other CoreChips devices (e.g.
drivers/net/usb/sr9800.c and others)?

> 
> Tested on Raspberry Pi, including hotplug and extended data
> transfer.
> 
> Signed-off-by: zjzhao-eda <zjzhao@edatec.cn>
> ---
>  drivers/net/usb/r8152.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
> index d61074178279..ea1733e3619c 100644
> --- a/drivers/net/usb/r8152.c
> +++ b/drivers/net/usb/r8152.c
> @@ -10062,6 +10062,7 @@ static const struct usb_device_id 
> rtl8152_table[] = {
>  	{ USB_DEVICE(VENDOR_ID_DELL,    0xb097) },
>  	{ USB_DEVICE(VENDOR_ID_ASUS,    0x1976) },
>  	{ USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02b) },
> +	{ USB_DEVICE(0x0fe6, 0x9900) },

Instead of the hardcoded 0x0fe6, please add a proper VENDOR_ID define to
include/linux/usb/r8152.h.

>  	{}
>  };

Thanks
Nicolai

^ permalink raw reply

* Re: [PATCH 6.12.y v3 0/2] xfrm: hold dev ref until after transport_finish NF_HOOK
From: Sasha Levin @ 2026-06-13 14:51 UTC (permalink / raw)
  To: Steffen Klassert, Herbert Xu, David S . Miller, David Ahern,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman, netdev,
	linux-kernel, stable, Simon Liebold
  Cc: Sasha Levin, Simon Liebold
In-Reply-To: <20260612111327.1613710-1-simonlie@amazon.de>

On Fri, Jun 12, 2026 at 11:13:25AM +0000, Simon Liebold wrote:
> Thanks for the detailed analysis on v2, Sasha. Here's v3.
>
> v3: Backport b05d42eefac7 ("xfrm: hold device only for the asynchronous
> decryption") as a prerequisite, making the tree structurally match mainline so
> the fix applies without the lifetime gap Sasha identified in v2, where the
> dev_put at resume: dropped the ref before the re-hold could cover it.

Whole series queued for 6.12.y, thanks.

--
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH net-next v7 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs
From: Simon Schippers @ 2026-06-13 14:14 UTC (permalink / raw)
  To: hawk, netdev
  Cc: kernel-team, Jonas Köppeler, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Alexei Starovoitov,
	Daniel Borkmann, John Fastabend, Stanislav Fomichev, linux-kernel,
	bpf
In-Reply-To: <20260612083530.1650245-6-hawk@kernel.org>

On 6/12/26 10:35, hawk@kernel.org wrote:
> From: Simon Schippers <simon.schippers@tu-dortmund.de>
> 
> Per-packet BQL completion forces DQL to converge on limit=2, causing
> excessive NAPI scheduling overhead and qdisc requeues.
> 
> Accumulate BQL completions and flush them when a configurable time
> threshold (tx-usecs) is exceeded, letting DQL discover a limit that
> bounds actual queuing delay to the configured interval. Coalescing
> state persists across NAPI polls in struct veth_rq so completions can
> accumulate beyond a single budget=64 cycle.
> 
> The flush condition is:
> 
> state->time + bql_flush_ns <= current_time || state->n_bql > dql.limit
> 
> Flushing when n_bql exceeds dql.limit handles BQL starvation.
> 
> The comparison is strictly greater-than because netdev_tx_sent_queue()
> always lets the producer exceed the limit by one before it stops, so
> n_bql == dql.limit is a normal in-flight state. dql.limit lives in
> the same cacheline as the completion path, so the check is cheap.
> 
> Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
> setting tx-usecs to 0 disables coalescing and falls back to per-packet
> completion.
> 
>   ethtool -C <veth-dev> tx-usecs 500  # 500us coalescing
>   ethtool -C <veth-dev> tx-usecs 0    # per-packet (no coalescing)
> 
> Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
> Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
> Co-developed-by: Jonas Köppeler <j.koeppeler@tu-berlin.de>
> Signed-off-by: Jonas Köppeler <j.koeppeler@tu-berlin.de>
> Signed-off-by: Simon Schippers <simon.schippers@tu-dortmund.de>
> ---
>  drivers/net/veth.c | 123 ++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 117 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 2473f730734b..c62d87a8402c 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -28,6 +28,7 @@
>  #include <linux/bpf_trace.h>
>  #include <linux/net_tstamp.h>
>  #include <linux/skbuff_ref.h>
> +#include <linux/sched/clock.h>
>  #include <net/page_pool/helpers.h>
>  
>  #define DRV_NAME	"veth"
> @@ -50,6 +51,7 @@
>   * delay => 64 * 250 ms = 16 s.
>   */
>  #define VETH_WATCHDOG_TIMEOUT_MS	(64 * 250)
> +#define VETH_BQL_COAL_TX_USECS	100 /* default tx-usecs for BQL batching */
>  
>  struct veth_stats {
>  	u64	rx_drops;
> @@ -69,6 +71,11 @@ struct veth_rq_stats {
>  	struct u64_stats_sync	syncp;
>  };
>  
> +struct veth_bql_state {
> +	u64	time;	/* sched_clock() when current coalescing window started */
> +	uint	n_bql;	/* BQL completions batched in the current window */
> +};
> +
>  struct veth_rq {
>  	struct napi_struct	xdp_napi;
>  	struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
> @@ -76,6 +83,7 @@ struct veth_rq {
>  	struct bpf_prog __rcu	*xdp_prog;
>  	struct xdp_mem_info	xdp_mem;
>  	struct veth_rq_stats	stats;
> +	struct veth_bql_state	bql_state;
>  	bool			rx_notify_masked;
>  	struct ptr_ring		xdp_ring;
>  	struct xdp_rxq_info	xdp_rxq;
> @@ -88,6 +96,7 @@ struct veth_priv {
>  	struct bpf_prog		*_xdp_prog;
>  	struct veth_rq		*rq;
>  	unsigned int		requested_headroom;
> +	unsigned int		tx_coal_usecs;	/* BQL completion coalescing */
>  };
>  
>  struct veth_xdp_tx_bq {
> @@ -272,7 +281,56 @@ static void veth_get_channels(struct net_device *dev,
>  static int veth_set_channels(struct net_device *dev,
>  			     struct ethtool_channels *ch);
>  
> +static int veth_get_coalesce(struct net_device *dev,
> +			     struct ethtool_coalesce *ec,
> +			     struct kernel_ethtool_coalesce *kernel_coal,
> +			     struct netlink_ext_ack *extack)
> +{
> +	struct veth_priv *priv = netdev_priv(dev);
> +
> +	ec->tx_coalesce_usecs = priv->tx_coal_usecs;
> +	return 0;
> +}
> +
> +static int veth_set_coalesce(struct net_device *dev,
> +			     struct ethtool_coalesce *ec,
> +			     struct kernel_ethtool_coalesce *kernel_coal,
> +			     struct netlink_ext_ack *extack)
> +{
> +	struct veth_priv *priv = netdev_priv(dev);
> +	struct net_device *peer;
> +
> +	/* The coalescing window delays BQL completions, so keep tx-usecs well
> +	 * below the tx_timeout watchdog; otherwise a large value could stall a
> +	 * stopped queue long enough to trip a false watchdog timeout. Cap at
> +	 * half the watchdog to leave a generous safety margin. tx-usecs is
> +	 * microseconds, the watchdog is milliseconds.
> +	 */
> +	if (ec->tx_coalesce_usecs > VETH_WATCHDOG_TIMEOUT_MS / 2 * USEC_PER_MSEC) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "tx-usecs must stay below half the tx_timeout watchdog");
> +		return -ERANGE;
> +	}
> +
> +	/* Paired with READ_ONCE in veth_xdp_rcv(). */
> +	WRITE_ONCE(priv->tx_coal_usecs, ec->tx_coalesce_usecs);
> +
> +	/* veth_xdp_rcv() reads each device's own value, so mirror it onto
> +	 * the peer to keep the pair symmetric: both directions coalesce
> +	 * with the same tx-usecs. Called under RTNL, rtnl_dereference() is safe.
> +	 */
> +	peer = rtnl_dereference(priv->peer);
> +	if (peer) {
> +		struct veth_priv *peer_priv = netdev_priv(peer);
> +
> +		WRITE_ONCE(peer_priv->tx_coal_usecs, ec->tx_coalesce_usecs);
> +	}
> +
> +	return 0;
> +}
> +
>  static const struct ethtool_ops veth_ethtool_ops = {
> +	.supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS,
>  	.get_drvinfo		= veth_get_drvinfo,
>  	.get_link		= ethtool_op_get_link,
>  	.get_strings		= veth_get_strings,
> @@ -282,6 +340,8 @@ static const struct ethtool_ops veth_ethtool_ops = {
>  	.get_ts_info		= ethtool_op_get_ts_info,
>  	.get_channels		= veth_get_channels,
>  	.set_channels		= veth_set_channels,
> +	.get_coalesce		= veth_get_coalesce,
> +	.set_coalesce		= veth_set_coalesce,
>  };
>  
>  /* general routines */
> @@ -969,13 +1029,54 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
>  	return NULL;
>  }
>  
> +static void veth_bql_maybe_complete(struct veth_bql_state *state,
> +				    struct netdev_queue *peer_txq,
> +				    u64 bql_flush_ns)
> +{
> +	u64 current_time;
> +
> +	/* There is no reason to complete with 0 and
> +	 * peer_txq could go away.
> +	 */
> +	if (!state->n_bql || !peer_txq)
> +		return;
> +
> +	current_time = sched_clock();
> +
> +	/* We complete if:
> +	 * 1. We reach bql_flush_ns.
> +	 * 2. We potentially have BQL starvation.
> +	 */
> +	if (state->time + bql_flush_ns <= current_time ||
> +	    state->n_bql > peer_txq->dql.limit) {

Both Sashiko-Nipa and Sashiko-Gemini are right: this is missing an
#ifdef CONFIG_BQL guard. I am not sure what the best way to add one is.
And for the struct we could maybe do:

#ifdef CONFIG_BQL
struct veth_bql_state {
    u64	time;	/* sched_clock() when current coalescing window started */
    uint	n_bql;	/* BQL completions batched in the current window */
};
#else
struct veth_bql_state {};
#endif

> +		netdev_tx_completed_queue(peer_txq, state->n_bql,
> +					  state->n_bql * VETH_BQL_UNIT);
> +		state->time = current_time;
> +		state->n_bql = 0;
> +	}
> +}
> +
>  static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>  			struct veth_xdp_tx_bq *bq,
>  			struct veth_stats *stats,
>  			struct netdev_queue *peer_txq)
>  {
> +	struct veth_priv *priv = netdev_priv(rq->dev);
> +	struct veth_bql_state *state = &rq->bql_state;
>  	int i, done = 0, n_xdpf = 0;
>  	void *xdpf[VETH_XDP_BATCH];
> +	u64 bql_flush_ns;
> +
> +	/* Mirrored to both peers; paired with WRITE_ONCE() in veth_set_coalesce */
> +	bql_flush_ns = (u64)READ_ONCE(priv->tx_coal_usecs) * 1000;
> +
> +	/* Clamp stored timestamp in case we migrated to a CPU with a behind
> +	 * sched_clock(); tries to reduce late BQL flushes.
> +	 */
> +	state->time = min(state->time, sched_clock());
> +
> +	/* Flush completions that timed out since the previous NAPI poll. */
> +	veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
>  
>  	for (i = 0; i < budget; i++) {
>  		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
> @@ -1000,12 +1101,11 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>  			}
>  		} else {
>  			/* ndo_start_xmit */
> -			bool bql_charged = veth_ptr_is_bql(ptr);
>  			struct sk_buff *skb = veth_ptr_to_skb(ptr);
>  
> +			if (veth_ptr_is_bql(ptr))
> +				state->n_bql++;
>  			stats->xdp_bytes += skb->len;
> -			if (peer_txq && bql_charged)
> -				netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);
>  
>  			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
>  			if (skb) {
> @@ -1015,6 +1115,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
>  					napi_gro_receive(&rq->xdp_napi, skb);
>  			}
>  		}
> +		veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
>  		done++;

Sashiko-Nipa reports:

"If veth_xdp_rcv() finishes and returns a done count less than the budget,
NAPI will go to sleep in veth_poll(). Do we need to unconditionally flush
any stranded BQL completions in veth_poll() before sleeping?
If completions are left in rq->bql_state indefinitely across NAPI idle
periods, it might present an artificially massive delay to DQL. This could
cause DQL to mistakenly conclude the hardware is extremely slow and
aggressively shrink dql.limit to its minimum, crippling throughput on
subsequent bursts."

This is again the issue that I found to be non-problematic in [1]; it can
be seen as a BQL inflight > 0 when, for example, pktgen suddenly stops.

If we were to "unconditionally flush any stranded BQL completions in
veth_poll() before sleeping", we would *not* accumulate BQL completions
across NAPI polls, but accumulating them is exactly what we want.

Do you agree?

[1] https://lore.kernel.org/netdev/c8650d3a-e488-4279-b28f-549d766c23a1@tu-dortmund.de/

^ permalink raw reply

* [PATCH net-next] selftests/net/openvswitch: add ICMPv6 echo type match test
From: Minxi Hou @ 2026-06-13 14:14 UTC (permalink / raw)
  To: netdev
  Cc: aconole, echaudro, i.maximets, davem, edumazet, kuba, pabeni,
	horms, shuah, dev, linux-kselftest, Minxi Hou

Register OVS_KEY_ATTR_ICMPV6 in the flow key parser so that
icmpv6(type=...) can be used in flow specifications. Without this
registration the parser silently drops the token and the kernel
rejects the flow with EINVAL because the expected ICMPv6 key
attribute is missing.

While here, add convert_int() to the ovs_key_ipv6 and ovs_key_icmp
fields_map entries so that specifying a field value produces the
correct wildcard mask (0xff for bytes, 0xffffffff for the label)
instead of using the value itself as the mask. The ipv4 counterpart
already does this via convert_int(); the ipv6 and icmp classes were
simply missing the fifth tuple element. Existing callers that pass
empty parentheses are unaffected because convert_int("") returns
(0, 0).

Add test_icmpv6 exercising the ICMPv6 echo flow key. The test uses
static neighbour entries to bypass NDP, then verifies in three steps:
install icmpv6(type=128) and icmpv6(type=129) flows and confirm ping
works, remove the flows and confirm ping fails, reinstall and confirm
recovery.

Signed-off-by: Minxi Hou <houminxi@gmail.com>
---
 .../selftests/net/openvswitch/openvswitch.sh  | 63 +++++++++++++++++++
 .../selftests/net/openvswitch/ovs-dpctl.py    | 26 +++++---
 2 files changed, 82 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh
index d533decca5c1..8923224fa88e 100755
--- a/tools/testing/selftests/net/openvswitch/openvswitch.sh
+++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh
@@ -31,6 +31,7 @@ tests="
 	pop_vlan				vlan: POP_VLAN action strips tag
 	dec_ttl					ttl: dec_ttl decrements IP TTL
 	flow_set				flow-set: Flow modify
+	icmpv6					icmpv6: ICMPv6 echo type match
 	psample					psample: Sampling packets with psample"
 
 info() {
@@ -377,6 +378,68 @@ test_flow_set() {
 	return 0
 }
 
+test_icmpv6() {
+	sbx_add "test_icmpv6" || return $?
+	ovs_add_dp "test_icmpv6" icmpv6 || return 1
+
+	info "create namespaces"
+	for ns in client server; do
+		ovs_add_netns_and_veths "test_icmpv6" "icmpv6" \
+			"$ns" "${ns:0:1}0" "${ns:0:1}1" || return 1
+	done
+
+	ip netns exec client ip addr add fd00::1/64 dev c1 nodad
+	ip netns exec client ip link set c1 up
+	ip netns exec server ip addr add fd00::2/64 dev s1 nodad
+	ip netns exec server ip link set s1 up
+
+	local cl_mac sl_mac
+	cl_mac=$(ip netns exec client \
+		ip link show c1 | awk '/link\/ether/ {print $2}')
+	[ -z "$cl_mac" ] && \
+		{ info "failed to get c1 hwaddr"; return 1; }
+	sl_mac=$(ip netns exec server \
+		ip link show s1 | awk '/link\/ether/ {print $2}')
+	[ -z "$sl_mac" ] && \
+		{ info "failed to get s1 hwaddr"; return 1; }
+	ip netns exec client \
+		ip -6 neigh add fd00::2 lladdr "$sl_mac" dev c1
+	ip netns exec server \
+		ip -6 neigh add fd00::1 lladdr "$cl_mac" dev s1
+
+	ovs_add_flow "test_icmpv6" icmpv6 \
+	  'in_port(1),eth(),eth_type(0x86dd),ipv6(proto=58),icmpv6(type=128)' \
+	  '2' || return 1
+	ovs_add_flow "test_icmpv6" icmpv6 \
+	  'in_port(2),eth(),eth_type(0x86dd),ipv6(proto=58),icmpv6(type=129)' \
+	  '1' || return 1
+
+	info "verify ICMPv6 echo with type-specific flows"
+	ovs_sbx "test_icmpv6" ip netns exec client \
+		ping -6 -c 1 -W 2 fd00::2 || return 1
+
+	ovs_del_flows "test_icmpv6" icmpv6
+
+	info "verify ping fails without echo flows"
+	ovs_sbx "test_icmpv6" ip netns exec client \
+		ping -6 -c 1 -W 2 fd00::2 >/dev/null 2>&1 \
+		&& { info "FAIL: ping should fail without flows"
+		     return 1; }
+
+	ovs_add_flow "test_icmpv6" icmpv6 \
+	  'in_port(1),eth(),eth_type(0x86dd),ipv6(proto=58),icmpv6(type=128)' \
+	  '2' || return 1
+	ovs_add_flow "test_icmpv6" icmpv6 \
+	  'in_port(2),eth(),eth_type(0x86dd),ipv6(proto=58),icmpv6(type=129)' \
+	  '1' || return 1
+
+	info "verify connectivity restored"
+	ovs_sbx "test_icmpv6" ip netns exec client \
+		ping -6 -c 1 -W 2 fd00::2 || return 1
+
+	return 0
+}
+
 # psample test
 # - use psample to observe packets
 test_psample() {
diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
index e1ecfad2c03e..049791b2573b 100644
--- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
+++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py
@@ -1255,11 +1255,16 @@ class ovskey(nla):
                 lambda x: ipaddress.IPv6Address(x).packed if x else 0,
                 convert_ipv6,
             ),
-            ("label", "label", "%d", lambda x: int(x) if x else 0),
-            ("proto", "proto", "%d", lambda x: int(x) if x else 0),
-            ("tclass", "tclass", "%d", lambda x: int(x) if x else 0),
-            ("hlimit", "hlimit", "%d", lambda x: int(x) if x else 0),
-            ("frag", "frag", "%d", lambda x: int(x) if x else 0),
+            ("label", "label", "%d", lambda x: int(x) if x else 0,
+                convert_int(32)),
+            ("proto", "proto", "%d", lambda x: int(x) if x else 0,
+                convert_int(8)),
+            ("tclass", "tclass", "%d", lambda x: int(x) if x else 0,
+                convert_int(8)),
+            ("hlimit", "hlimit", "%d", lambda x: int(x) if x else 0,
+                convert_int(8)),
+            ("frag", "frag", "%d", lambda x: int(x) if x else 0,
+                convert_int(8)),
         )
 
         def __init__(
@@ -1344,8 +1349,10 @@ class ovskey(nla):
         )
 
         fields_map = (
-            ("type", "type", "%d", lambda x: int(x) if x else 0),
-            ("code", "code", "%d", lambda x: int(x) if x else 0),
+            ("type", "type", "%d", lambda x: int(x) if x else 0,
+                convert_int(8)),
+            ("code", "code", "%d", lambda x: int(x) if x else 0,
+                convert_int(8)),
         )
 
         def __init__(
@@ -1982,6 +1989,11 @@ class ovskey(nla):
                 "icmp",
                 ovskey.ovs_key_icmp,
             ),
+            (
+                "OVS_KEY_ATTR_ICMPV6",
+                "icmpv6",
+                ovskey.ovs_key_icmpv6,
+            ),
             (
                 "OVS_KEY_ATTR_TCP_FLAGS",
                 "tcp_flags",
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH net-next v2 1/2] netdev: expose io_uring rx_page_order order via netlink
From: Dragos Tatulea @ 2026-06-13 14:09 UTC (permalink / raw)
  To: Pavel Begunkov, Donald Hunter, Jakub Kicinski, David S. Miller,
	Eric Dumazet, Paolo Abeni, Simon Horman, Andrew Lunn, Jens Axboe
  Cc: Yael Chemla, Tariq Toukan, netdev, linux-kernel, io-uring
In-Reply-To: <d0401fab-61c5-43e7-93ae-d4757433eb7a@gmail.com>



On 13.06.26 11:53, Pavel Begunkov wrote:
> On 6/12/26 22:17, Dragos Tatulea wrote:
>> This adds observability for the io_uring zcrx rx-buf-len configuration.
> 
> It might be nicer to look it up in the queue, e.g. rxq->mp_params,
> and make it a queue attribute instead of zcrx specific one. In either
> case, no objections.
> 
In io_pp_nl_fill() or in page_pool_nl_fill() as it was done in v1 for order?

Thanks,
Dragos

^ permalink raw reply

* Re: [PATCH net-next v7 0/5] veth: add Byte Queue Limits (BQL) support
From: Simon Schippers @ 2026-06-13 13:57 UTC (permalink / raw)
  To: Jonas Köppeler, hawk, netdev
  Cc: kernel-team, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Chris Arges, Mike Freemon,
	Toke Høiland-Jørgensen, Breno Leitao,
	Alexei Starovoitov, Daniel Borkmann, John Fastabend,
	Stanislav Fomichev, bpf
In-Reply-To: <4ddf3bcb-db5d-4821-ab32-577de93973a7@tu-berlin.de>

On 6/12/26 19:21, Jonas Köppeler wrote:
> On 6/12/26 16:10, Simon Schippers wrote:
>> On 6/12/26 10:35, hawk@kernel.org wrote:
>>> From: Jesper Dangaard Brouer <hawk@kernel.org>
>>>
>>> This series adds BQL (Byte Queue Limits) to the veth driver, reducing
>>> latency by dynamically limiting in-flight packets in the ptr_ring and
>>> moving buffering into the qdisc where AQM algorithms can act on it.
>>
>> LGTM, thanks for the detailed changelog :)
>>
>> Maybe we should stop searching for the perfect tx-usecs value.
>> 100us is probably fine for most hardware to not have a performance
>> regression. And lowering it does not really improve the RTT anyways.
>> Do you agree?
> I agree, I already thought that it just might be a very lucky case when using 50us where something accidentally aligns nicely. Interestingly, I could also reproduce that 50us was consistently a little better compared to 100us on an Intel CPU. Maybe if I get the time, I'll have another look at it, but in general I think 50us or 100us does not really matter.
> 

Interesting.

I ran the benchmarks again, the results are at [1].
I tested values between 0-9us, 10-90us, 100, 500, 1000, 5000 and 10000us.

TLDR: Throughput is fine for everything > 0us. RTT only improves
      slightly for < 100us. So 100us is fine.

[1] https://github.com/simoschip2000/veth-backpressure-performance-testing/blob/v7/results/tx-usecs/text_sweep.txt

>>
>> Nevertheless, I will compile and run the benchmarks again.
>>
>> I will go on vacation from 15th to 24th of June, so I will not be able
>> to contribute code or run benchmarks then.
>>
>> Thanks,
>> Simon
>>
> 

^ permalink raw reply

* Re: [RFC net-next 08/15] ipxlat: add translation engine and dispatch core
From: Ralf Lici @ 2026-06-13 13:17 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: netdev, Daniel Gröber, Antonio Quartulli, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	linux-kernel
In-Reply-To: <87y0gm8x5k.fsf@toke.dk>

On Wed, 10 Jun 2026 13:14:47 +0200, Toke Høiland-Jørgensen <toke@kernel.org> wrote:
> Ralf Lici <ralf@mandelbit.com> writes:
>
> > Hi Toke,
> >
> > On Thu, 04 Jun 2026 20:23:51 +0200, Toke Høiland-Jørgensen <toke@kernel.org> wrote:
> >> Ralf Lici <ralf@mandelbit.com> writes:
> >>
> >> > This commit introduces the core start_xmit processing flow: validate,
> >> > select action, translate, and forward. It centralizes action resolution
> >> > in the dispatch layer and keeps per-direction translation logic separate
> >> > from device glue. The result is a single data-path entry point with
> >> > explicit control over drop/forward/emit behavior.
> >> >
> >> > Signed-off-by: Ralf Lici <ralf@mandelbit.com>
> >>
> >> This is very cool! Going quickly through the series, this seems like
> >> thorough work that will be cool to have available in the kernel, so
> >> thanks for doing this! I'll be quite happy to retire my barebones
> >> BPF-based implementation once this lands :)
> >>
> >
> > Thanks, glad to hear this looks useful. I have not had much time to work
> > on ipxlat lately, but I hope to respin the RFC soon.
> >
> >> One comment on the device model below (which is also why I chose this
> >> patch to reply to):
> >>
> >> > +static void ipxlat_forward_pkt(struct ipxlat_priv *ipxlat, struct sk_buff *skb)
> >> > +{
> >> > +	const unsigned int len = skb->len;
> >> > +	int err;
> >> > +
> >> > +	/* reinject as a fresh packet with scrubbed metadata */
> >> > +	skb_set_queue_mapping(skb, 0);
> >> > +	skb_scrub_packet(skb, false);
> >> > +
> >> > +	err = gro_cells_receive(&ipxlat->gro_cells, skb);
> >>
> >> So given that you're not resetting skb->dev here, IIUC, this means that
> >> the translated packet will magically re-appear as if it arrived on the
> >> interface it first came in on, right?
> >>
> >> That seems... a bit too magical? Sending a packet to one device making
> >> it suddenly reappear on a different, unrelated, device seems like it
> >> will just create confusion. It's like the ipxlat device can't really
> >> decide if it's a device or a tunnel? :)
> >>
> >
> > That's not quite what happens in the routed xmit path. There the stack
> > sets skb->dev to the selected output device before handing the skb to
> > the device. For IPv4 and IPv6 this happens in ip_output/ip6_output,
> > where the output device is taken from the skb dst. So when the route
> > selects the ipxlat device, the skb reaches ndo_start_xmit with skb->dev
> > already pointing at the ipxlat device, not at the original ingress
> > device.
> >
> > The internal 4-to-6 pre-fragmentation path should preserve the same
> > property as well: ip_do_fragment copies the skb metadata to the
> > generated fragments, including skb->dev, and the temporary dst used for
> > that path also points at the ipxlat device. The fragment callback then
> > feeds those fragments back into the same ipxlat processing path.
> >
> > That said, I agree that relying on this implicitly is not great.
> > gro_cells_receive uses skb->dev directly, and the intended receive-side
> > re-injection model should be obvious at the call site. I will set
> > skb->dev = ipxlat->dev explicitly before gro_cells_receive in the next
> > version.
>
> Right, sounds good. I'm also wondering if you actually need the gro_cells
> infrastructure at all? IIUC, the purpose of that is to allow tunnels to
> create GRO superframes of packets after they are decapsulated (and thus
> their l4 commonality becomes apparent). But you're not decapsulating
> anything, you're just translating between protocols the kernel already
> understands. So presumably any opportunity to coalesce GRO packets would
> already have happened pre-translation? So any reason why you can't just
> do what loopback.c does, and do a straight __netif_rx() call in the
> transmit function?
>

No, I think you're right that gro_cells is not justified here, I was
probably biased by my work on tunnel interfaces. Unlike a tunnel decap
path, ipxlat does not reveal a new same-family L4 flow after
decapsulation, so I don't see a translation-specific GRO opportunity
there, and a loopback-style receive handoff would be the simpler version
of that design.

That said, after thinking more about the rest of your feedback, I think
the right fix is probably not just replacing gro_cells with __netif_rx.
The deeper issue is the netdevice/RX-reinjection model itself.

> >> I think a better model is to treat the device as basically a loopback
> >> device that translates packets before looping them back (so when they
> >> come back they appear to be coming from that device).
> >>
> >> Any reason why that wouldn't work?
> >>
> >
> > That's indeed the intended model for the ipxlat netdevice: route packets
> > to it, translate them, then loop them back into the stack as packets
> > received from that same device. That seemed like the simplest model and
> > the one that exposes the translation point most clearly.
>
> Right. I think this could be made a bit more explicit in the
> documentation as well, since it's a bit of an unusual model.
>
> And, well, taking a step back: is it really the right model? Regular NAT
> lives in netfilter, why can't this be a netfilter module as well? Seems
> to me you could have something like:
>
> table ip xlat4 {
> 	chain postrouting {
> 		type nat hook postrouting priority srcnat; policy accept;
> 		ip daddr 0.0.0.0/0 oifname "eth0" xlat to 64:ff9b::/96
> 	}
> }
> table ip6 xlat6 {
> 	chain prerouting {
> 		type nat hook prerouting priority dstnat; policy accept;
> 		ip6 saddr 64:ff9b::/96 iifname "eth0" xlat from 64:ff9b::/96
> 	}
> }
>
> and that would provide the functionality without having to implement a
> new interface type and the associated multiple traversals through the
> stack? Did you consider this as an alternative to the new device type?
>

We did consider netfilter, and your example is syntactically attractive,
but I am no longer convinced it is the cleanest model for SIIT.

An nft expression cannot simply rewrite ETH_P_IP <-> ETH_P_IPV6 and
return ACCEPT as if this were normal NAT because the current hook
invocation, dst, and conntrack-related state were established for the
packet as it entered that hook. A cross-family translator would need to
consume the skb, clear or rebuild route and ct metadata as appropriate,
do an other-family route lookup, and resume at a well-defined point in
that family. That seems possible, but it would be a new stateless
cross-family action, not just a new mode of the existing nft nat
expression (which is built around nf_nat_setup_info and assumes the
packet's L3 family does not change AFAICT).

My second concern is that the SIIT boundary would be a property of rule
and hook placement. That gives flexibility, but it also means the
translation point has to be constrained and documented very carefully to
avoid ambiguous TTL/Hop Limit, PMTU/ICMP, and hook-order behavior. For
this use case I would rather have the route that matches the translation
prefix also be the object that says: leave this family here and continue
in the other one.

After looking at the available kernel mechanisms again, I think the
better model is probably LWT: routes carry an ipxlat encap referencing a
named translator domain configured over netlink. That should represent
the stateless, prefix-based and symmetric nature of ipxlat.

Very roughly, userspace could look like:

    ip xlat add siit0 prefix6 64:ff9b::/96
    ip route add ... encap ipxlat id siit0
    ip -6 route add ... encap ipxlat id siit0

There are some useful precedents for this: ILA is stateless address
translation as LWT, seg6_local already has cross-family LWT actions, and
ioam6 has a similar split between separately configured objects and
route attachments.

The invariant I would like v2 to follow is that the original-family
route lookup selects translation as its terminal route action. The
translated skb then gets a fresh lookup in the other family. From that
point on, TTL/Hop Limit where applicable, PMTU, ICMP errors, and
netfilter visibility belong to the translated family.

So I think your question addresses the core design issue in this RFC. My
current preference is to rework the next version around an LWT/domain
model instead of the virtual netdevice model, unless prototyping shows a
fundamental problem with that approach.

Does that model make sense to you?

Thanks for pushing on this.

-- 
Ralf Lici
Mandelbit Srl

^ permalink raw reply

* Re: [PATCH] r8152: add vendor/device ID for CoreChips SR9900
From: Nicolai Buchwitz @ 2026-06-13 12:52 UTC (permalink / raw)
  To: zjzhao, hayeswang
  Cc: andrew+netdev, linux-usb, netdev, linux-kernel, zjzhao-eda
In-Reply-To: <20260613090154.1975753-1-zjzhao@edatec.cn>

Hi

On June 13, 2026 11:01:54 AM GMT+02:00, zjzhao@edatec.cn wrote:
>From: zjzhao-eda <zjzhao@edatec.cn>
>
>The CoreChips SR9900 (0x0fe6:0x9900) is a USB 2.0 10/100
>Ethernet adapter. Testing shows it works correctly with the
>r8152 driver, reaching wire speed (94 Mbps) with zero packet
>loss on both TCP and UDP.
>
>Tested on Raspberry Pi, including hotplug and extended data
>transfer.
>
>Signed-off-by: zjzhao-eda <zjzhao@edatec.cn>

AFAIK the DCO must contain a full name and not just an alias

>---
> drivers/net/usb/r8152.c | 1 +
> 1 file changed, 1 insertion(+)
>
>diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
>index d61074178279..ea1733e3619c 100644
>--- a/drivers/net/usb/r8152.c
>+++ b/drivers/net/usb/r8152.c
>@@ -10062,6 +10062,7 @@ static const struct usb_device_id rtl8152_table[] = {
> 	{ USB_DEVICE(VENDOR_ID_DELL,    0xb097) },
> 	{ USB_DEVICE(VENDOR_ID_ASUS,    0x1976) },
> 	{ USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02b) },
>+	{ USB_DEVICE(0x0fe6, 0x9900) },
> 	{}
> };
> 

Also please indicate the target tree for your patch in the subject (e.g. net-next). For further details, have a look at the netdev FAQ [1].

[1] https://www.kernel.org/doc/html/v6.1/process/maintainer-netdev.html

Thanks 
Nicolai

^ permalink raw reply

* Re: [PATCH] nbd: Reclassify sockets to avoid lockdep circular dependency
From: Jens Axboe @ 2026-06-13 12:34 UTC (permalink / raw)
  To: Josef Bacik, Eric Dumazet
  Cc: linux-kernel, linux-block, nbd, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Kuniyuki Iwashima, netdev,
	syzbot+607cdcf978b3e79da878
In-Reply-To: <20260613042619.1108126-1-edumazet@google.com>


On Sat, 13 Jun 2026 04:26:19 +0000, Eric Dumazet wrote:
> syzbot reported a possible circular locking dependency in udp_sendmsg()
> where fs_reclaim can be triggered while holding sk_lock, and fs_reclaim
> can eventually depend on another sk_lock (e.g., if NBD is used for swap
> or writeback and NBD uses TLS/TCP which acquires sk_lock).
> 
> Since the UDP socket and the NBD TCP/TLS socket are different, this is a
> false positive. Fix this by reclassifying NBD sockets to a separate lock
> class when they are added to the NBD device.
> 
> [...]

Applied, thanks!

[1/1] nbd: Reclassify sockets to avoid lockdep circular dependency
      commit: d532cddb6c6049ced414d64d83c6ce7149a6421a

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH net-next v2 1/2] virtio_net: xsk: fix race in rx wake up
From: Menglong Dong @ 2026-06-13 12:26 UTC (permalink / raw)
  To: menglong8.dong, xuanzhuo, eperezma, Bui Quang Minh
  Cc: mst, jasowang, andrew+netdev, davem, edumazet, kuba, pabeni,
	kerneljasonxing, netdev, virtualization, linux-kernel
In-Reply-To: <41eefa1d-99bf-450d-988e-7dec67c6b61e@gmail.com>

On 2026/6/12 00:24, Bui Quang Minh wrote:
> On 6/11/26 09:56, menglong8.dong@gmail.com wrote:
> > From: Menglong Dong <dongml2@chinatelecom.cn>
> >
> > During packet receiving in virtio-net, the rq can be empty, which means
> > "rq->vq->num_free == virtqueue_get_vring_size(rq->vq)", in
> > virtnet_add_recvbuf_xsk(), if we are using xsk. Meanwhile, the fill ring
> > can be empty too, which means we can't allocate anything from
> > xsk_buff_alloc_batch(). Then, we will set the XDP_RING_NEED_WAKEUP flag.
> >
> > However, if the user cleans all the data in the rx ring, fills the
> > "fill ring" and checks the XDP_RING_NEED_WAKEUP flag after
> > xsk_buff_alloc_batch() and before xsk_set_rx_need_wakeup(), then the rx
> > napi will never be scheduled: the rx ring is empty, which means we will
> > never receive a packet to trigger the further recv fill. The rx ring is
> > empty now, so the user will not check the flag either.
> >
> > Fix this by setting the XDP_RING_NEED_WAKEUP flag before
> > xsk_buff_alloc_batch() if both rq->vq and fill ring are empty.
> >
> > Meanwhile, set the XDP_RING_NEED_WAKEUP flag if we have any free entry in
> > rq->vq.
> >
> > Fixes: e3f8800aa243 ("virtio-net: xsk: Support wakeup on RX side")
> > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> > ---
> >   drivers/net/virtio_net.c | 25 ++++++++++++++++++++++---
> >   1 file changed, 22 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index f4adcfee7a80..4b5b3fa62008 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -1323,16 +1323,27 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
> >   				   struct xsk_buff_pool *pool, gfp_t gfp)
> >   {
> >   	struct xdp_buff **xsk_buffs;
> > +	bool need_wakeup;
> >   	dma_addr_t addr;
> >   	int err = 0;
> >   	u32 len, i;
> >   	int num;
> >   
> > +	need_wakeup = xsk_uses_need_wakeup(pool);
> >   	xsk_buffs = rq->xsk_buffs;
> >   
> > +	/* If both rq->vq and fill ring are empty, and then the user submit
> > +	 * all the chunks to the fill ring and check the wake up flag
> > +	 * after xsk_buff_alloc_batch() and before xsk_set_rx_need_wakeup(),
> > +	 * we will lose the chance to wake up the rx napi, so we have to
> > +	 * set the need_wakeup flag here.
> > +	 */
> > +	if (need_wakeup && virtqueue_get_vring_size(rq->vq) == rq->vq->num_free)
> > +		xsk_set_rx_need_wakeup(pool);
> 

Hi, Bui Quang. Thanks for your reply. I spent some time learning
what you said.

> I think when polling the receive queue, the userspace program needs to 
> check the XDP_RING_NEED_WAKEUP flag if it does not see any packets. The 
> flag check is quite lightweight in my opinion. Here are some examples I find
> 
> - 
> https://github.com/xdp-project/xdp-tools/blob/e9469501622aa22a7e452a671000bec8685edcde/lib/util/xdpsock.c#L1206

You are right, I was overly concerned about this point. My original
concern was that we can't wake up from the poll syscall in this case:

The umem has 2000 chunks. In the beginning, the xsk->fill_ring
is filled with 2000 chunks, and then the user falls asleep and doesn't
do anything.

Kernel: the 2000th packet is received
Kernel: xsk_buff_alloc_batch return 0(xsk->fill_ring is empty and xsk->rx_ring is full)

        User: handle the xsk->rx_ring
        User: fill the xsk->fill_ring with 2000 chunks
        User: check the wake up flag
        User: no need_wakeup flag, fall asleep with poll() syscall

Kernel: call xsk_set_rx_need_wakeup()
Kernel: virtio-net rx ringbuf is empty, we can't receive any further packet
Kernel: to call virtnet_add_recvbuf_xsk(), we are dead

But then, I found that we can still be woken up from the poll syscall
by the 2000th packet, which means that the case where neither the NAPI
nor the user can be woken up doesn't exist.

> - 
> https://github.com/xdp-project/bpf-examples/blob/43e565901c4287efa863edca7f0e6cd6e35ed896/AF_XDP-forwarding/xsk_fwd.c#L540
> 
> Furthermore, the XDP_RING_NEED_WAKEUP flag related functions does not 
> provide any memory orderings. So even with your patch, I'm worried that 
> this case is possible
> 
> kernel                                  userspace
>
> xsk_buff_alloc_batch -> failed
>                                         submit fill ring
>                                         flag != XDP_RING_NEED_WAKEUP
> // reordering due to lack of memory orderings
> xsk_set_rx_need_wakeup
> 
> I'm not expert here, so correct me if I'm wrong. I think the wake up 
> flag is designed with no orderings so we cannot rely on it to reason and 
> skip further checks.
> 
> > +
> >   	num = xsk_buff_alloc_batch(pool, xsk_buffs, rq->vq->num_free);
[....]
> > +
> 
> Why do we need to set XDP_RING_NEED_WAKEUP even when 
> xsk_buff_alloc_batch succeeds?

Ah, don't mind here. I just thought that if xsk_buff_alloc_batch()
didn't allocate enough chunks as we need, we can wake up
the NAPI as soon as possible, in case that the virtio-net ringbuf
is full and cause packet dropping :)

Anyway, I'll remove the first patch, and send the second patch
only in the V3.

Thanks!
Menglong Dong

> 
> >   	return num;
> >   
> >   err:
> 
> Thanks,
> Quang Minh.
> 
> 
> 
> 





^ permalink raw reply

* [PATCH v3] flow_dissector: fix uninit-value in __skb_flow_dissect() for ETH_ADDRS
From: Yun Zhou @ 2026-06-13 11:31 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, horms, qingfang.deng, jiri
  Cc: netdev, linux-kernel, yun.zhou

__skb_flow_dissect() unconditionally reads 12 bytes from eth_hdr(skb)
when FLOW_DISSECTOR_KEY_ETH_ADDRS is requested. This assumes the skb
has a valid Ethernet header at mac_header, which is not always the case.

The problem can be triggered by:
 1. Creating a TUN device in L3 mode (IFF_TUN, hard_header_len=0)
 2. Attaching a multiq qdisc with a flower filter matching on eth_src
 3. Sending a packet through AF_PACKET

Since TUN in L3 mode has no link-layer header, mac_header points to
the L3 data area. The flow dissector reads 12 bytes of uninitialized
skb memory, which then propagates through fl_set_masked_key() and is
used as a rhashtable lookup key in __fl_lookup(), as reported by KMSAN.

Rejecting the filter in the control path (at tc filter add time) is
not feasible because TC filter blocks can be shared between arbitrary
devices -- a filter installed on an Ethernet device may later classify
packets on a headerless device through a shared block. The device
association is not fixed at filter creation time.

Fix this in the data path by checking skb->dev->hard_header_len before
reading. If the device does not have a link-layer header large enough
to contain the Ethernet addresses, zero the key so the filter will not
match.

Reported-by: syzbot+fa2f5b1fb06147be5e16@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=fa2f5b1fb06147be5e16
Fixes: 67a900cc0436 ("flow_dissector: introduce support for Ethernet addresses")
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
v3: Replace skb_tail_pointer() - skb_mac_header() length check with
    skb->dev->hard_header_len check.

v2: Adjust commit message and comment.

 net/core/flow_dissector.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 2a98f5fa74eb..0b235ec0743f 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1173,13 +1173,20 @@ bool __skb_flow_dissect(const struct net *net,
 
 	if (dissector_uses_key(flow_dissector,
 			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
-		struct ethhdr *eth = eth_hdr(skb);
 		struct flow_dissector_key_eth_addrs *key_eth_addrs;
 
 		key_eth_addrs = skb_flow_dissector_target(flow_dissector,
 							  FLOW_DISSECTOR_KEY_ETH_ADDRS,
 							  target_container);
-		memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs));
+		/* TC filter blocks can be shared across devices with
+		 * different header lengths, so we cannot validate this
+		 * when the filter is installed -- check at dissect time.
+		 */
+		if (skb->dev &&
+		    skb->dev->hard_header_len >= sizeof(*key_eth_addrs))
+			memcpy(key_eth_addrs, eth_hdr(skb), sizeof(*key_eth_addrs));
+		else
+			memset(key_eth_addrs, 0, sizeof(*key_eth_addrs));
 	}
 
 	if (dissector_uses_key(flow_dissector,
-- 
2.43.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox