From: Mohsin Bashir <mohsin.bashr@gmail.com>
To: netdev@vger.kernel.org
Cc: andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
kuba@kernel.org, pabeni@redhat.com, shuah@kernel.org,
linux-kselftest@vger.kernel.org
Subject: [PATCH net-next V2] selftests: drv-net: Test queue stall upon reconfig
Date: Fri, 12 Jun 2026 18:48:54 -0700 [thread overview]
Message-ID: <20260613014855.1717712-1-mohsin.bashr@gmail.com> (raw)
From: Mohsin Bashir <hmohsin@meta.com>
Add a reconfig_tx_stall test that detects the possibility of a TX stall
after ring reconfiguration. The key observation is that drivers using
netif_tx_start_all_queues() are prone to experiencing a stall when
reconfiguration completes compared to drivers using
netif_tx_wake_all_queues(). start_all_queues only clears DRV_XOFF, while
wake_all_queues also calls __netif_schedule() to kick the qdisc. Without
the kick, qdisc backlog present at reconfig time can stay stuck until a
new trigger is issued.
The test caps the TX ring at 64 entries so it fills quickly, then
installs FQ on a target TX queue and sends UDP packets with SO_TXTIME
scheduled in the future. With napi_defer_hard_irqs slowing completions,
the small ring can fill when FQ releases the burst, leaving requeued
qdisc backlog with no FQ timer to rescue it. A subsequent ring reconfig
must wake the queues to drain the backlog. Simply starting the queues can
leave it stuck.
On host with problematic driver:
Sent 128 SO_TXTIME packets (+100ms)
Sent 128 SO_TXTIME packets (+200ms)
Backlog before reconfig: 52632 bytes
Check| At /root/ksft-net-drv/./drivers/net/ring_reconfig.py, ...
Check| ksft_eq(0, backlog,
Check failed 0 != 52632 qdisc backlog stuck on queue 1 after ring reconfig
not ok 3 ring_reconfig.reconfig_tx_stall
On host with fixed driver:
Sent 128 SO_TXTIME packets (+100ms)
Sent 128 SO_TXTIME packets (+200ms)
Backlog before reconfig: 76024 bytes
ok 3 ring_reconfig.reconfig_tx_stall
Signed-off-by: Mohsin Bashir <hmohsin@meta.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
Changelog:
V2:
- Reoreder config file changes to handle check_selftest failure
- Chain exception with from exc in _create_sotxtime_socket
- Use getattr to get arch-specific value for SO_TXTIME, fallback to 61
V1: https://lore.kernel.org/netdev/20260612001754.2489868-1-mohsin.bashr@gmail.com
---
tools/testing/selftests/drivers/net/config | 4 +
.../selftests/drivers/net/ring_reconfig.py | 176 +++++++++++++++++-
2 files changed, 178 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config
index 617de8aaf551..1ef07fae74c1 100644
--- a/tools/testing/selftests/drivers/net/config
+++ b/tools/testing/selftests/drivers/net/config
@@ -4,6 +4,10 @@ CONFIG_DEBUG_INFO_BTF_MODULES=n
CONFIG_INET_PSP=y
CONFIG_IPV6=y
CONFIG_MACSEC=m
+CONFIG_NET_ACT_SKBEDIT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
CONFIG_NETCONSOLE_EXTENDED_LOG=y
diff --git a/tools/testing/selftests/drivers/net/ring_reconfig.py b/tools/testing/selftests/drivers/net/ring_reconfig.py
index f9530a8b0856..11491a0b7013 100755
--- a/tools/testing/selftests/drivers/net/ring_reconfig.py
+++ b/tools/testing/selftests/drivers/net/ring_reconfig.py
@@ -5,10 +5,18 @@
Test channel and ring size configuration via ethtool (-L / -G).
"""
+import socket
+import struct
+import time
+
from lib.py import ksft_run, ksft_exit, ksft_pr
from lib.py import ksft_eq
+from lib.py import KsftSkipEx
from lib.py import NetDrvEpEnv, EthtoolFamily, GenerateTraffic
-from lib.py import defer, NlError
+from lib.py import cmd, defer, rand_port, tc, NlError
+
+# Added in Python 3.13; fallback to 61 for x86/ARM/MIPS
+SO_TXTIME = getattr(socket, "SO_TXTIME", 61)
def channels(cfg) -> None:
@@ -151,6 +159,169 @@ def ringparam(cfg) -> None:
GenerateTraffic(cfg).wait_pkts_and_stop(10000)
+def _write_sysfs(cfg, path, val):
+ with open(path, "r", encoding="utf-8") as fp:
+ orig_val = fp.read().strip()
+ if str(val) == orig_val:
+ return
+ with open(path, "w", encoding="utf-8") as fp:
+ fp.write(str(val))
+ defer(lambda p=path, v=orig_val: open(p, "w").write(v))
+
+
+def _get_mq_handle(cfg):
+ qdiscs = tc(f"qdisc show dev {cfg.ifname}", json=True)
+ for q in qdiscs:
+ if q.get("kind") == "mq":
+ return q["handle"]
+ raise KsftSkipEx(f"no mq qdisc found on {cfg.ifname}")
+
+
+def _get_qdisc_backlog(cfg, queue, mq_handle):
+ qdiscs = tc(f"-s qdisc show dev {cfg.ifname}", json=True)
+ target_parent = f"{mq_handle}{queue + 1:x}"
+ for q in qdiscs:
+ if q.get("parent", "") == target_parent:
+ return q.get("backlog")
+ return None
+
+
+def _setup_fq_qdisc(cfg, mq_handle, port, target_queue, other_queue):
+ mq_child_parent = f"{mq_handle}{target_queue + 1:x}"
+
+ # Save the original child qdisc to restore after test
+ qdiscs = tc(f"qdisc show dev {cfg.ifname}", json=True)
+ default_qdisc = cmd("sysctl -n net.core.default_qdisc").stdout.strip()
+ orig_kind = default_qdisc
+ for q in qdiscs:
+ if q.get("parent", "") == mq_child_parent:
+ orig_kind = q.get("kind", default_qdisc)
+ break
+ try:
+ tc(f"qdisc replace dev {cfg.ifname} parent {mq_child_parent} fq")
+ except Exception as exc:
+ raise KsftSkipEx("fq not available (CONFIG_NET_SCH_FQ)") from exc
+ defer(tc,
+ f"qdisc replace dev {cfg.ifname} parent {mq_child_parent} {orig_kind}")
+
+ qdisc_j = tc(f"qdisc show dev {cfg.ifname}", json=True)
+ has_clsact = any(q['kind'] == 'clsact' for q in qdisc_j)
+ if not has_clsact:
+ tc(f"qdisc add dev {cfg.ifname} clsact")
+ defer(tc, f"qdisc del dev {cfg.ifname} clsact")
+
+ proto = "ipv6" if int(cfg.addr_ipver) == 6 else "ip"
+ try:
+ tc(f"filter add dev {cfg.ifname} egress protocol {proto} "
+ f"pref 1 flower ip_proto udp dst_port {port} "
+ f"action skbedit queue_mapping {target_queue}")
+ except Exception as exc:
+ raise KsftSkipEx("tc flower/act_skbedit not available") from exc
+ defer(tc, f"filter del dev {cfg.ifname} egress pref 1")
+
+ tc(f"filter add dev {cfg.ifname} egress pref 100 "
+ f"matchall action skbedit queue_mapping {other_queue}")
+ defer(tc, f"filter del dev {cfg.ifname} egress pref 100")
+
+
+def _create_sotxtime_socket(cfg):
+ sock = socket.socket(socket.AF_INET6 if cfg.addr_ipver == "6"
+ else socket.AF_INET, socket.SOCK_DGRAM)
+ try:
+ sock.setsockopt(socket.SOL_SOCKET, SO_TXTIME, struct.pack("Ii", 1, 0))
+ except OSError as exc:
+ sock.close()
+ raise KsftSkipEx("SO_TXTIME not supported") from exc
+ sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE,
+ cfg.ifname.encode())
+ return sock
+
+
+def _send_sotxtime_burst(sock, addr, port, count, delay_ns, ipver):
+ payload = b'\x00' * 1400
+ txtime_ns = time.clock_gettime_ns(time.CLOCK_MONOTONIC) + delay_ns
+
+ ancdata = [(socket.SOL_SOCKET, SO_TXTIME, struct.pack("Q", txtime_ns))]
+ if int(ipver) == 6:
+ dest = (addr, port, 0, 0)
+ else:
+ dest = (addr, port)
+ for _ in range(count):
+ sock.sendmsg([payload], ancdata, 0, dest)
+
+
+def reconfig_tx_stall(cfg) -> None:
+ target_queue = 1
+ other_queue = 0
+
+ ehdr = {'header': {'dev-index': cfg.ifindex}}
+ chans = cfg.eth.channels_get(ehdr)
+
+ if 'combined-max' not in chans:
+ raise KsftSkipEx("device does not support combined channels")
+ if chans['combined-count'] < 2:
+ raise KsftSkipEx("need at least 2 combined channels")
+
+ rings = cfg.eth.rings_get(ehdr)
+ if 'rx' not in rings or 'tx' not in rings:
+ raise KsftSkipEx("device does not expose rx/tx ring params")
+ tx_cur = rings['tx']
+ if tx_cur <= 64:
+ raise KsftSkipEx("tx ring size already at minimum")
+ defer(cfg.eth.rings_set, ehdr | {'tx': tx_cur})
+
+ tx_min = 64
+ cfg.eth.rings_set(ehdr | {'tx': tx_min})
+
+ # Slow completions so the ring stays full after FQ releases packets
+ napi_defer = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs"
+ gro_timeout = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout"
+ _write_sysfs(cfg, napi_defer, 100)
+ _write_sysfs(cfg, gro_timeout, 1000000000)
+
+ mq_handle = _get_mq_handle(cfg)
+ port = rand_port()
+ _setup_fq_qdisc(cfg, mq_handle, port, target_queue, other_queue)
+
+ sock = _create_sotxtime_socket(cfg)
+ defer(sock.close)
+
+ pkt_count = tx_min * 2
+
+ for delay_ms in [100, 200, 500]:
+ delay_ns = delay_ms * 1_000_000
+ _send_sotxtime_burst(sock, cfg.remote_addr, port, pkt_count,
+ delay_ns, cfg.addr_ipver)
+ ksft_pr(f"Sent {pkt_count} SO_TXTIME packets (+{delay_ms}ms)")
+ time.sleep(delay_ms / 1000 + 0.3)
+
+ backlog = _get_qdisc_backlog(cfg, target_queue, mq_handle)
+ if backlog:
+ break
+ else:
+ raise KsftSkipEx("failed to build qdisc backlog")
+
+ ksft_pr(f"Backlog before reconfig: {backlog} bytes")
+
+ # Trigger ring reconfig — driver should call wake, not just start
+ cfg.eth.rings_set(ehdr | {'tx': tx_cur})
+
+ # Let completions proceed normally
+ _write_sysfs(cfg, napi_defer, 0)
+ _write_sysfs(cfg, gro_timeout, 0)
+
+ # Poll for backlog to drain
+ for _ in range(100):
+ backlog = _get_qdisc_backlog(cfg, target_queue, mq_handle)
+ if not backlog:
+ break
+ time.sleep(0.1)
+
+ ksft_eq(0, backlog,
+ comment=f"qdisc backlog stuck on queue {target_queue} "
+ f"after ring reconfig")
+
+
def main() -> None:
""" Ksft boiler plate main """
@@ -158,7 +329,8 @@ def main() -> None:
cfg.eth = EthtoolFamily()
ksft_run([channels,
- ringparam],
+ ringparam,
+ reconfig_tx_stall],
args=(cfg, ))
ksft_exit()
--
2.52.0
next reply other threads:[~2026-06-13 1:49 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-13 1:48 Mohsin Bashir [this message]
2026-06-13 16:20 ` [PATCH net-next V2] selftests: drv-net: Test queue stall upon reconfig Jakub Kicinski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260613014855.1717712-1-mohsin.bashr@gmail.com \
--to=mohsin.bashr@gmail.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=shuah@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.