Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Add a test-only TUN ioctl that inflates RX skb->truesize, plus the
packetdrill-side helper needed to drive that ioctl through packetdrill's
own TUN queue file descriptor.

Use that plumbing to cover the receive-window regressions where
scaling_ratio drifts after advertisement, alongside the baseline too-big
packetdrill cases that exercise the same sender-visible rwnd accounting
from the non-injected path.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 drivers/net/tun.c                             |  65 ++++++++
 include/uapi/linux/if_tun.h                   |   4 +
 .../tcp_rcv_neg_window_truesize.pkt           | 143 ++++++++++++++++++
 .../net/packetdrill/tcp_rcv_toobig.pkt        |  35 +++++
 .../packetdrill/tcp_rcv_toobig_default.pkt    |  97 ++++++++++++
 .../tcp_rcv_toobig_default_truesize.pkt       | 118 +++++++++++++++
 .../tcp_rcv_wnd_shrink_allowed_truesize.pkt   |  49 ++++++
 tools/testing/selftests/net/tun.c             | 140 ++++++++++++++++-
 8 files changed, 650 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c492fda6fc15..2cef62cebe88 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -53,6 +53,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_tun.h>
 #include <linux/if_vlan.h>
+#include <linux/overflow.h>
 #include <linux/crc32.h>
 #include <linux/math.h>
 #include <linux/nsproxy.h>
@@ -85,8 +86,13 @@
 
 #include "tun_vnet.h"
 
+struct tun_file;
+
+#define TUNSETTRUESIZE_OLD _IOW('T', 228, unsigned int)
+
 static void tun_default_link_ksettings(struct net_device *dev,
 				       struct ethtool_link_ksettings *cmd);
+static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb);
 
 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
@@ -138,6 +144,7 @@ struct tun_file {
 		u16 queue_index;
 		unsigned int ifindex;
 	};
+	u32 rx_extra_truesize;
 	struct napi_struct napi;
 	bool napi_enabled;
 	bool napi_frags_enabled;
@@ -1817,6 +1824,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		goto free_skb;
 	}
 
+	tun_rx_update_truesize(tfile, skb);
 	switch (tun->flags & TUN_TYPE_MASK) {
 	case IFF_TUN:
 		if (tun->flags & IFF_NO_PI) {
@@ -2373,6 +2381,25 @@ static void tun_put_page(struct tun_page *tpage)
 		__page_frag_cache_drain(tpage->page, tpage->count);
 }
 
+/* Tests can inflate skb->truesize on ingress to exercise receive-memory
+ * accounting against a scaling_ratio that drifts after a window was
+ * advertised. The knob is per queue file, defaults to zero, and only changes
+ * behavior when explicitly enabled through the TUN fd.
+ */
+static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb)
+{
+	u32 extra = READ_ONCE(tfile->rx_extra_truesize);
+	unsigned int truesize;
+
+	if (!extra)
+		return;
+
+	if (check_add_overflow(skb->truesize, extra, &truesize))
+		truesize = UINT_MAX;
+
+	skb->truesize = truesize;
+}
+
 static int tun_xdp_one(struct tun_struct *tun,
 		       struct tun_file *tfile,
 		       struct xdp_buff *xdp, int *flush,
@@ -2459,6 +2486,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 		goto out;
 	}
 
+	tun_rx_update_truesize(tfile, skb);
 	skb->protocol = eth_type_trans(skb, tun->dev);
 	skb_reset_network_header(skb);
 	skb_probe_transport_header(skb);
@@ -3045,6 +3073,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	struct tun_struct *tun;
 	void __user* argp = (void __user*)arg;
 	unsigned int carrier;
+	unsigned int extra_truesize;
 	struct ifreq ifr;
 	kuid_t owner;
 	kgid_t group;
@@ -3309,6 +3338,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = tun_net_change_carrier(tun->dev, (bool)carrier);
 		break;
 
+	/* Support both the legacy pointer-payload form and the scalar form
+	 * used by the selftest helper when injecting truesize from
+	 * packetdrill shell commands.
+	 */
+	case TUNSETTRUESIZE:
+	case TUNSETTRUESIZE_OLD:
+		ret = -EPERM;
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			goto unlock;
+
+		if (cmd == TUNSETTRUESIZE_OLD) {
+			ret = -EFAULT;
+			if (copy_from_user(&extra_truesize, argp,
+					   sizeof(extra_truesize))) {
+				ret = -EINVAL;
+				if (arg > U32_MAX)
+					goto unlock;
+
+				extra_truesize = arg;
+			}
+		} else {
+			ret = -EINVAL;
+			if (arg > U32_MAX)
+				goto unlock;
+
+			extra_truesize = arg;
+		}
+
+		WRITE_ONCE(tfile->rx_extra_truesize, extra_truesize);
+		netif_info(tun, drv, tun->dev,
+			   "rx extra truesize set to %u\n", extra_truesize);
+		ret = 0;
+		break;
+
 	case TUNGETDEVNETNS:
 		ret = -EPERM;
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -3348,6 +3411,7 @@ static long tun_chr_compat_ioctl(struct file *file,
 	case TUNGETSNDBUF:
 	case TUNSETSNDBUF:
 	case SIOCGIFHWADDR:
+	case TUNSETTRUESIZE_OLD:
 	case SIOCSIFHWADDR:
 		arg = (unsigned long)compat_ptr(arg);
 		break;
@@ -3408,6 +3472,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
+	tfile->rx_extra_truesize = 0;
 
 	init_waitqueue_head(&tfile->socket.wq.wait);
 
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 79d53c7a1ebd..4be63efe6540 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,10 @@
 #define TUNSETFILTEREBPF _IOR('T', 225, int)
 #define TUNSETCARRIER _IOW('T', 226, int)
 #define TUNGETDEVNETNS _IO('T', 227)
+/* Test-only: add scalar bytes to skb->truesize on RX after TUN allocates
+ * an skb.
+ */
+#define TUNSETTRUESIZE _IO('T', 228)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
new file mode 100644
index 000000000000..1c5550fff509
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+// Run the negative-window / max-advertised-window regression with inflated
+// TUN skb->truesize so scaling_ratio drifts throughout the flow. The sequence
+// checks and drop counters should remain identical to the uninflated case.
+
+--mss=1000
+
+`./defaults.sh`
+
+    0 `nstat -n`
+
+// Establish a connection.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1000000], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 4>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Put 1040000 bytes into the receive buffer.
+   +0 < P. 1:65001(65000) ack 1 win 257
+    * > .  1:1(0) ack 65001
+   +0 < P. 65001:130001(65000) ack 1 win 257
+    * > .  1:1(0) ack 130001
+   +0 < P. 130001:195001(65000) ack 1 win 257
+    * > .  1:1(0) ack 195001
+   +0 < P. 195001:260001(65000) ack 1 win 257
+    * > .  1:1(0) ack 260001
+   +0 < P. 260001:325001(65000) ack 1 win 257
+    * > .  1:1(0) ack 325001
+   +0 < P. 325001:390001(65000) ack 1 win 257
+    * > .  1:1(0) ack 390001
+   +0 < P. 390001:455001(65000) ack 1 win 257
+    * > .  1:1(0) ack 455001
+   +0 < P. 455001:520001(65000) ack 1 win 257
+    * > .  1:1(0) ack 520001
+   +0 < P. 520001:585001(65000) ack 1 win 257
+    * > .  1:1(0) ack 585001
+   +0 < P. 585001:650001(65000) ack 1 win 257
+    * > .  1:1(0) ack 650001
+   +0 < P. 650001:715001(65000) ack 1 win 257
+    * > .  1:1(0) ack 715001
+   +0 < P. 715001:780001(65000) ack 1 win 257
+    * > .  1:1(0) ack 780001
+   +0 < P. 780001:845001(65000) ack 1 win 257
+    * > .  1:1(0) ack 845001
+   +0 < P. 845001:910001(65000) ack 1 win 257
+    * > .  1:1(0) ack 910001
+   +0 < P. 910001:975001(65000) ack 1 win 257
+    * > .  1:1(0) ack 975001
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+    * > .  1:1(0) ack 1040001
+
+// Start inflating future TUN skbs only after the baseline sender-visible
+// window has been established, so the negative-window checks below exercise
+// ratio drift without changing the initial max advertised window.
+   +0 `../tun --set-rx-truesize tun0 65536`
+
+// Trigger an extreme memory squeeze by shrinking SO_RCVBUF.
+   +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [16000], 4) = 0
+
+   +0 < P. 1040001:1105001(65000) ack 1 win 257
+    * > .  1:1(0) ack 1040001 win 0
+// Check LINUX_MIB_TCPRCVQDROP has been incremented.
+   +0 `nstat -s | grep TcpExtTCPRcvQDrop | grep -q " 1 "`
+
+// RWIN == 0: rcv_wup = 1040001, rcv_wnd = 0, rcv_mwnd_seq > 1105001.
+
+// Accept pure ack with seq in max adv. window.
+   +0 write(4, ..., 1000) = 1000
+   +0 > P. 1:1001(1000) ack 1040001 win 0
+   +0 < .  1105001:1105001(0) ack 1001 win 257
+
+// In order segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW).
+   +0 < P. 1040001:1041001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Ooo partial segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW).
+   +0 < P. 1039001:1041001(2000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0 <nop,nop,sack 1039001:1040001>
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented twice.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 2 "`
+
+// Ooo segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_OVERWINDOW).
+   +0 < P. 1105001:1106001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Ooo segment, beyond max adv. window -> drop (SKB_DROP_REASON_TCP_INVALID_SEQUENCE).
+   +0 < P. 2000001:2001001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice.
+   +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 2 "`
+
+// Read all data.
+   +0 read(4, ..., 2000000) = 1040000
+    * > .  1001:1001(0) ack 1040001
+
+// RWIN > 0: rcv_wup = 1040001, 0 < rcv_wnd < 32000, rcv_mwnd_seq > 1105001.
+
+// Accept pure ack with seq in max adv. window, beyond adv. window.
+   +0 write(4, ..., 1000) = 1000
+   +0 > P.  1001:2001(1000) ack 1040001
+   +0 < . 1105001:1105001(0) ack 2001 win 257
+
+// In order segment, in max adv. window, in adv. window -> accept.
+   +0 < P. 1040001:1041001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1041001
+
+// Ooo partial segment, in adv. window -> accept.
+   +0 < P. 1040001:1042001(2000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1042001 <nop,nop,sack 1040001:1041001>
+
+// Ooo segment, in max adv. window, beyond adv. window -> drop.
+   +0 < P. 1105001:1106001(1000) ack 2001 win 257
+   +0 > .  2001:2001(0) ack 1042001
+// Ooo segment, beyond max adv. window, beyond adv. window -> drop.
+   +0 < P. 2000001:2001001(1000) ack 2001 win 257
+   +0 > .  2001:2001(0) ack 1042001
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice more.
+   +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 4 "`
+
+// We are allowed to go beyond the window and buffer with one packet.
+   +0 < P. 1042001:1062001(20000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1062001
+   +0 < P. 1062001:1082001(20000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001 win 0
+
+// But not more: in-order segment, in max adv. window -> drop.
+   +0 < P. 1082001:1083001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented again.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 3 "`
+
+// Another ratio drop must not change the final zero-window decision.
+   +0 `../tun --set-rx-truesize tun0 131072`
+
+   +0 < P. 1082001:1083001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented once more.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 4 "`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
new file mode 100644
index 000000000000..837ba3633752
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh`
+
+    0 `nstat -n`
+
+// Establish a connection.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [20000], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 win 18980 <mss 1460,nop,wscale 0>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 < P. 1:20001(20000) ack 1 win 257
+ +.04 > .  1:1(0) ack 20001 win 18000
+
+   +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [12000], 4) = 0
+   +0 < P. 20001:80001(60000) ack 1 win 257
+   +0 > .  1:1(0) ack 20001 win 18000
+
+   +0 read(4, ..., 20000) = 20000
+
+// A too big packet is accepted if the receive queue is empty, but the
+// stronger admission path must not zero the receive buffer while doing so.
+   +0 < P. 20001:80001(60000) ack 1 win 257
+    * > .  1:1(0) ack 80001 win 0
+   +0 %{ assert SK_MEMINFO_RCVBUF > 0, SK_MEMINFO_RCVBUF }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
new file mode 100644
index 000000000000..b2e4950e0b83
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_moderate_rcvbuf=0`
+
+// Establish a connection on the default receive buffer. Leave a large skb in
+// the queue, then deliver another one which still fits the remaining rwnd.
+// We should grow sk_rcvbuf to honor the already-advertised window instead of
+// dropping the packet.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <...>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Exchange enough data to get past the completely fresh-socket case while
+// still keeping the receive buffer at its 128kB default.
+   +0 < P. 1:65001(65000) ack 1 win 257
+   * > .  1:1(0) ack 65001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 65001:130001(65000) ack 1 win 257
+   * > .  1:1(0) ack 130001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 130001:195001(65000) ack 1 win 257
+   * > .  1:1(0) ack 195001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 195001:260001(65000) ack 1 win 257
+   * > .  1:1(0) ack 260001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 260001:325001(65000) ack 1 win 257
+   * > .  1:1(0) ack 325001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 325001:390001(65000) ack 1 win 257
+   * > .  1:1(0) ack 390001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 390001:455001(65000) ack 1 win 257
+   * > .  1:1(0) ack 455001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 455001:520001(65000) ack 1 win 257
+   * > .  1:1(0) ack 520001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 520001:585001(65000) ack 1 win 257
+   * > .  1:1(0) ack 585001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 585001:650001(65000) ack 1 win 257
+   * > .  1:1(0) ack 650001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 650001:715001(65000) ack 1 win 257
+   * > .  1:1(0) ack 715001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 715001:780001(65000) ack 1 win 257
+   * > .  1:1(0) ack 780001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 780001:845001(65000) ack 1 win 257
+   * > .  1:1(0) ack 845001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 845001:910001(65000) ack 1 win 257
+   * > .  1:1(0) ack 910001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 910001:975001(65000) ack 1 win 257
+   * > .  1:1(0) ack 975001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1040001
+   +0 read(4, ..., 65000) = 65000
+
+// Leave about 60kB queued, then accept another large skb which still fits
+// the rwnd we already exposed to the peer. The regression is the drop; the
+// exact sk_rcvbuf growth path is an implementation detail.
+   +0 < P. 1040001:1102001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1102001
+
+   +0 < P. 1102001:1167001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1167001
+   +0 read(4, ..., 127000) = 127000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
new file mode 100644
index 000000000000..c2ebe11d75f7
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_moderate_rcvbuf=0`
+
+// Establish a connection on the default receive buffer. The warmup traffic
+// keeps the socket in the normal data path without changing its default
+// sk_rcvbuf. Then inflate skb->truesize on future TUN RX packets so the live
+// scaling_ratio drops after we already exposed a larger rwnd to the peer.
+// The follow-up packet should still be admitted, and tcp_clamp_window() should
+// grow sk_rcvbuf to honor the sender-visible window instead of dropping data.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <...>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Exchange enough data to get past the completely fresh-socket case while
+// still keeping the receive buffer at its initial default.
+   +0 < P. 1:65001(65000) ack 1 win 257
+   * > .  1:1(0) ack 65001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 65001:130001(65000) ack 1 win 257
+   * > .  1:1(0) ack 130001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 130001:195001(65000) ack 1 win 257
+   * > .  1:1(0) ack 195001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 195001:260001(65000) ack 1 win 257
+   * > .  1:1(0) ack 260001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 260001:325001(65000) ack 1 win 257
+   * > .  1:1(0) ack 325001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 325001:390001(65000) ack 1 win 257
+   * > .  1:1(0) ack 390001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 390001:455001(65000) ack 1 win 257
+   * > .  1:1(0) ack 455001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 455001:520001(65000) ack 1 win 257
+   * > .  1:1(0) ack 520001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 520001:585001(65000) ack 1 win 257
+   * > .  1:1(0) ack 585001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 585001:650001(65000) ack 1 win 257
+   * > .  1:1(0) ack 650001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 650001:715001(65000) ack 1 win 257
+   * > .  1:1(0) ack 715001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 715001:780001(65000) ack 1 win 257
+   * > .  1:1(0) ack 780001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 780001:845001(65000) ack 1 win 257
+   * > .  1:1(0) ack 845001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 845001:910001(65000) ack 1 win 257
+   * > .  1:1(0) ack 910001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 910001:975001(65000) ack 1 win 257
+   * > .  1:1(0) ack 975001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1040001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 %{ base_rcvbuf = SK_MEMINFO_RCVBUF }%
+
+// Leave about 60kB queued, then make future TUN skbs look more expensive in
+// two steps. Both inflated skbs still fit the already-advertised window and
+// must be admitted, and sk_rcvbuf should keep growing as the live
+// scaling_ratio drops further.
+   +0 < P. 1040001:1102001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1102001
+
+   +0 `../tun --set-rx-truesize tun0 4096`
+
+   +0 < P. 1102001:1167001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1167001
+   +0 %{ assert SK_MEMINFO_RCVBUF > base_rcvbuf, (base_rcvbuf, SK_MEMINFO_RCVBUF) }%
+   +0 %{ small_rcvbuf = SK_MEMINFO_RCVBUF }%
+
+   +0 < P. 1167001:1229001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1229001
+
+   +0 `../tun --set-rx-truesize tun0 65536`
+
+   +0 < P. 1229001:1294001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1294001
+   +0 %{ assert SK_MEMINFO_RCVBUF > small_rcvbuf, (base_rcvbuf, small_rcvbuf, SK_MEMINFO_RCVBUF) }%
+
+   +0 < P. 1294001:1356001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1356001
+   +0 read(4, ..., 254000) = 254000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
new file mode 100644
index 000000000000..08da5fddaa12
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_shrink_window=1
+sysctl -q net.ipv4.tcp_rmem="4096 32768 $((32*1024*1024))"`
+
+   0 `nstat -n`
+
+// Establish a connection. After the first payload we know the peer has seen a
+// scaled receive window reaching sequence 25361. Inflate later TUN skbs in two
+// steps so the live scaling_ratio drops more than once, then verify that:
+//   1) a segment one byte beyond the max advertised window is still dropped,
+//   2) a segment exactly using the previously advertised max window is still
+//      accepted even though the current live ratio no longer matches that
+//      original advertisement basis.
+  +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+  +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+  +0 bind(3, ..., ...) = 0
+  +0 listen(3, 1) = 0
+
+  +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+  +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 10>
+  +0 < . 1:1(0) ack 1 win 257
+
+  +0 accept(3, ..., ...) = 4
+
+  +0 < P. 1:10001(10000) ack 1 win 257
+   * > .  1:1(0) ack 10001 win 15
+
+// Max window seq advertised here is 10001 + 15*1024 = 25361.
+  +0 `../tun --set-rx-truesize tun0 4096`
+
+  +0 < P. 10001:11024(1023) ack 1 win 257
+   * > .  1:1(0) ack 11024
+
+  +0 `../tun --set-rx-truesize tun0 65536`
+
+// Segment beyond the max window stays invalid even after ratio drift.
+  +0 < P. 11024:25362(14338) ack 1 win 257
+   * > .  1:1(0) ack 11024
+
+// Segment exactly using the max window must still be accepted.
+  +0 < P. 11024:25361(14337) ack 1 win 257
+   * > .  1:1(0) ack 25361
+
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented once.
+  +0 `nstat | grep TcpExtBeyondWindow | grep -q " 1 "`
diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index cf106a49b55e..473992b3784d 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -2,14 +2,17 @@
 
 #define _GNU_SOURCE
 
+#include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <linux/if_tun.h>
 #include <sys/ioctl.h>
+#include <sys/syscall.h>
 #include <sys/socket.h>
 
 #include "kselftest_harness.h"
@@ -174,6 +177,135 @@ static int tun_delete(char *dev)
 	return ip_link_del(dev);
 }
 
+static bool is_numeric_name(const char *name)
+{
+	for (; *name; name++) {
+		if (*name < '0' || *name > '9')
+			return false;
+	}
+
+	return true;
+}
+
+static int packetdrill_dup_fd(int pidfd, const char *fd_name)
+{
+	char *end;
+	unsigned long tmp;
+
+	errno = 0;
+	tmp = strtoul(fd_name, &end, 10);
+	if (errno || *end || tmp > INT_MAX) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	return syscall(SYS_pidfd_getfd, pidfd, (int)tmp, 0);
+}
+
+static int open_packetdrill_tunfd(pid_t pid, const char *ifname)
+{
+	char fd_dir[PATH_MAX];
+	struct dirent *dent;
+	struct ifreq ifr = {};
+	int pidfd;
+	int saved_errno = ENOENT;
+	DIR *dir;
+
+	snprintf(fd_dir, sizeof(fd_dir), "/proc/%ld/fd", (long)pid);
+
+	pidfd = syscall(SYS_pidfd_open, pid, 0);
+	if (pidfd < 0)
+		return -1;
+
+	dir = opendir(fd_dir);
+	if (!dir) {
+		close(pidfd);
+		return -1;
+	}
+
+	while ((dent = readdir(dir))) {
+		int fd;
+
+		if (!is_numeric_name(dent->d_name))
+			continue;
+
+		/* Reopen via pidfd_getfd() so we duplicate packetdrill's attached
+		 * queue file, instead of opening a fresh /dev/net/tun instance.
+		 */
+		fd = packetdrill_dup_fd(pidfd, dent->d_name);
+		if (fd < 0) {
+			saved_errno = errno;
+			continue;
+		}
+
+		memset(&ifr, 0, sizeof(ifr));
+		if (!ioctl(fd, TUNGETIFF, &ifr) &&
+		    !strncmp(ifr.ifr_name, ifname, IFNAMSIZ)) {
+			close(pidfd);
+			closedir(dir);
+			return fd;
+		}
+
+		if (errno)
+			saved_errno = errno;
+		close(fd);
+	}
+
+	close(pidfd);
+	closedir(dir);
+	errno = saved_errno;
+	return -1;
+}
+
+/* Packetdrill owns the TUN queue fd, so drive the test ioctl through that
+ * exact file descriptor found under /proc/$PACKETDRILL_PID/fd.
+ */
+static int packetdrill_set_rx_truesize(const char *ifname, const char *value)
+{
+	char *packetdrill_pid, *end;
+	unsigned long long tmp;
+	unsigned int extra;
+	pid_t pid;
+	int fd;
+
+	packetdrill_pid = getenv("PACKETDRILL_PID");
+	if (!packetdrill_pid || !*packetdrill_pid) {
+		fprintf(stderr, "PACKETDRILL_PID is not set\n");
+		return 1;
+	}
+
+	errno = 0;
+	tmp = strtoull(packetdrill_pid, &end, 10);
+	if (errno || *end || !tmp || tmp > INT_MAX) {
+		fprintf(stderr, "invalid PACKETDRILL_PID: %s\n", packetdrill_pid);
+		return 1;
+	}
+	pid = (pid_t)tmp;
+
+	errno = 0;
+	tmp = strtoull(value, &end, 0);
+	if (errno || *end || tmp > UINT_MAX) {
+		fprintf(stderr, "invalid truesize value: %s\n", value);
+		return 1;
+	}
+	extra = (unsigned int)tmp;
+
+	fd = open_packetdrill_tunfd(pid, ifname);
+	if (fd < 0) {
+		perror("open_packetdrill_tunfd");
+		return 1;
+	}
+
+	if (ioctl(fd, TUNSETTRUESIZE, (unsigned long)extra)) {
+		perror("ioctl(TUNSETTRUESIZE)");
+		close(fd);
+		return 1;
+	}
+
+	close(fd);
+	return 0;
+}
+
 static int tun_open(char *dev, const int flags, const int hdrlen,
 		    const int features, const unsigned char *mac_addr)
 {
@@ -985,4 +1117,10 @@ XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet);
 XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet);
 XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet);
 
-TEST_HARNESS_MAIN
+int main(int argc, char **argv)
+{
+	if (argc == 4 && !strcmp(argv[1], "--set-rx-truesize"))
+		return packetdrill_set_rx_truesize(argv[2], argv[3]);
+
+	return test_harness_run(argc, argv);
+}
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 13/14] netdevsim: add peer RX truesize support for selftests
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Add a debugfs-controlled peer RX truesize knob to netdevsim, inflate the
forwarded skb only on the peer RX side, and cover the resulting socket
memory-accounting behavior with a dedicated selftest.

This keeps the synthetic cost out of the sender-side skb geometry while
giving the selftests a second runtime vehicle for the receive-memory
accounting exercised by the TCP rwnd work.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 drivers/net/netdevsim/netdev.c                | 145 +++++-
 drivers/net/netdevsim/netdevsim.h             |   4 +
 .../selftests/drivers/net/netdevsim/Makefile  |   1 +
 .../drivers/net/netdevsim/peer-rx-truesize.sh | 426 ++++++++++++++++++
 4 files changed, 575 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/drivers/net/netdevsim/peer-rx-truesize.sh

diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 5ec028a00c62..22238df79b6a 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -17,8 +17,10 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool_netlink.h>
 #include <linux/kernel.h>
+#include <linux/kstrtox.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
+#include <linux/refcount.h>
 #include <linux/slab.h>
 #include <net/netdev_queues.h>
 #include <net/netdev_rx_queue.h>
@@ -37,6 +39,91 @@ MODULE_IMPORT_NS("NETDEV_INTERNAL");
 
 #define NSIM_RING_SIZE		256
 
+struct nsim_rx_truesize {
+	refcount_t refs;
+	u32 value;
+};
+
+static struct nsim_rx_truesize *
+nsim_rx_truesize_get(struct nsim_rx_truesize *rx_truesize)
+{
+	if (!rx_truesize)
+		return NULL;
+
+	if (!refcount_inc_not_zero(&rx_truesize->refs))
+		return NULL;
+
+	return rx_truesize;
+}
+
+static void nsim_rx_truesize_put(struct nsim_rx_truesize *rx_truesize)
+{
+	if (!rx_truesize)
+		return;
+
+	if (refcount_dec_and_test(&rx_truesize->refs))
+		kfree(rx_truesize);
+}
+
+static ssize_t nsim_rx_truesize_read(struct file *file, char __user *user_buf,
+				     size_t count, loff_t *ppos)
+{
+	struct nsim_rx_truesize *rx_truesize = file->private_data;
+	char buf[24];
+	int len;
+
+	len = scnprintf(buf, sizeof(buf), "%u\n",
+			READ_ONCE(rx_truesize->value));
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t nsim_rx_truesize_write(struct file *file,
+				      const char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	struct nsim_rx_truesize *rx_truesize = file->private_data;
+	u32 value;
+	int err;
+
+	err = kstrtou32_from_user(user_buf, count, 0, &value);
+	if (err)
+		return err;
+
+	WRITE_ONCE(rx_truesize->value, value);
+
+	return count;
+}
+
+static int nsim_rx_truesize_open(struct inode *inode, struct file *file)
+{
+	struct nsim_rx_truesize *rx_truesize;
+
+	rx_truesize = nsim_rx_truesize_get(inode->i_private);
+	if (!rx_truesize)
+		return -ENODEV;
+
+	file->private_data = rx_truesize;
+
+	return nonseekable_open(inode, file);
+}
+
+static int nsim_rx_truesize_release(struct inode *inode, struct file *file)
+{
+	nsim_rx_truesize_put(file->private_data);
+
+	return 0;
+}
+
+static const struct file_operations nsim_rx_truesize_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nsim_rx_truesize_open,
+	.read		= nsim_rx_truesize_read,
+	.write		= nsim_rx_truesize_write,
+	.release	= nsim_rx_truesize_release,
+	.llseek		= noop_llseek,
+};
+
 static void nsim_start_peer_tx_queue(struct net_device *dev, struct nsim_rq *rq)
 {
 	struct netdevsim *ns = netdev_priv(dev);
@@ -117,6 +204,28 @@ static int nsim_forward_skb(struct net_device *tx_dev,
 	return nsim_napi_rx(tx_dev, rx_dev, rq, skb);
 }
 
+/* Tests can inflate peer RX skb->truesize to exercise receiver-side TCP
+ * accounting under scaling-ratio drift without perturbing sender-side skb
+ * ownership.
+ */
+static void nsim_rx_update_truesize(struct sk_buff *skb, u32 extra)
+{
+	unsigned int truesize;
+
+	if (!extra)
+		return;
+
+	if (check_add_overflow(skb->truesize, extra, &truesize))
+		truesize = UINT_MAX;
+
+	skb->truesize = truesize;
+}
+
+static u32 nsim_rx_extra_truesize(const struct netdevsim *ns)
+{
+	return READ_ONCE(ns->rx_truesize->value);
+}
+
 static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct netdevsim *ns = netdev_priv(dev);
@@ -125,7 +234,9 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	unsigned int len = skb->len;
 	struct netdevsim *peer_ns;
 	struct netdev_config *cfg;
+	struct sk_buff *nskb;
 	struct nsim_rq *rq;
+	u32 extra;
 	int rxq;
 	int dr;
 
@@ -160,7 +271,24 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	      cfg->hds_thresh > len)))
 		skb_linearize(skb);
 
+	extra = nsim_rx_extra_truesize(peer_ns);
 	skb_tx_timestamp(skb);
+	if (extra) {
+		/* Clone before inflating truesize so only the peer RX path sees
+		 * the synthetic cost; sender-side skb accounting stays put.
+		 */
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (!nskb) {
+			if (psp_ext)
+				__skb_ext_put(psp_ext);
+			goto out_drop_free;
+		}
+
+		consume_skb(skb);
+		skb = nskb;
+		nsim_rx_update_truesize(skb, extra);
+	}
+
 	if (unlikely(nsim_forward_skb(dev, peer_dev,
 				      skb, rq, psp_ext) == NET_RX_DROP))
 		goto out_drop_cnt;
@@ -1121,6 +1249,7 @@ struct netdevsim *nsim_create(struct nsim_dev *nsim_dev,
 			      u8 perm_addr[ETH_ALEN])
 {
 	struct net_device *dev;
+	struct nsim_rx_truesize *rx_truesize;
 	struct netdevsim *ns;
 	int err;
 
@@ -1140,6 +1269,13 @@ struct netdevsim *nsim_create(struct nsim_dev *nsim_dev,
 	ns->nsim_bus_dev = nsim_dev->nsim_bus_dev;
 	SET_NETDEV_DEV(dev, &ns->nsim_bus_dev->dev);
 	SET_NETDEV_DEVLINK_PORT(dev, &nsim_dev_port->devlink_port);
+	rx_truesize = kzalloc_obj(*rx_truesize);
+	if (!rx_truesize) {
+		err = -ENOMEM;
+		goto err_free_netdev;
+	}
+	refcount_set(&rx_truesize->refs, 1);
+	ns->rx_truesize = rx_truesize;
 	nsim_ethtool_init(ns);
 	if (nsim_dev_port_is_pf(nsim_dev_port))
 		err = nsim_init_netdevsim(ns);
@@ -1153,21 +1289,27 @@ struct netdevsim *nsim_create(struct nsim_dev *nsim_dev,
 	ns->qr_dfs = debugfs_create_file("queue_reset", 0200,
 					 nsim_dev_port->ddir, ns,
 					 &nsim_qreset_fops);
+	ns->rx_truesize_dfs = debugfs_create_file("rx_extra_truesize", 0600,
+						  nsim_dev_port->ddir,
+						  ns->rx_truesize,
+						  &nsim_rx_truesize_fops);
 	return ns;
 
 err_free_netdev:
+	nsim_rx_truesize_put(ns->rx_truesize);
 	free_netdev(dev);
 	return ERR_PTR(err);
 }
 
 void nsim_destroy(struct netdevsim *ns)
 {
+	struct nsim_rx_truesize *rx_truesize = ns->rx_truesize;
 	struct net_device *dev = ns->netdev;
 	struct netdevsim *peer;
 
+	debugfs_remove(ns->rx_truesize_dfs);
 	debugfs_remove(ns->qr_dfs);
 	debugfs_remove(ns->pp_dfs);
-
 	if (ns->nb.notifier_call)
 		unregister_netdevice_notifier_dev_net(ns->netdev, &ns->nb,
 						      &ns->nn);
@@ -1198,6 +1340,7 @@ void nsim_destroy(struct netdevsim *ns)
 	}
 
 	free_netdev(dev);
+	nsim_rx_truesize_put(rx_truesize);
 }
 
 bool netdev_is_nsim(struct net_device *dev)
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index f767fc8a7505..972ad274060e 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -75,6 +75,8 @@ struct nsim_macsec {
 	u8 nsim_secy_count;
 };
 
+struct nsim_rx_truesize;
+
 struct nsim_ethtool_pauseparam {
 	bool rx;
 	bool tx;
@@ -144,6 +146,8 @@ struct netdevsim {
 	} udp_ports;
 
 	struct page *page;
+	struct nsim_rx_truesize *rx_truesize;
+	struct dentry *rx_truesize_dfs;
 	struct dentry *pp_dfs;
 	struct dentry *qr_dfs;
 
diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
index 1a228c5430f5..9e9e48d5913b 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
+++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
@@ -14,6 +14,7 @@ TEST_PROGS := \
 	macsec-offload.sh \
 	nexthop.sh \
 	peer.sh \
+	peer-rx-truesize.sh \
 	psample.sh \
 	tc-mq-visibility.sh \
 	udp_tunnel_nic.sh \
diff --git a/tools/testing/selftests/drivers/net/netdevsim/peer-rx-truesize.sh b/tools/testing/selftests/drivers/net/netdevsim/peer-rx-truesize.sh
new file mode 100755
index 000000000000..6d1101d20847
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/peer-rx-truesize.sh
@@ -0,0 +1,426 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+
+set -euo pipefail
+
+lib_dir=$(dirname "$0")/../../../net
+source "$lib_dir"/lib.sh
+
+NSIM_SRV_ID=$((1024 + RANDOM % 1024))
+NSIM_CLI_ID=$((2048 + RANDOM % 1024))
+NSIM_SYS_LINK=/sys/bus/netdevsim/link_device
+SERVER_ADDR=192.0.2.1
+CLIENT_ADDR=192.0.2.2
+RMEM_PORT=12345
+WARM_PORT=12346
+RMEM_QUEUED_LEN=65000
+RMEM_INFLATED_LEN=65000
+RMEM_SMALL_EXTRA=4096
+RMEM_LARGE_EXTRA=65536
+WARM_WARMUP_ROUNDS=16
+WARM_WARMUP_LEN=65000
+WARM_QUEUED_LEN=62000
+WARM_INFLATED_LEN=65000
+WARM_EXTRA=65536
+
+srv_dev=
+cli_dev=
+srv_pid=
+cli_pid=
+srv_fd=
+cli_fd=
+stage_dir=
+CASE_BASE_METRIC=
+CASE_FINAL_METRIC=
+
+cleanup()
+{
+	local rc=$?
+
+	if [ -n "${srv_pid:-}" ]; then
+		kill "${srv_pid}" 2>/dev/null || true
+		wait "${srv_pid}" 2>/dev/null || true
+	fi
+
+	if [ -n "${cli_pid:-}" ]; then
+		kill "${cli_pid}" 2>/dev/null || true
+		wait "${cli_pid}" 2>/dev/null || true
+	fi
+
+	if [ -n "${srv_fd:-}" ]; then
+		eval "exec ${srv_fd}<&-"
+	fi
+
+	if [ -n "${cli_fd:-}" ]; then
+		eval "exec ${cli_fd}<&-"
+	fi
+
+	if [ -d "${stage_dir:-}" ]; then
+		rm -rf "${stage_dir}"
+	fi
+
+	cleanup_netdevsim "${NSIM_SRV_ID}" 2>/dev/null || true
+	cleanup_netdevsim "${NSIM_CLI_ID}" 2>/dev/null || true
+	cleanup_ns "${SRV:-}" "${CLI:-}" 2>/dev/null || true
+
+	exit "${rc}"
+}
+
+trap cleanup EXIT
+
+ensure_debugfs()
+{
+	if mount | grep -q 'on /sys/kernel/debug type debugfs'; then
+		return 0
+	fi
+
+	if ! mount -t debugfs none /sys/kernel/debug >/dev/null 2>&1; then
+		echo "SKIP: failed to mount debugfs"
+		exit "${ksft_skip}"
+	fi
+}
+
+ensure_netdevsim()
+{
+	if [ -w /sys/bus/netdevsim/new_device ]; then
+		return 0
+	fi
+
+	if ! modprobe netdevsim >/dev/null 2>&1; then
+		echo "SKIP: no netdevsim support"
+		exit "${ksft_skip}"
+	fi
+}
+
+create_nsim()
+{
+	local id="$1"
+	local ns="$2"
+	local addr="$3"
+	local dev
+
+	echo "${id}" | ip netns exec "${ns}" tee /sys/bus/netdevsim/new_device >/dev/null
+	udevadm settle
+
+	dev=$(ip netns exec "${ns}" ls /sys/bus/netdevsim/devices/netdevsim"${id}"/net)
+	ip -netns "${ns}" link set dev "${dev}" name "nsim${id}"
+	ip -netns "${ns}" addr add "${addr}/24" dev "nsim${id}"
+	ip -netns "${ns}" link set dev "nsim${id}" up
+
+	echo "nsim${id}"
+}
+
+link_nsim_peers()
+{
+	local srv_ifindex
+	local cli_ifindex
+
+	eval "exec {srv_fd}</var/run/netns/${SRV}"
+	eval "exec {cli_fd}</var/run/netns/${CLI}"
+
+	srv_ifindex=$(ip netns exec "${SRV}" cat /sys/class/net/"${srv_dev}"/ifindex)
+	cli_ifindex=$(ip netns exec "${CLI}" cat /sys/class/net/"${cli_dev}"/ifindex)
+
+	echo "${srv_fd}:${srv_ifindex} ${cli_fd}:${cli_ifindex}" > "${NSIM_SYS_LINK}"
+}
+
+wait_for_file()
+{
+	local path="$1"
+	local i
+
+	for i in $(seq 100); do
+		if [ -e "${path}" ]; then
+			return 0
+		fi
+		sleep 0.1
+	done
+
+	return 1
+}
+
+server_python='
+import array
+import fcntl
+import os
+import socket
+import struct
+import sys
+import time
+
+SO_MEMINFO = 55
+SK_MEMINFO_RMEM_ALLOC = 0
+TCP_MAXSEG = getattr(socket, "TCP_MAXSEG", 2)
+FIONREAD = 0x541B
+POLL_INTERVAL = 0.01
+POLL_TIMEOUT = 20.0
+
+(mode, host, port, warmup_rounds, warmup_len, queued_len, inflated_len,
+ ready_file, result_file) = sys.argv[1:]
+port = int(port)
+warmup_rounds = int(warmup_rounds)
+warmup_len = int(warmup_len)
+queued_len = int(queued_len)
+inflated_len = int(inflated_len)
+
+def queued_bytes(sock):
+    buf = array.array("I", [0])
+    fcntl.ioctl(sock.fileno(), FIONREAD, buf, True)
+    return buf[0]
+
+def wait_for_queued(sock, target):
+    deadline = time.time() + POLL_TIMEOUT
+    while time.time() < deadline:
+        if queued_bytes(sock) >= target:
+            return
+        time.sleep(POLL_INTERVAL)
+    raise SystemExit(f"timed out waiting for {target} queued bytes")
+
+def meminfo(sock):
+    raw = sock.getsockopt(socket.SOL_SOCKET, SO_MEMINFO, 9 * 4)
+    return struct.unpack("=9I", raw)
+
+def wait_for_growth(sock, idx, base):
+    deadline = time.time() + POLL_TIMEOUT
+    while time.time() < deadline:
+        cur = meminfo(sock)[idx]
+        if cur > base:
+            return cur
+        time.sleep(POLL_INTERVAL)
+    raise SystemExit(f"timed out waiting for SO_MEMINFO[{idx}] growth from {base}")
+
+def write_metric(path, value):
+    with open(path, "w", encoding="ascii") as fp:
+        fp.write(f"{value}\n")
+
+def recv_all(sock, total):
+    remaining = total
+    while remaining:
+        chunk = sock.recv(min(65536, remaining))
+        if not chunk:
+            raise SystemExit("unexpected EOF while draining receive data")
+        remaining -= len(chunk)
+
+listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+listener.setsockopt(socket.IPPROTO_TCP, TCP_MAXSEG, 1000)
+listener.bind((host, port))
+listener.listen(1)
+conn, _ = listener.accept()
+
+for _ in range(warmup_rounds):
+    recv_all(conn, warmup_len)
+
+if mode == "rmem_alloc":
+    wait_for_queued(conn, queued_len)
+    base_metric = meminfo(conn)[SK_MEMINFO_RMEM_ALLOC]
+    write_metric(ready_file, base_metric)
+
+    recv_all(conn, queued_len)
+    wait_for_queued(conn, inflated_len)
+    grown_metric = meminfo(conn)[SK_MEMINFO_RMEM_ALLOC]
+    write_metric(result_file, grown_metric)
+elif mode == "rmem_alloc_warm":
+    wait_for_queued(conn, queued_len)
+    base_metric = meminfo(conn)[SK_MEMINFO_RMEM_ALLOC]
+    write_metric(ready_file, base_metric)
+
+    wait_for_queued(conn, queued_len + 1)
+    grown_metric = wait_for_growth(conn, SK_MEMINFO_RMEM_ALLOC, base_metric)
+    write_metric(result_file, grown_metric)
+elif mode == "rmem_alloc_growth":
+    # The growth cases compare against a live socket metric, so wait for
+    # observed growth instead of trusting one instantaneous post-queue sample.
+    wait_for_queued(conn, queued_len)
+    base_metric = meminfo(conn)[SK_MEMINFO_RMEM_ALLOC]
+    write_metric(ready_file, base_metric)
+
+    recv_all(conn, queued_len)
+    wait_for_queued(conn, inflated_len)
+    grown_metric = wait_for_growth(conn, SK_MEMINFO_RMEM_ALLOC, base_metric)
+    write_metric(result_file, grown_metric)
+else:
+    raise SystemExit(f"unknown mode: {mode}")
+'
+
+client_python='
+import os
+import socket
+import sys
+import time
+
+POLL_INTERVAL = 0.01
+POLL_TIMEOUT = 20.0
+
+host, port, warmup_rounds, warmup_len, queued_len, inflated_len, gate_file = sys.argv[1:]
+port = int(port)
+warmup_rounds = int(warmup_rounds)
+warmup_len = int(warmup_len)
+queued_len = int(queued_len)
+inflated_len = int(inflated_len)
+
+def send_all(sock, total):
+    payload = b"a" * min(total, 65536)
+    left = total
+    while left:
+        chunk = payload[: min(len(payload), left)]
+        sent = sock.send(chunk)
+        if sent <= 0:
+            raise SystemExit("short send")
+        left -= sent
+
+def wait_for_file(path):
+    deadline = time.time() + POLL_TIMEOUT
+    while time.time() < deadline:
+        if os.path.exists(path):
+            return
+        time.sleep(POLL_INTERVAL)
+    raise SystemExit(f"timed out waiting for {path}")
+
+cli = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+cli.setsockopt(socket.IPPROTO_TCP, socket.TCP_MAXSEG, 1000)
+cli.connect((host, port))
+for _ in range(warmup_rounds):
+    send_all(cli, warmup_len)
+send_all(cli, queued_len)
+wait_for_file(gate_file)
+send_all(cli, inflated_len)
+cli.close()
+'
+
+read_metric()
+{
+	local path="$1"
+	local value
+
+	if ! read -r value < "${path}"; then
+		echo "FAIL: unable to read metric from ${path}"
+		exit "${ksft_fail}"
+	fi
+
+	printf '%s\n' "${value}"
+}
+
+run_case()
+{
+	local case_id="$1"
+	local mode="$2"
+	local port="$3"
+	local warmups="$4"
+	local warmup_len="$5"
+	local queued_len="$6"
+	local inflated_len="$7"
+	local extra="$8"
+	local label="$9"
+	local ready_file="${stage_dir}/${case_id}.ready"
+	local result_file="${stage_dir}/${case_id}.result"
+	local gate_file="${stage_dir}/${case_id}.gate"
+
+	rm -f "${ready_file}" "${result_file}" "${gate_file}"
+	echo 0 > "${dfs_file}"
+
+	ip netns exec "${SRV}" python3 - "${mode}" "${SERVER_ADDR}" "${port}" \
+		"${warmups}" "${warmup_len}" "${queued_len}" "${inflated_len}" \
+		"${ready_file}" "${result_file}" <<PY &
+${server_python}
+PY
+	srv_pid=$!
+
+	wait_local_port_listen "${SRV}" "${port}" tcp
+
+	ip netns exec "${CLI}" python3 - "${SERVER_ADDR}" "${port}" \
+		"${warmups}" "${warmup_len}" "${queued_len}" "${inflated_len}" \
+		"${gate_file}" <<PY &
+${client_python}
+PY
+	cli_pid=$!
+
+	if ! wait_for_file "${ready_file}"; then
+		echo "FAIL: ${label}: ready marker did not appear"
+		exit "${ksft_fail}"
+	fi
+
+	echo "${extra}" > "${dfs_file}"
+	touch "${gate_file}"
+
+	wait "${cli_pid}"
+	cli_pid=
+	wait "${srv_pid}"
+	srv_pid=
+
+	CASE_BASE_METRIC=$(read_metric "${ready_file}")
+	CASE_FINAL_METRIC=$(read_metric "${result_file}")
+
+	echo "PASS: ${label}"
+}
+
+# This test only proves that injected truesize reaches socket memory
+# accounting. Packetdrill covers the sender-visible rwnd accept/drop logic.
+
+assert_no_growth()
+{
+	local label="$1"
+
+	if [ "${CASE_FINAL_METRIC}" -gt "${CASE_BASE_METRIC}" ]; then
+		echo "FAIL: ${label}: metric grew unexpectedly:" \
+		     "base=${CASE_BASE_METRIC}" \
+		     "after=${CASE_FINAL_METRIC}"
+		exit "${ksft_fail}"
+	fi
+}
+
+assert_growth()
+{
+	local label="$1"
+
+	if [ "${CASE_FINAL_METRIC}" -le "${CASE_BASE_METRIC}" ]; then
+		echo "FAIL: ${label}: metric did not grow:" \
+		     "base=${CASE_BASE_METRIC}" \
+		     "after=${CASE_FINAL_METRIC}"
+		exit "${ksft_fail}"
+	fi
+}
+
+ensure_debugfs
+ensure_netdevsim
+set +u
+setup_ns SRV CLI
+set -u
+
+srv_dev=$(create_nsim "${NSIM_SRV_ID}" "${SRV}" "${SERVER_ADDR}")
+cli_dev=$(create_nsim "${NSIM_CLI_ID}" "${CLI}" "${CLIENT_ADDR}")
+link_nsim_peers
+
+ip netns exec "${SRV}" sysctl -wq net.ipv4.tcp_moderate_rcvbuf=0
+
+stage_dir=$(mktemp -d)
+dfs_file="/sys/kernel/debug/netdevsim/netdevsim${NSIM_SRV_ID}/ports/0/rx_extra_truesize"
+
+run_case "rmem_noop" "rmem_alloc" "${RMEM_PORT}" 0 0 \
+	"${RMEM_QUEUED_LEN}" "${RMEM_INFLATED_LEN}" 0 \
+	"peer rx truesize zero no-op"
+assert_no_growth "peer rx truesize zero no-op"
+
+run_case "rmem_small" "rmem_alloc_growth" "${RMEM_PORT}" 0 0 \
+	"${RMEM_QUEUED_LEN}" "${RMEM_INFLATED_LEN}" "${RMEM_SMALL_EXTRA}" \
+	"peer rx truesize small rmem_alloc"
+assert_growth "peer rx truesize small rmem_alloc"
+small_delta=$((CASE_FINAL_METRIC - CASE_BASE_METRIC))
+
+run_case "rmem_large" "rmem_alloc_growth" "${RMEM_PORT}" 0 0 \
+	"${RMEM_QUEUED_LEN}" "${RMEM_INFLATED_LEN}" "${RMEM_LARGE_EXTRA}" \
+	"peer rx truesize large rmem_alloc"
+assert_growth "peer rx truesize large rmem_alloc"
+large_delta=$((CASE_FINAL_METRIC - CASE_BASE_METRIC))
+
+if [ "${large_delta}" -le "${small_delta}" ]; then
+	echo "FAIL: peer rx truesize stepped rmem_alloc:" \
+	     "small_delta=${small_delta}" \
+	     "large_delta=${large_delta}"
+	exit "${ksft_fail}"
+fi
+
+run_case "rmem_warm" "rmem_alloc_warm" "${WARM_PORT}" "${WARM_WARMUP_ROUNDS}" "${WARM_WARMUP_LEN}" \
+	"${WARM_QUEUED_LEN}" "${WARM_INFLATED_LEN}" "${WARM_EXTRA}" \
+	"peer rx truesize warm rmem_alloc"
+assert_growth "peer rx truesize warm rmem_alloc"
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 14/14] netdevsim: release pinned PSP ext on drop paths
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

nsim_do_psp() can leave an extra extension reference pinned until
nsim_psp_handle_ext() reattaches it to a forwarded skb.

Route the drop paths through a common helper and release that extra
reference when __dev_forward_skb() fails and when start_xmit() drops the
skb before it reaches the peer RX path.

This is separate from the peer RX truesize test hook itself.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 drivers/net/netdevsim/netdev.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 22238df79b6a..c22513c523d6 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -187,6 +187,15 @@ static int nsim_napi_rx(struct net_device *tx_dev, struct net_device *rx_dev,
 	return NET_RX_SUCCESS;
 }
 
+/* nsim_do_psp() pins an extra extension ref until nsim_psp_handle_ext()
+ * reattaches it to a forwarded skb.
+ */
+static void nsim_psp_ext_put(struct skb_ext *psp_ext)
+{
+	if (psp_ext)
+		__skb_ext_put(psp_ext);
+}
+
 static int nsim_forward_skb(struct net_device *tx_dev,
 			    struct net_device *rx_dev,
 			    struct sk_buff *skb,
@@ -196,8 +205,10 @@ static int nsim_forward_skb(struct net_device *tx_dev,
 	int ret;
 
 	ret = __dev_forward_skb(rx_dev, skb);
-	if (ret)
+	if (ret) {
+		nsim_psp_ext_put(psp_ext);
 		return ret;
+	}
 
 	nsim_psp_handle_ext(skb, psp_ext);
 
@@ -278,11 +289,8 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * the synthetic cost; sender-side skb accounting stays put.
 		 */
 		nskb = skb_clone(skb, GFP_ATOMIC);
-		if (!nskb) {
-			if (psp_ext)
-				__skb_ext_put(psp_ext);
+		if (!nskb)
 			goto out_drop_free;
-		}
 
 		consume_skb(skb);
 		skb = nskb;
@@ -303,6 +311,7 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 out_drop_any:
 	dr = SKB_DROP_REASON_NOT_SPECIFIED;
 out_drop_free:
+	nsim_psp_ext_put(psp_ext);
 	kfree_skb_reason(skb, dr);
 out_drop_cnt:
 	rcu_read_unlock();
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH 0/9] Kernel API Specification Framework
From: David Laight @ 2026-03-14 22:44 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Sasha Levin, linux-api, linux-kernel, linux-doc, linux-fsdevel,
	linux-kbuild, linux-kselftest, workflows, tools, x86,
	Thomas Gleixner, Paul E. McKenney, Greg Kroah-Hartman,
	Jonathan Corbet, Dmitry Vyukov, Randy Dunlap, Cyril Hrubis,
	Kees Cook, Jake Edge, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann
In-Reply-To: <20260314111822.63a2ba4a@kernel.org>

On Sat, 14 Mar 2026 11:18:22 -0700
Jakub Kicinski <kuba@kernel.org> wrote:

> On Fri, 13 Mar 2026 11:09:10 -0400 Sasha Levin wrote:
> > This enables static analysis tools to verify userspace API usage at compile
> > time, test generation based on formal specifications, consistent error handling
> > validation, automated documentation generation, and formal verification of
> > kernel interfaces.  
> 
> Could you give some examples? We have machine readable descriptions for
> Netlink interfaces, we approached syzbot folks and they did not really
> seem to care for those.

The whole thing reminds me of doxygen comment blocks.
They tend to make it hard to read the source files, hard to search
the source files (due to all the extra matches) and are pretty much
always out of date.

The kerndoc comment blocks for trivial helper functions are hard enough
to keep up to date.

The only way even parameter descriptions are going to stay correct is if the
compiler is using the definition and only the comment part is extra.
For error returns you'll need the documentation to be at the return site.

	David

^ permalink raw reply

* Re: [PATCH net-next v2 13/14] netdevsim: add peer RX truesize support for selftests
From: Jakub Kicinski @ 2026-03-15  1:18 UTC (permalink / raw)
  To: atwellwea
  Cc: netdev, davem, pabeni, edumazet, ncardwell, linux-kernel,
	linux-api, linux-doc, linux-kselftest, linux-trace-kernel, mptcp,
	dsahern, horms, kuniyu, andrew+netdev, willemdebruijn.kernel,
	jasowang, skhan, corbet, matttbe, martineau, geliang, rostedt,
	mhiramat, mathieu.desnoyers, 0x7f454c46
In-Reply-To: <20260314201348.1786972-14-atwellwea@gmail.com>

On Sat, 14 Mar 2026 14:13:47 -0600 atwellwea@gmail.com wrote:
> diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
> index 1a228c5430f5..9e9e48d5913b 100644
> --- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
> +++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
> @@ -14,6 +14,7 @@ TEST_PROGS := \
>  	macsec-offload.sh \
>  	nexthop.sh \
>  	peer.sh \
> +	peer-rx-truesize.sh \
>  	psample.sh \
>  	tc-mq-visibility.sh \
>  	udp_tunnel_nic.sh \

I think our order checker thinks that is not alphabetical.
You can find the scripts here:
https://github.com/linux-netdev/nipa/tree/main/tests/patch/check_selftest

It'd be good if you can try the "running locally" section from README

^ permalink raw reply

* Re: [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests
From: Jakub Kicinski @ 2026-03-15  1:18 UTC (permalink / raw)
  To: atwellwea
  Cc: netdev, davem, pabeni, edumazet, ncardwell, linux-kernel,
	linux-api, linux-doc, linux-kselftest, linux-trace-kernel, mptcp,
	dsahern, horms, kuniyu, andrew+netdev, willemdebruijn.kernel,
	jasowang, skhan, corbet, matttbe, martineau, geliang, rostedt,
	mhiramat, mathieu.desnoyers, 0x7f454c46
In-Reply-To: <20260314201348.1786972-13-atwellwea@gmail.com>

On Sat, 14 Mar 2026 14:13:46 -0600 atwellwea@gmail.com wrote:
> Add a test-only TUN ioctl that inflates RX skb->truesize, plus the
> packetdrill-side helper needed to drive that ioctl through packetdrill's
> own TUN queue file descriptor.
> 
> Use that plumbing to cover the receive-window regressions where
> scaling_ratio drifts after advertisement, alongside the baseline too-big
> packetdrill cases that exercise the same sender-visible rwnd accounting
> from the non-injected path.

You missed adding the tun program to the Makefile.

  tcp-rcv-neg-window-truesize-pkt                                               
  tcp-rcv-toobig-default-truesize-pkt                                           
  tcp-rcv-wnd-shrink-allowed-truesize-pkt                                       
                                                                               
Fail with:                                                            

  sh: line 1: ../tun: No such file or directory                                 
  tcp_rcv_*_truesize.pkt:*: error executing `../tun --set-rx-truesize tun0 6553\
6` command: non-zero status 127        

https://netdev.bots.linux.dev/contest.html?pw-n=0&branch=net-next-2026-03-15--00-00&pw-n=0&pass=0

^ permalink raw reply

* Re: [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift
From: Jakub Kicinski @ 2026-03-15  1:19 UTC (permalink / raw)
  To: atwellwea
  Cc: netdev, davem, pabeni, edumazet, ncardwell, linux-kernel,
	linux-api, linux-doc, linux-kselftest, linux-trace-kernel, mptcp,
	dsahern, horms, kuniyu, andrew+netdev, willemdebruijn.kernel,
	jasowang, skhan, corbet, matttbe, martineau, geliang, rostedt,
	mhiramat, mathieu.desnoyers, 0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

On Sat, 14 Mar 2026 14:13:34 -0600 atwellwea@gmail.com wrote:
> This series keeps sender-visible TCP receive-window accounting tied to the
> scaling basis that was in force when the window was advertised, even if
> later receive-side truesize inflation lowers scaling_ratio or the live
> receive window retracts below the largest right edge already exposed to the
> sender.

Please wait until at least Monday before reposting this.
LLM-pocalypse is hitting us hard

^ permalink raw reply

* Re: [PATCH 0/9] Kernel API Specification Framework
From: Sasha Levin @ 2026-03-15  6:36 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: linux-api, linux-kernel, linux-doc, linux-fsdevel, linux-kbuild,
	linux-kselftest, workflows, tools, x86, Thomas Gleixner,
	Paul E. McKenney, Greg Kroah-Hartman, Jonathan Corbet,
	Dmitry Vyukov, Randy Dunlap, Cyril Hrubis, Kees Cook, Jake Edge,
	David Laight, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann
In-Reply-To: <20260314111822.63a2ba4a@kernel.org>

On Sat, Mar 14, 2026 at 11:18:22AM -0700, Jakub Kicinski wrote:
>On Fri, 13 Mar 2026 11:09:10 -0400 Sasha Levin wrote:
>> This enables static analysis tools to verify userspace API usage at compile
>> time, test generation based on formal specifications, consistent error handling
>> validation, automated documentation generation, and formal verification of
>> kernel interfaces.
>
>Could you give some examples? We have machine readable descriptions for
>Netlink interfaces, we approached syzbot folks and they did not really
>seem to care for those.

Once the API is in a machine-readable format, we can write formatters to
output whatever downstream tools need. The kapi tool in the series
already ships with plain text, JSON, and RST formatters, and adding new
output formats is straightforward. We don't need to convince the
syzkaller folks to consume our specs, we can just output them in a
format that syzkaller already understands.

For example, I have a syzlang formatter that produces the following
from the sys_read spec in this series:

   # --- read ---
   # Read data from a file descriptor
   #
   # @context process, sleepable
   #
   # @capability CAP_DAC_OVERRIDE: Bypass discretionary access control on read permission
   # @capability CAP_DAC_READ_SEARCH: Bypass read permission checks on regular files
   #
   # @error EPERM (-1): Returned by fanotify permission events...
   # @error EINTR (-4): The call was interrupted by a signal before any data was read.
   # @error EIO (-5): A low-level I/O error occurred.
   # @error EBADF (-9): fd is not a valid file descriptor, or fd was not opened for reading.
   # @error EAGAIN (-11): O_NONBLOCK set and read would block.
   # @error EACCES (-13): LSM denied the read operation via security_file_permission().
   # @error EFAULT (-14): buf points outside the accessible address space.
   # @error EISDIR (-21): fd refers to a directory.
   # @error EINVAL (-22): fd not suitable for reading, O_DIRECT misaligned, count negative...
   # @error ENODATA (-61): Data not available in cache...
   # @error EOVERFLOW (-75): File position plus count would exceed LLONG_MAX.
   # @error EOPNOTSUPP (-95): Read not supported for this file type...
   # @error ENOBUFS (-105): Buffer too small for complete notification...
   #
   # @constraint MAX_RW_COUNT: actual_count = min(count, MAX_RW_COUNT)
   # @constraint File must be open for reading: (file->f_mode & FMODE_READ) && (file->f_mode & FMODE_CAN_READ)
   #
   # @signal Any signal: restartable, error=-4
   #
   # @lock file->f_pos_lock (mutex, internal): For seekable files with FMODE_ATOMIC_POS
   # @lock Filesystem-specific locks (rwlock, internal)
   # @lock RCU read-side (rwlock, internal): File descriptor lookup protection
   #
   # @side-effect file->f_pos: f_pos advanced by bytes read [conditional, irreversible]
   # @side-effect inode access time: atime updated unless O_NOATIME [conditional, irreversible]
   # @side-effect task I/O accounting: rchar and syscr updated [conditional, irreversible]
   # @side-effect fsnotify events: FS_ACCESS event generated [conditional, irreversible]
   #
   # @return ssize_t: bytes read on success, negative errno on error
   #
   read(fd fd, buf ptr[out, buffer[out]], count len[buf]) intptr

That said, I don't have a strong end-to-end example with the 4 syscalls
proposed here, as open/close/read/write don't take complex structures,
and the subsystems underneath them aren't specced yet. The value becomes
clearer with richer interfaces where the gap between "we know the type
signature" and "we know the full behavioral contract" is much wider.

-- 
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH 0/9] Kernel API Specification Framework
From: Sasha Levin @ 2026-03-15  6:46 UTC (permalink / raw)
  To: David Laight
  Cc: Jakub Kicinski, linux-api, linux-kernel, linux-doc, linux-fsdevel,
	linux-kbuild, linux-kselftest, workflows, tools, x86,
	Thomas Gleixner, Paul E. McKenney, Greg Kroah-Hartman,
	Jonathan Corbet, Dmitry Vyukov, Randy Dunlap, Cyril Hrubis,
	Kees Cook, Jake Edge, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann
In-Reply-To: <20260314224435.35465615@pumpkin>

On Sat, Mar 14, 2026 at 10:44:35PM +0000, David Laight wrote:
>On Sat, 14 Mar 2026 11:18:22 -0700
>Jakub Kicinski <kuba@kernel.org> wrote:
>
>> On Fri, 13 Mar 2026 11:09:10 -0400 Sasha Levin wrote:
>> > This enables static analysis tools to verify userspace API usage at compile
>> > time, test generation based on formal specifications, consistent error handling
>> > validation, automated documentation generation, and formal verification of
>> > kernel interfaces.
>>
>> Could you give some examples? We have machine readable descriptions for
>> Netlink interfaces, we approached syzbot folks and they did not really
>> seem to care for those.
>
>The whole thing reminds me of doxygen comment blocks.
>They tend to make it hard to read the source files, hard to search
>the source files (due to all the extra matches) and are pretty much
>always out of date.
>
>The kerndoc comment blocks for trivial helper functions are hard enough
>to keep up to date.
>
>The only way even parameter descriptions are going to stay correct is if the
>compiler is using the definition and only the comment part is extra.
>For error returns you'll need the documentation to be at the return site.

When CONFIG_KAPI_RUNTIME_CHECKS is enabled, the specs are enforced at
the syscall boundary. The SYSCALL_DEFINEx macro grows a wrapper that
calls kapi_validate_syscall_params() before the real implementation and
kapi_validate_syscall_return() after it. Parameter constraints (ranges,
valid flag masks, alignment) are checked on every syscall entry, and
return values are validated against the documented success/error ranges
on every exit.

If a spec goes stale, it has runtime consequences. A new flag bit added
without updating the spec's valid_mask means callers using that flag get
EINVAL, which any test exercising that path catches immediately. An
implementation returning an undocumented error code triggers a warning
from the return validation.

The selftest in the series (tools/testing/selftests/kapi/test_kapi.c)
exercises this with real syscalls, both valid and invalid inputs,
verifying the validation layer catches violations.

-- 
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH 0/9] Kernel API Specification Framework
From: Dmitry Vyukov @ 2026-03-16  7:05 UTC (permalink / raw)
  To: Jakub Kicinski, syzkaller
  Cc: Sasha Levin, linux-api, linux-kernel, linux-doc, linux-fsdevel,
	linux-kbuild, linux-kselftest, workflows, tools, x86,
	Thomas Gleixner, Paul E. McKenney, Greg Kroah-Hartman,
	Jonathan Corbet, Randy Dunlap, Cyril Hrubis, Kees Cook, Jake Edge,
	David Laight, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann
In-Reply-To: <20260314111822.63a2ba4a@kernel.org>

On Sat, 14 Mar 2026 at 19:18, Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Fri, 13 Mar 2026 11:09:10 -0400 Sasha Levin wrote:
> > This enables static analysis tools to verify userspace API usage at compile
> > time, test generation based on formal specifications, consistent error handling
> > validation, automated documentation generation, and formal verification of
> > kernel interfaces.
>
> Could you give some examples? We have machine readable descriptions for
> Netlink interfaces, we approached syzbot folks and they did not really
> seem to care for those.

I think our reasoning wrt syzkaller was that not all interfaces in all
relevant kernels are described with netlink yml descriptions, so we
need to continue using the extraction of interfaces from the source
code. And if we have that code, then using yml as an additional data
source only adds code/complexity. Additionally, we may extract some
extra constraints/info from code that are not present in yml.

Realistically system call descriptions may have the same problem for
us at this point, since we extract lots of info from the source code
already:
https://raw.githubusercontent.com/google/syzkaller/refs/heads/master/sys/linux/auto.txt
(and LLMs obviously can allow us to extract more)

^ permalink raw reply

* Re: [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
From: Paolo Abeni @ 2026-03-16 11:04 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-6-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> From: Wesley Atwell <atwellwea@gmail.com>
> 
> Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
> more sender-visible window than the current hard receive-memory backing
> can cover.
> 
> The new helper keeps backlog and memory-pressure limits in the same
> units as the rest of the receive path, while __tcp_select_window()
> backs any rounding slack before advertising it.
> 
> Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
> ---
>  include/net/tcp.h     | 12 ++++++++++++
>  net/ipv4/tcp_input.c  | 36 ++++++++++++++++++++++++++++++++++--
>  net/ipv4/tcp_output.c | 15 +++++++++++++--
>  3 files changed, 59 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index fc22ab6b80d5..5b479ad44f89 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
>  enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
>  void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
>  void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
>  void tcp_rcv_space_adjust(struct sock *sk);
>  int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
>  void tcp_twsk_destructor(struct sock *sk);
> @@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
>  	return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
>  }
>  
> +/* Passive children clone the listener's sk_socket until accept() grafts
> + * their own struct socket, so only sockets that point back to themselves
> + * should autotune receive-buffer backing.
> + */
> +static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
> +{
> +	struct socket *sock = READ_ONCE(sk->sk_socket);
> +
> +	return sock && READ_ONCE(sock->sk) == sk;

This is executed under the sk socket lock, ONCE annotation not needed.

> +}
> +
>  /* Note: caller must be prepared to deal with negative returns */
>  static inline int tcp_space(const struct sock *sk)
>  {
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 352f814a4ff6..32256519a085 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk)
>  				    (u32)TCP_INIT_CWND * tp->advmss);
>  }
>  
> +/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed
> + * bytes beyond sk_rmem_alloc while preserving sender-visible headroom
> + * already consumed by sk_backlog.len.
> + */
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
> +{
> +	struct net *net = sock_net(sk);
> +	int backlog;
> +	int rmem2;
> +	int target;
> +
> +	needed = max(needed, 0);
> +	backlog = READ_ONCE(sk->sk_backlog.len);
> +	target = tcp_rmem_used(sk) + backlog + needed;
> +
> +	if (target <= READ_ONCE(sk->sk_rcvbuf))
> +		return true;
> +
> +	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
> +	if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
> +	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
> +	    tcp_under_memory_pressure(sk) ||
> +	    sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
> +		return false;
> +
> +	WRITE_ONCE(sk->sk_rcvbuf,
> +		   min_t(int, rmem2,
> +			 max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
> +
> +	return target <= READ_ONCE(sk->sk_rcvbuf);

Same here, and more cases below.

/P


^ permalink raw reply

* Re: [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift
From: Paolo Abeni @ 2026-03-16 11:09 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> From: Wesley Atwell <atwellwea@gmail.com>
> 
> This series keeps sender-visible TCP receive-window accounting tied to the
> scaling basis that was in force when the window was advertised, even if
> later receive-side truesize inflation lowers scaling_ratio or the live
> receive window retracts below the largest right edge already exposed to the
> sender.
> 
> After the receive-window retraction changes, the receive path needs to keep
> track of two related pieces of sender-visible state:
> 
>   1. the live advertised receive window
>   2. the maximum advertised right edge and the basis it was exposed with
> 
> This repost snapshots both, uses them to repair receive-buffer backing when
> ratio drift would otherwise strand sender-visible space, extends
> TCP_REPAIR_WINDOW so repair/restore can round-trip the new state, and adds
> truesize-drift coverage through TUN packetdrill tests and netdevsim-based
> selftests.

The series is IMHO significantly not trivial. Can the end-user meet the
relevant condition in practice? How? What is the net benefit in
practice? Is that observable under usual conditions or require
exceptional circumstances?

I think we need a strong motivation to merge this kind of changes.

/P


^ permalink raw reply

* Re: [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
From: Paolo Abeni @ 2026-03-16 11:24 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-6-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> From: Wesley Atwell <atwellwea@gmail.com>
> 
> Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
> more sender-visible window than the current hard receive-memory backing
> can cover.
> 
> The new helper keeps backlog and memory-pressure limits in the same
> units as the rest of the receive path, while __tcp_select_window()
> backs any rounding slack before advertising it.
> 
> Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
> ---
>  include/net/tcp.h     | 12 ++++++++++++
>  net/ipv4/tcp_input.c  | 36 ++++++++++++++++++++++++++++++++++--
>  net/ipv4/tcp_output.c | 15 +++++++++++++--
>  3 files changed, 59 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index fc22ab6b80d5..5b479ad44f89 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
>  enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
>  void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
>  void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
>  void tcp_rcv_space_adjust(struct sock *sk);
>  int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
>  void tcp_twsk_destructor(struct sock *sk);
> @@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
>  	return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
>  }
>  
> +/* Passive children clone the listener's sk_socket until accept() grafts
> + * their own struct socket, 

AFAICS, the above statement is false, see sk_set_socket() in sk_clone()

> so only sockets that point back to themselves
> + * should autotune receive-buffer backing.
> + */
> +static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
> +{
> +	struct socket *sock = READ_ONCE(sk->sk_socket);

You can just check `sk->sk_socket`. Also you could re-use this helper in
tcp_data_queue_ofo().

/P


^ permalink raw reply

* Re: [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
From: Paolo Abeni @ 2026-03-16 11:31 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-6-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 57a2a6daaad3..53781cf591d2 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk)
>  	 * scaled window will not line up with the MSS boundary anyway.
>  	 */
>  	if (tp->rx_opt.rcv_wscale) {
> +		int rcv_wscale = 1 << tp->rx_opt.rcv_wscale;
> +
>  		window = free_space;
>  
>  		/* Advertise enough space so that it won't get scaled away.
> -		 * Import case: prevent zero window announcement if
> +		 * Important case: prevent zero-window announcement if
>  		 * 1<<rcv_wscale > mss.
>  		 */
> -		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
> +		window = ALIGN(window, rcv_wscale);
> +
> +		/* Back any scale-quantization slack before we expose it.
> +		 * Otherwise tcp_can_ingest() can reject data which is still
> +		 * within the sender-visible window.
> +		 */
> +		if (window > free_space &&
> +		    (!tcp_rcvbuf_grow_allowed(sk) ||
> +		     !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window))))
> +			window = round_down(free_space, rcv_wscale);

It looks like this can cause the advertised window to shrink even if we
are in the 'do not allow window to shrink' branch.

Also why the other branch (shrinking allowed) is not touched?

/P


^ permalink raw reply

* Re: [PATCH net-next v2 07/14] tcp: honor the maximum advertised window after live retraction
From: Paolo Abeni @ 2026-03-16 11:44 UTC (permalink / raw)
  To: atwellwea, netdev, davem, kuba, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-8-atwellwea@gmail.com>

On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> +/* Sender-visible window rescue does not relax hard receive-memory admission.
> + * If growth did not make room, fall back to the established prune/collapse
> + * path.
> + */
>  static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
>  				 unsigned int size)
>  {
> -	if (!tcp_can_ingest(sk, skb) ||
> -	    !sk_rmem_schedule(sk, skb, size)) {
> +	bool can_ingest = tcp_can_ingest(sk, skb);
> +	bool scheduled = can_ingest && sk_rmem_schedule(sk, skb, size);
> +
> +	if (!scheduled) {
> +		int pruned = tcp_prune_queue(sk, skb);
>  
> -		if (tcp_prune_queue(sk, skb) < 0)
> +		if (pruned < 0)
>  			return -1;
>  
>  		while (!sk_rmem_schedule(sk, skb, size)) {
> -			if (!tcp_prune_ofo_queue(sk, skb))
> +			bool pruned_ofo = tcp_prune_ofo_queue(sk, skb);
> +
> +			if (!pruned_ofo)
>  				return -1;
>  		}
>  	}

The above chunk is AFAICS pure noise. Please have a more careful local
review of this series before any next revision.

/P


^ permalink raw reply

* Re: [PATCH v5 0/4] OPENAT2_REGULAR flag support for openat2
From: Dorjoy Chowdhury @ 2026-03-16 16:12 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
	v9fs, linux-kselftest, viro, brauner, jack, jlayton, chuck.lever,
	alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
	mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
	andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
	sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
	miklos, hansg
In-Reply-To: <20260307140726.70219-1-dorjoychy111@gmail.com>

Ping...
Requesting for review on this patch series please.

Regards,
Dorjoy


On Sat, Mar 7, 2026 at 8:07 PM Dorjoy Chowdhury <dorjoychy111@gmail.com> wrote:
>
> Hi,
>
> I came upon this "Ability to only open regular files" uapi feature suggestion
> from https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
> and thought it would be something I could do as a first patch and get to
> know the kernel code a bit better.
>
> The following filesystems have been tested by building and booting the kernel
> x86 bzImage in a Fedora 43 VM in QEMU. I have tested with OPENAT2_REGULAR that
> regular files can be successfully opened and non-regular files (directory, fifo etc)
> return -EFTYPE.
> - btrfs
> - NFS (loopback)
> - SMB (loopback)
>
> Changes in v5:
> - EFTYPE is already used in BSDs mentioned in commit message
> - consistently return -EFTYPE in all filesystems
>
> Changes in v4:
> - changed O_REGULAR to OPENAT2_REGULAR
> - OPENAT2_REGULAR does not affect O_PATH
> - atomic_open codepaths updated to work properly for OPENAT2_REGULAR
> - commit message includes the uapi-group URL
> - v3 is at: https://lore.kernel.org/linux-fsdevel/20260127180109.66691-1-dorjoychy111@gmail.com/T/
>
> Changes in v3:
> - included motivation about O_REGULAR flag in commit message e.g., programs not wanting to be tricked into opening device nodes
> - fixed commit message wrongly referencing ENOTREGULAR instead of ENOTREG
> - fixed the O_REGULAR flag in arch/parisc/include/uapi/asm/fcntl.h from 060000000 to 0100000000
> - added 2 commits converting arch/{mips,sparc}/include/uapi/asm/fcntl.h O_* macros from hex to octal
> - v2 is at: https://lore.kernel.org/linux-fsdevel/20260126154156.55723-1-dorjoychy111@gmail.com/T/
>
> Changes in v2:
> - rename ENOTREGULAR to ENOTREG
> - define ENOTREG in uapi/asm-generic/errno.h (instead of errno-base.h) and in arch/*/include/uapi/asm/errno.h files
> - override O_REGULAR in arch/{alpha,sparc,parisc}/include/uapi/asm/fcntl.h due to clash with include/uapi/asm-generic/fcntl.h
> - I have kept the kselftest but now that O_REGULAR and ENOTREG can have different value on different architectures I am not sure if it's right
> - v1 is at: https://lore.kernel.org/linux-fsdevel/20260125141518.59493-1-dorjoychy111@gmail.com/T/
>
> Thanks.
>
> Regards,
> Dorjoy
>
> Dorjoy Chowdhury (4):
>   openat2: new OPENAT2_REGULAR flag support
>   kselftest/openat2: test for OPENAT2_REGULAR flag
>   sparc/fcntl.h: convert O_* flag macros from hex to octal
>   mips/fcntl.h: convert O_* flag macros from hex to octal
>
>  arch/alpha/include/uapi/asm/errno.h           |  2 +
>  arch/alpha/include/uapi/asm/fcntl.h           |  1 +
>  arch/mips/include/uapi/asm/errno.h            |  2 +
>  arch/mips/include/uapi/asm/fcntl.h            | 22 +++++------
>  arch/parisc/include/uapi/asm/errno.h          |  2 +
>  arch/parisc/include/uapi/asm/fcntl.h          |  1 +
>  arch/sparc/include/uapi/asm/errno.h           |  2 +
>  arch/sparc/include/uapi/asm/fcntl.h           | 35 +++++++++---------
>  fs/ceph/file.c                                |  4 ++
>  fs/gfs2/inode.c                               |  6 +++
>  fs/namei.c                                    |  4 ++
>  fs/nfs/dir.c                                  |  4 ++
>  fs/open.c                                     |  4 +-
>  fs/smb/client/dir.c                           | 14 ++++++-
>  include/linux/fcntl.h                         |  2 +
>  include/uapi/asm-generic/errno.h              |  2 +
>  include/uapi/asm-generic/fcntl.h              |  4 ++
>  tools/arch/alpha/include/uapi/asm/errno.h     |  2 +
>  tools/arch/mips/include/uapi/asm/errno.h      |  2 +
>  tools/arch/parisc/include/uapi/asm/errno.h    |  2 +
>  tools/arch/sparc/include/uapi/asm/errno.h     |  2 +
>  tools/include/uapi/asm-generic/errno.h        |  2 +
>  .../testing/selftests/openat2/openat2_test.c  | 37 ++++++++++++++++++-
>  23 files changed, 127 insertions(+), 31 deletions(-)
>
> --
> 2.53.0
>

^ permalink raw reply

* Re: [PATCH v5 1/4] openat2: new OPENAT2_REGULAR flag support
From: Jeff Layton @ 2026-03-16 16:53 UTC (permalink / raw)
  To: Dorjoy Chowdhury, linux-fsdevel
  Cc: linux-kernel, linux-api, ceph-devel, gfs2, linux-nfs, linux-cifs,
	v9fs, linux-kselftest, viro, brauner, jack, chuck.lever,
	alex.aring, arnd, adilger, mjguzik, smfrench, richard.henderson,
	mattst88, linmag7, tsbogend, James.Bottomley, deller, davem,
	andreas, idryomov, amarkuze, slava, agruenba, trondmy, anna,
	sfrench, pc, ronniesahlberg, sprasad, tom, bharathsm, shuah,
	miklos, hansg
In-Reply-To: <20260307140726.70219-2-dorjoychy111@gmail.com>

On Sat, 2026-03-07 at 20:06 +0600, Dorjoy Chowdhury wrote:
> This flag indicates the path should be opened if it's a regular file.
> This is useful to write secure programs that want to avoid being
> tricked into opening device nodes with special semantics while thinking
> they operate on regular files. This is a requested feature from the
> uapi-group[1].
> 
> A corresponding error code EFTYPE has been introduced. For example, if
> openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
> param, it will return -EFTYPE. EFTYPE is already used in BSD systems
> like FreeBSD, macOS.
> 
> When used in combination with O_CREAT, either the regular file is
> created, or if the path already exists, it is opened if it's a regular
> file. Otherwise, -EFTYPE is returned.
> 
> When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
> as it doesn't make sense to open a path that is both a directory and a
> regular file.
> 
> [1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
> 
> Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
> ---
>  arch/alpha/include/uapi/asm/errno.h        |  2 ++
>  arch/alpha/include/uapi/asm/fcntl.h        |  1 +
>  arch/mips/include/uapi/asm/errno.h         |  2 ++
>  arch/parisc/include/uapi/asm/errno.h       |  2 ++
>  arch/parisc/include/uapi/asm/fcntl.h       |  1 +
>  arch/sparc/include/uapi/asm/errno.h        |  2 ++
>  arch/sparc/include/uapi/asm/fcntl.h        |  1 +
>  fs/ceph/file.c                             |  4 ++++
>  fs/gfs2/inode.c                            |  6 ++++++
>  fs/namei.c                                 |  4 ++++
>  fs/nfs/dir.c                               |  4 ++++
>  fs/open.c                                  |  4 +++-
>  fs/smb/client/dir.c                        | 14 +++++++++++++-
>  include/linux/fcntl.h                      |  2 ++
>  include/uapi/asm-generic/errno.h           |  2 ++
>  include/uapi/asm-generic/fcntl.h           |  4 ++++
>  tools/arch/alpha/include/uapi/asm/errno.h  |  2 ++
>  tools/arch/mips/include/uapi/asm/errno.h   |  2 ++
>  tools/arch/parisc/include/uapi/asm/errno.h |  2 ++
>  tools/arch/sparc/include/uapi/asm/errno.h  |  2 ++
>  tools/include/uapi/asm-generic/errno.h     |  2 ++
>  21 files changed, 63 insertions(+), 2 deletions(-)
> 
> 

I pointed Claude at this patch and got this back. Both issues that it
found will need to be fixed:

  Analysis Summary

  Commit: 7e7fa2653ca57 - openat2: new OPENAT2_REGULAR flag support

  This patch adds a new OPENAT2_REGULAR flag for openat2() that restricts opens to regular files only, returning a new
  EFTYPE errno for non-regular files. It adds filesystem-specific checks in ceph, gfs2, nfs, and cifs atomic_open paths,
  plus a VFS-level fallback in do_open().

  Issues found:

  1. OPENAT2_REGULAR leaks into f_flags - do_dentry_open() strips open-time-only flags (O_CREAT|O_EXCL|O_NOCTTY|O_TRUNC)
  but does not strip OPENAT2_REGULAR. When a regular file is successfully opened via openat2() with this flag, the bit
  persists in file->f_flags and will be returned by fcntl(fd, F_GETFL).
  2. BUILD_BUG_ON not updated - The compile-time guard checks upper_32_bits(VALID_OPEN_FLAGS) but the code now accepts
  VALID_OPENAT2_FLAGS. The guard should cover the expanded flag set.

  Verified correct:

  - All hex→octal conversions in MIPS and SPARC fcntl.h are numerically correct
  - Legacy open()/openat() properly strips OPENAT2_REGULAR via build_open_how() masking with VALID_OPEN_FLAGS
  - All filesystem cleanup paths (ceph, gfs2, nfs, cifs) properly handle resources when returning -EFTYPE
  - O_DIRECTORY + OPENAT2_REGULAR mutual exclusion is correct
  - O_PATH + OPENAT2_REGULAR is properly rejected by O_PATH_FLAGS check

  Ruled out:

  - NFS -ENOTDIR to -EFTYPE conversion: in atomic_open context, parent path is VFS-resolved, server errors relate to
  target
  - CIFS resource leak: out_err label properly closes server handle and calls iput()
  - OPENAT2_REGULAR + O_TMPFILE: silently accepted but tmpfiles are always regular, so harmless

  FINAL REGRESSIONS FOUND: 2
  FINAL TOKENS USED: ~45000
  False positives eliminated: NFS -ENOTDIR conversion, CIFS resource leak, O_TMPFILE interaction

Cheers,
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply

* Re: [PATCH v5 1/4] openat2: new OPENAT2_REGULAR flag support
From: Dorjoy Chowdhury @ 2026-03-16 17:22 UTC (permalink / raw)
  To: Jeff Layton
  Cc: linux-fsdevel, linux-kernel, linux-api, ceph-devel, gfs2,
	linux-nfs, linux-cifs, v9fs, linux-kselftest, viro, brauner, jack,
	chuck.lever, alex.aring, arnd, adilger, mjguzik, smfrench,
	richard.henderson, mattst88, linmag7, tsbogend, James.Bottomley,
	deller, davem, andreas, idryomov, amarkuze, slava, agruenba,
	trondmy, anna, sfrench, pc, ronniesahlberg, sprasad, tom,
	bharathsm, shuah, miklos, hansg
In-Reply-To: <5fcc2a6e6d92dae0601c6b3b8faa8b2f83981afb.camel@kernel.org>

On Mon, Mar 16, 2026 at 10:53 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sat, 2026-03-07 at 20:06 +0600, Dorjoy Chowdhury wrote:
> > This flag indicates the path should be opened if it's a regular file.
> > This is useful to write secure programs that want to avoid being
> > tricked into opening device nodes with special semantics while thinking
> > they operate on regular files. This is a requested feature from the
> > uapi-group[1].
> >
> > A corresponding error code EFTYPE has been introduced. For example, if
> > openat2 is called on path /dev/null with OPENAT2_REGULAR in the flag
> > param, it will return -EFTYPE. EFTYPE is already used in BSD systems
> > like FreeBSD, macOS.
> >
> > When used in combination with O_CREAT, either the regular file is
> > created, or if the path already exists, it is opened if it's a regular
> > file. Otherwise, -EFTYPE is returned.
> >
> > When OPENAT2_REGULAR is combined with O_DIRECTORY, -EINVAL is returned
> > as it doesn't make sense to open a path that is both a directory and a
> > regular file.
> >
> > [1]: https://uapi-group.org/kernel-features/#ability-to-only-open-regular-files
> >
> > Signed-off-by: Dorjoy Chowdhury <dorjoychy111@gmail.com>
> > ---
> >  arch/alpha/include/uapi/asm/errno.h        |  2 ++
> >  arch/alpha/include/uapi/asm/fcntl.h        |  1 +
> >  arch/mips/include/uapi/asm/errno.h         |  2 ++
> >  arch/parisc/include/uapi/asm/errno.h       |  2 ++
> >  arch/parisc/include/uapi/asm/fcntl.h       |  1 +
> >  arch/sparc/include/uapi/asm/errno.h        |  2 ++
> >  arch/sparc/include/uapi/asm/fcntl.h        |  1 +
> >  fs/ceph/file.c                             |  4 ++++
> >  fs/gfs2/inode.c                            |  6 ++++++
> >  fs/namei.c                                 |  4 ++++
> >  fs/nfs/dir.c                               |  4 ++++
> >  fs/open.c                                  |  4 +++-
> >  fs/smb/client/dir.c                        | 14 +++++++++++++-
> >  include/linux/fcntl.h                      |  2 ++
> >  include/uapi/asm-generic/errno.h           |  2 ++
> >  include/uapi/asm-generic/fcntl.h           |  4 ++++
> >  tools/arch/alpha/include/uapi/asm/errno.h  |  2 ++
> >  tools/arch/mips/include/uapi/asm/errno.h   |  2 ++
> >  tools/arch/parisc/include/uapi/asm/errno.h |  2 ++
> >  tools/arch/sparc/include/uapi/asm/errno.h  |  2 ++
> >  tools/include/uapi/asm-generic/errno.h     |  2 ++
> >  21 files changed, 63 insertions(+), 2 deletions(-)
> >
> >
>
> I pointed Claude at this patch and got this back. Both issues that it
> found will need to be fixed:
>
>   Analysis Summary
>
>   Commit: 7e7fa2653ca57 - openat2: new OPENAT2_REGULAR flag support
>
>   This patch adds a new OPENAT2_REGULAR flag for openat2() that restricts opens to regular files only, returning a new
>   EFTYPE errno for non-regular files. It adds filesystem-specific checks in ceph, gfs2, nfs, and cifs atomic_open paths,
>   plus a VFS-level fallback in do_open().
>
>   Issues found:
>
>   1. OPENAT2_REGULAR leaks into f_flags - do_dentry_open() strips open-time-only flags (O_CREAT|O_EXCL|O_NOCTTY|O_TRUNC)
>   but does not strip OPENAT2_REGULAR. When a regular file is successfully opened via openat2() with this flag, the bit
>   persists in file->f_flags and will be returned by fcntl(fd, F_GETFL).
>   2. BUILD_BUG_ON not updated - The compile-time guard checks upper_32_bits(VALID_OPEN_FLAGS) but the code now accepts
>   VALID_OPENAT2_FLAGS. The guard should cover the expanded flag set.
>

Good catches! I guess for issue 1 I need to modify the line in
do_dentry_open implementation to
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | OPENAT2_REGULAR);
right?
And for issue 2, I should change the VALID_OPEN_FLAGS to
VALID_OPENAT2_FLAGS in both build_open_flags in fs/open.c and in
fcntl_init in fs/fcntl.c, correct?

Regards,
Dorjoy

^ permalink raw reply

* Re: [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift
From: Paolo Abeni @ 2026-03-16 17:47 UTC (permalink / raw)
  To: Wesley Atwell
  Cc: netdev, davem, kuba, edumazet, ncardwell, linux-kernel, linux-api,
	linux-doc, linux-kselftest, linux-trace-kernel, mptcp, dsahern,
	horms, kuniyu, andrew+netdev, willemdebruijn.kernel, jasowang,
	skhan, corbet, matttbe, martineau, geliang, rostedt, mhiramat,
	mathieu.desnoyers, 0x7f454c46
In-Reply-To: <CAN=sVvyNpkyok_bt8eQSmqc4f7g7QoZBUmRmNRLoFz1HasEzMA@mail.gmail.com>

Hi,

On 3/16/26 6:29 PM, Wesley Atwell wrote:
> The strongest real anchor here is the already documented regression
> around sender-visible rwnd diverging from hard receive-memory backing,
> rather than a general receive-accounting cleanup.

I likely missed some of the prior discussion. Could you please share a
pointer/link to the mentioned regression report?

When posting on netdev please:
- use plaintext only messages
- avoid top-posting

Thanks,

Paolo


^ permalink raw reply

* Re: [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift
From: Wesley Atwell @ 2026-03-16 18:03 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, davem, kuba, edumazet, ncardwell, linux-kernel, linux-api,
	linux-doc, linux-kselftest, linux-trace-kernel, mptcp, dsahern,
	horms, kuniyu, andrew+netdev, willemdebruijn.kernel, jasowang,
	skhan, corbet, matttbe, martineau, geliang, rostedt, mhiramat,
	mathieu.desnoyers, 0x7f454c46
In-Reply-To: <e93ce797-4153-4e6e-89b6-3839a0b8bca2@redhat.com>

Hi Paolo,

The link I meant is the netdev regression discussion that led to:

026dfef287c0 ("tcp: give up on stronger sk_rcvbuf checks (for now)")

The report/discussion thread is here:

https://lore.kernel.org/20260225122355.585fd57b@kernel.org

The revert posting itself is here:

https://patch.msgid.link/20260227003359.2391017-1-kuba@kernel.org

I should not have implied that there was a separate prior
regression report beyond that thread.

Thanks for the note on formatting. I will keep follow-ups in plaintext
and avoid top-posting.

Thanks,
Wesley

^ permalink raw reply

* Re: [PATCH 0/9] Kernel API Specification Framework
From: Jakub Kicinski @ 2026-03-16 22:57 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: syzkaller, Sasha Levin, linux-api, linux-kernel, linux-doc,
	linux-fsdevel, linux-kbuild, linux-kselftest, workflows, tools,
	x86, Thomas Gleixner, Paul E. McKenney, Greg Kroah-Hartman,
	Jonathan Corbet, Randy Dunlap, Cyril Hrubis, Kees Cook, Jake Edge,
	David Laight, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann
In-Reply-To: <CACT4Y+arWePyxnV3hWk5RanWZpoc7=ALQ6DV_2MCuQkNoTtJUw@mail.gmail.com>

On Mon, 16 Mar 2026 08:05:53 +0100 Dmitry Vyukov wrote:
> On Sat, 14 Mar 2026 at 19:18, Jakub Kicinski <kuba@kernel.org> wrote:
> > On Fri, 13 Mar 2026 11:09:10 -0400 Sasha Levin wrote:  
> > > This enables static analysis tools to verify userspace API usage at compile
> > > time, test generation based on formal specifications, consistent error handling
> > > validation, automated documentation generation, and formal verification of
> > > kernel interfaces.  
> >
> > Could you give some examples? We have machine readable descriptions for
> > Netlink interfaces, we approached syzbot folks and they did not really
> > seem to care for those.  
> 
> I think our reasoning wrt syzkaller was that not all interfaces in all
> relevant kernels are described with netlink yml descriptions, so we
> need to continue using the extraction of interfaces from the source
> code. And if we have that code, then using yml as an additional data
> source only adds code/complexity. Additionally, we may extract some
> extra constraints/info from code that are not present in yml.
> 
> Realistically system call descriptions may have the same problem for
> us at this point, since we extract lots of info from the source code
> already:
> https://raw.githubusercontent.com/google/syzkaller/refs/heads/master/sys/linux/auto.txt

yup! we haven't tried to make the yml spec super useful to syzbot 
to be fair. I'm just flagging that example because in our case we
quickly went from "this will obviously be useful to syzbot" to
"although we could, it may not be super practical"

> (and LLMs obviously can allow us to extract more)

Didn't even think of that. LLMs should make short work of this sort of
extraction of information from source code..

^ permalink raw reply

* Re: [PATCH 0/9] Kernel API Specification Framework
From: Sasha Levin @ 2026-03-16 23:29 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Dmitry Vyukov, syzkaller, linux-api, linux-kernel, linux-doc,
	linux-fsdevel, linux-kbuild, linux-kselftest, workflows, tools,
	x86, Thomas Gleixner, Paul E. McKenney, Greg Kroah-Hartman,
	Jonathan Corbet, Randy Dunlap, Cyril Hrubis, Kees Cook, Jake Edge,
	David Laight, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann
In-Reply-To: <20260316155756.25b985f5@kernel.org>

On Mon, Mar 16, 2026 at 03:57:56PM -0700, Jakub Kicinski wrote:
>Didn't even think of that. LLMs should make short work of this sort of
>extraction of information from source code..

This is the primary reason that this proposal resurfaced :)

I've originally proposed[1] something like this almost a decade ago, but when I
started trying to write the actual specs I hit a brick wall: it was simply not
tractable.

With LLMs, writing the specs is something we can actually pull off, and we can
verify their correctness so LLMs don't get to hallucinate.

The specs you see in the following patches are all LLM generated.

-- 
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH 1/9] kernel/api: introduce kernel API specification framework
From: Jonathan Corbet @ 2026-03-17 17:49 UTC (permalink / raw)
  To: Sasha Levin, linux-api, linux-kernel
  Cc: linux-doc, linux-fsdevel, linux-kbuild, linux-kselftest,
	workflows, tools, x86, Thomas Gleixner, Paul E. McKenney,
	Greg Kroah-Hartman, Dmitry Vyukov, Randy Dunlap, Cyril Hrubis,
	Kees Cook, Jake Edge, David Laight, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann, Sasha Levin
In-Reply-To: <20260313150928.2637368-2-sashal@kernel.org>

Sasha Levin <sashal@kernel.org> writes:

> Add a framework for formally documenting kernel APIs with inline
> specifications. This framework provides:
>
> - Structured API documentation with parameter specifications, return
>   values, error conditions, and execution context requirements
> - Runtime validation capabilities for debugging (CONFIG_KAPI_RUNTIME_CHECKS)
> - Export of specifications via debugfs for tooling integration
> - Support for both internal kernel APIs and system calls

So I'll confess I have only scanned over the implementation, but I have
some thoughts on the earlier stuff.

[...]

> diff --git a/Documentation/dev-tools/kernel-api-spec.rst b/Documentation/dev-tools/kernel-api-spec.rst
> new file mode 100644
> index 0000000000000..7c0c1694f1f4a
> --- /dev/null
> +++ b/Documentation/dev-tools/kernel-api-spec.rst
> @@ -0,0 +1,482 @@
> +.. SPDX-License-Identifier: GPL-2.0
> +
> +======================================
> +Kernel API Specification Framework
> +======================================
> +
> +:Author: Sasha Levin <sashal@kernel.org>
> +:Date: June 2025

Has it not changed since then?

> +.. contents:: Table of Contents
> +   :depth: 3
> +   :local:
> +
> +Introduction
> +============

[...]

> +Usage Guide
> +===========
> +
> +Basic API Specification
> +-----------------------
> +
> +API specifications are written as KAPI-annotated kerneldoc comments directly in
> +the source file, immediately preceding the function implementation. The ``kapi``
> +tool extracts these annotations to produce structured specifications.
> +
> +.. code-block:: c
> +
> +    /**
> +     * kmalloc - allocate kernel memory
> +     * @size: Number of bytes to allocate
> +     * @flags: Allocation flags (GFP_*)

Given that the text thus far has talked about user-space API validation,
it's a bit surprising to see an internal function used as an example.

Also, maybe it should be kmalloc_obj()?  <runs away>

> +     * context-flags: KAPI_CTX_PROCESS | KAPI_CTX_SOFTIRQ | KAPI_CTX_HARDIRQ
> +     * param-count: 2

param-count is two, but you only document one of them?

> +     * param: size
> +     *   type: KAPI_TYPE_UINT
> +     *   flags: KAPI_PARAM_IN
> +     *   constraint-type: KAPI_CONSTRAINT_RANGE
> +     *   range: 0, KMALLOC_MAX_SIZE
> +     *
> +     * error: ENOMEM, Out of memory
> +     *   desc: Insufficient memory available for the requested allocation.
> +     */

Honest question: can this be made a bit easier for people to create,
with less shift-key wear?  My biggest worry with a system like this is
that people won't take the time to create and maintain the entries, so
anything that would ease the task would help.  Is there an impediment to
something like:

  contexts: process, softirq, hardirq

  param: size
    type: uint, input
    constraint: range(0, KMALLOC_MAX_SIZE)

See what I'm getting at?  ISTM that your DSL could be made a bit less
verbose and shouty while being just as well defined, but perhaps I'm
missing something?

Even better, of course, would be to add a "description" field for each
parameter, and allow that rather than the @param description that
kerneldoc currently uses.  That would keep all the information together,
at the minor cost of adding another significant complication to the
kernel-doc script.  Mauro won't mind :)

> +    void *kmalloc(size_t size, gfp_t flags)
> +    {
> +        /* Implementation */
> +    }
> +
> +Alternatively, specifications can be defined using the ``DEFINE_KERNEL_API_SPEC``
> +macro for compiled-in specs that are stored in the ``.kapi_specs`` ELF section:
> +
> +.. code-block:: c
> +
> +    #include <linux/kernel_api_spec.h>
> +
> +    DEFINE_KERNEL_API_SPEC(sys_open)
> +    KAPI_DESCRIPTION("Open or create a file")
> +    KAPI_CONTEXT(KAPI_CTX_PROCESS | KAPI_CTX_SLEEPABLE)
> +    /* ... parameter, error, constraint definitions ... */
> +    KAPI_END_SPEC

So the reason for two completely separate mechanisms is not entirely
clear to me.  The kerneldoc variant is essentially documentation, while
the macro stuff is to be built into the executable?  What if you want
both?

It would be nice to only have one way if at all possible; I'm sure that
crossed your mind at some point :)  If there have to be two, having both
examples describe the same function would make the parallels more clear.

> +System Call Specification
> +-------------------------
> +
> +System calls are documented inline in the implementation file (e.g., ``fs/open.c``)
> +using KAPI-annotated kerneldoc comments. When ``CONFIG_KAPI_RUNTIME_CHECKS`` is
> +enabled, the ``SYSCALL_DEFINEx`` macros automatically look up the specification
> +and validate parameters before and after the syscall executes.
> +
> +IOCTL Specification
> +-------------------
> +
> +IOCTLs use the same annotation approach with additional structure field
> +specifications

This might be a really good place for an example

[...]

> +Usage Examples
> +--------------
> +
> +Query specific API::
> +
> +    $ cat /sys/kernel/debug/kapi/apis/kmalloc/specification
> +    API: kmalloc
> +    Version: 3.0
> +    Description: Allocate kernel memory
> +
> +    Parameters:
> +      [0] size (size_t, in): Number of bytes to allocate
> +          Range: 0 - 4194304
> +      [1] flags (flags, in): Allocation flags (GFP_*)
> +          Mask: 0x1ffffff

Ah, you do document that second parameter somewhere :)

> +    Returns: pointer - Pointer to allocated memory or NULL
> +
> +    Errors:
> +      ENOMEM: Out of memory
> +
> +    Context: process, softirq, hardirq
> +
> +    Side Effects:
> +      - Allocates memory from kernel heap

That part wasn't in your example

> +Export all specifications::
> +
> +    $ cat /sys/kernel/debug/kapi/export/all.json > kernel-apis.json
> +
> +Enable validation for specific API::
> +
> +    $ echo 1 > /sys/kernel/debug/kapi/apis/kmalloc/validate
> +
> +Performance Considerations
> +==========================
> +
> +Memory Overhead
> +---------------
> +
> +Each API specification consumes approximately 400-450KB of memory due to the
> +fixed-size arrays in ``struct kernel_api_spec``. With the current 4 syscall
> +specifications, total memory usage is approximately 1.7MB. Consider:

Ouch.

> +Documentation Generation
> +------------------------
> +
> +The framework exports specifications via debugfs that can be used
> +to generate documentation. Tools for automatic documentation generation
> +from specifications are planned for future development.

Documentation always comes last :)

Interesting stuff.

jon

^ permalink raw reply

* Re: [PATCH 5/9] kernel/api: add API specification for sys_open
From: Jonathan Corbet @ 2026-03-17 18:37 UTC (permalink / raw)
  To: Sasha Levin, Greg Kroah-Hartman
  Cc: linux-api, linux-kernel, linux-doc, linux-fsdevel, linux-kbuild,
	linux-kselftest, workflows, tools, x86, Thomas Gleixner,
	Paul E. McKenney, Dmitry Vyukov, Randy Dunlap, Cyril Hrubis,
	Kees Cook, Jake Edge, David Laight, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann
In-Reply-To: <abQ-iIylzpuqlRv3@laps>

Sasha Levin <sashal@kernel.org> writes:

> On Fri, Mar 13, 2026 at 04:33:57PM +0100, Greg Kroah-Hartman wrote:
>>On Fri, Mar 13, 2026 at 11:09:15AM -0400, Sasha Levin wrote:

>>> + * since-version: 1.0
>>
>>I think since older versions :)
>
> Right. I guess that in my mind 1.0 was the first official "release". I'll
> update it to 0.01.

That kind of raises the question of just what since-version means.  The
version-0.01 (or 1.0) version of open() surely didn't do everything
described in this specification.  So it's saying that some version of
the system call has existed since then?

Thanks,

jon

^ permalink raw reply

* Re: [PATCH 1/9] kernel/api: introduce kernel API specification framework
From: Mauro Carvalho Chehab @ 2026-03-18  6:00 UTC (permalink / raw)
  To: Jonathan Corbet
  Cc: Sasha Levin, linux-api, linux-kernel, linux-doc, linux-fsdevel,
	linux-kbuild, linux-kselftest, workflows, tools, x86,
	Thomas Gleixner, Paul E. McKenney, Greg Kroah-Hartman,
	Dmitry Vyukov, Randy Dunlap, Cyril Hrubis, Kees Cook, Jake Edge,
	David Laight, Askar Safin, Gabriele Paoloni,
	Mauro Carvalho Chehab, Christian Brauner, Alexander Viro,
	Andrew Morton, Masahiro Yamada, Shuah Khan, Ingo Molnar,
	Arnd Bergmann
In-Reply-To: <87h5qe9wig.fsf@trenco.lwn.net>

On Tue, 17 Mar 2026 11:49:27 -0600
Jonathan Corbet <corbet@lwn.net> wrote:

> Sasha Levin <sashal@kernel.org> writes:
> 
> > Add a framework for formally documenting kernel APIs with inline
> > specifications. This framework provides:
> >
> > - Structured API documentation with parameter specifications, return
> >   values, error conditions, and execution context requirements
> > - Runtime validation capabilities for debugging (CONFIG_KAPI_RUNTIME_CHECKS)
> > - Export of specifications via debugfs for tooling integration
> > - Support for both internal kernel APIs and system calls  
> 
> So I'll confess I have only scanned over the implementation, but I have
> some thoughts on the earlier stuff.
> 
> [...]
> 
> > diff --git a/Documentation/dev-tools/kernel-api-spec.rst b/Documentation/dev-tools/kernel-api-spec.rst
> > new file mode 100644
> > index 0000000000000..7c0c1694f1f4a
> > --- /dev/null
> > +++ b/Documentation/dev-tools/kernel-api-spec.rst
> > @@ -0,0 +1,482 @@
> > +.. SPDX-License-Identifier: GPL-2.0
> > +
> > +======================================
> > +Kernel API Specification Framework
> > +======================================
> > +
> > +:Author: Sasha Levin <sashal@kernel.org>
> > +:Date: June 2025  
> 
> Has it not changed since then?
> 
> > +.. contents:: Table of Contents
> > +   :depth: 3
> > +   :local:
> > +
> > +Introduction
> > +============  
> 
> [...]
> 
> > +Usage Guide
> > +===========
> > +
> > +Basic API Specification
> > +-----------------------
> > +
> > +API specifications are written as KAPI-annotated kerneldoc comments directly in
> > +the source file, immediately preceding the function implementation. The ``kapi``
> > +tool extracts these annotations to produce structured specifications.
> > +
> > +.. code-block:: c
> > +
> > +    /**
> > +     * kmalloc - allocate kernel memory
> > +     * @size: Number of bytes to allocate
> > +     * @flags: Allocation flags (GFP_*)  
> 
> Given that the text thus far has talked about user-space API validation,
> it's a bit surprising to see an internal function used as an example.
> 
> Also, maybe it should be kmalloc_obj()?  <runs away>
> 
> > +     * context-flags: KAPI_CTX_PROCESS | KAPI_CTX_SOFTIRQ | KAPI_CTX_HARDIRQ
> > +     * param-count: 2  
> 
> param-count is two, but you only document one of them?
> 
> > +     * param: size
> > +     *   type: KAPI_TYPE_UINT
> > +     *   flags: KAPI_PARAM_IN
> > +     *   constraint-type: KAPI_CONSTRAINT_RANGE
> > +     *   range: 0, KMALLOC_MAX_SIZE
> > +     *
> > +     * error: ENOMEM, Out of memory
> > +     *   desc: Insufficient memory available for the requested allocation.
> > +     */  
> 
> Honest question: can this be made a bit easier for people to create,
> with less shift-key wear?  My biggest worry with a system like this is
> that people won't take the time to create and maintain the entries, so
> anything that would ease the task would help.  Is there an impediment to
> something like:
> 
>   contexts: process, softirq, hardirq
> 
>   param: size
>     type: uint, input
>     constraint: range(0, KMALLOC_MAX_SIZE)
> 
> See what I'm getting at?  ISTM that your DSL could be made a bit less
> verbose and shouty while being just as well defined, but perhaps I'm
> missing something?
> 
> Even better, of course, would be to add a "description" field for each
> parameter, and allow that rather than the @param description that
> kerneldoc currently uses.  That would keep all the information together,
> at the minor cost of adding another significant complication to the
> kernel-doc script.  Mauro won't mind :)

No, I won't ;-) 

It sounds a lot better to use kernel-doc also for kAPI than to have an
independent tool.

It is also very confusing if we end with a kernel-doc-like format
that it is not parsed by kernel-doc.

> 
> > +    void *kmalloc(size_t size, gfp_t flags)
> > +    {
> > +        /* Implementation */
> > +    }
> > +
> > +Alternatively, specifications can be defined using the ``DEFINE_KERNEL_API_SPEC``
> > +macro for compiled-in specs that are stored in the ``.kapi_specs`` ELF section:
> > +
> > +.. code-block:: c
> > +
> > +    #include <linux/kernel_api_spec.h>
> > +
> > +    DEFINE_KERNEL_API_SPEC(sys_open)
> > +    KAPI_DESCRIPTION("Open or create a file")
> > +    KAPI_CONTEXT(KAPI_CTX_PROCESS | KAPI_CTX_SLEEPABLE)
> > +    /* ... parameter, error, constraint definitions ... */
> > +    KAPI_END_SPEC  
> 
> So the reason for two completely separate mechanisms is not entirely
> clear to me.  The kerneldoc variant is essentially documentation, while
> the macro stuff is to be built into the executable?  What if you want
> both?

You can easily add support at kernel-doc to output such macros.

All you need is to create a new class derived from OutputFormat and
make it produce any different output format, including:

    #include <linux/kernel_api_spec.h>

    DEFINE_KERNEL_API_SPEC(sys_open)
    KAPI_DESCRIPTION("Open or create a file")
    KAPI_CONTEXT(KAPI_CTX_PROCESS | KAPI_CTX_SLEEPABLE)
    /* ... parameter, error, constraint definitions ... */
    KAPI_END_SPEC  

I'd say that converting from such output to `.kapi_specs`` ELF section
itself and/or to sysfs/debugfs - e.g. something that would require to
compile or be linked with Kernel's compiled binaries should be done by a
separate tool, but we should aim to have a singe tool to process 
kernel documentation markups.

It is hard enough to maintain just one tool - and to have people actually
writing documentation. Having a second one to handle it, with a different
format will likely increase a lot the documentation burden.

> It would be nice to only have one way if at all possible; I'm sure that
> crossed your mind at some point :)  If there have to be two, having both
> examples describe the same function would make the parallels more clear.
> 
> > +System Call Specification
> > +-------------------------
> > +
> > +System calls are documented inline in the implementation file (e.g., ``fs/open.c``)
> > +using KAPI-annotated kerneldoc comments. When ``CONFIG_KAPI_RUNTIME_CHECKS`` is
> > +enabled, the ``SYSCALL_DEFINEx`` macros automatically look up the specification
> > +and validate parameters before and after the syscall executes.
> > +
> > +IOCTL Specification
> > +-------------------
> > +
> > +IOCTLs use the same annotation approach with additional structure field
> > +specifications  
> 
> This might be a really good place for an example
> 
> [...]
> 
> > +Usage Examples
> > +--------------
> > +
> > +Query specific API::
> > +
> > +    $ cat /sys/kernel/debug/kapi/apis/kmalloc/specification
> > +    API: kmalloc
> > +    Version: 3.0
> > +    Description: Allocate kernel memory
> > +
> > +    Parameters:
> > +      [0] size (size_t, in): Number of bytes to allocate
> > +          Range: 0 - 4194304
> > +      [1] flags (flags, in): Allocation flags (GFP_*)
> > +          Mask: 0x1ffffff  
> 
> Ah, you do document that second parameter somewhere :)
> 
> > +    Returns: pointer - Pointer to allocated memory or NULL
> > +
> > +    Errors:
> > +      ENOMEM: Out of memory
> > +
> > +    Context: process, softirq, hardirq
> > +
> > +    Side Effects:
> > +      - Allocates memory from kernel heap  
> 
> That part wasn't in your example
> 
> > +Export all specifications::
> > +
> > +    $ cat /sys/kernel/debug/kapi/export/all.json > kernel-apis.json
> > +
> > +Enable validation for specific API::
> > +
> > +    $ echo 1 > /sys/kernel/debug/kapi/apis/kmalloc/validate
> > +
> > +Performance Considerations
> > +==========================
> > +
> > +Memory Overhead
> > +---------------
> > +
> > +Each API specification consumes approximately 400-450KB of memory due to the
> > +fixed-size arrays in ``struct kernel_api_spec``. With the current 4 syscall
> > +specifications, total memory usage is approximately 1.7MB. Consider:  
> 
> Ouch.
> 
> > +Documentation Generation
> > +------------------------
> > +
> > +The framework exports specifications via debugfs that can be used
> > +to generate documentation. Tools for automatic documentation generation
> > +from specifications are planned for future development.  
> 
> Documentation always comes last :)
> 
> Interesting stuff.
> 
> jon



Thanks,
Mauro

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox