From: atwellwea@gmail.com
To: netdev@vger.kernel.org, davem@davemloft.net, kuba@kernel.org,
pabeni@redhat.com, edumazet@google.com, ncardwell@google.com
Cc: linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
linux-trace-kernel@vger.kernel.org, mptcp@lists.linux.dev,
dsahern@kernel.org, horms@kernel.org, kuniyu@google.com,
andrew+netdev@lunn.ch, willemdebruijn.kernel@gmail.com,
jasowang@redhat.com, skhan@linuxfoundation.org, corbet@lwn.net,
matttbe@kernel.org, martineau@kernel.org, geliang@kernel.org,
rostedt@goodmis.org, mhiramat@kernel.org,
mathieu.desnoyers@efficios.com, 0x7f454c46@gmail.com
Subject: [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests
Date: Sat, 14 Mar 2026 14:13:46 -0600 [thread overview]
Message-ID: <20260314201348.1786972-13-atwellwea@gmail.com> (raw)
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>
From: Wesley Atwell <atwellwea@gmail.com>
Add a test-only TUN ioctl that inflates RX skb->truesize, plus the
packetdrill-side helper needed to drive that ioctl through packetdrill's
own TUN queue file descriptor.
Use that plumbing to cover the receive-window regressions where
scaling_ratio drifts after advertisement, alongside the baseline too-big
packetdrill cases that exercise the same sender-visible rwnd accounting
from the non-injected path.
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
drivers/net/tun.c | 65 ++++++++
include/uapi/linux/if_tun.h | 4 +
.../tcp_rcv_neg_window_truesize.pkt | 143 ++++++++++++++++++
.../net/packetdrill/tcp_rcv_toobig.pkt | 35 +++++
.../packetdrill/tcp_rcv_toobig_default.pkt | 97 ++++++++++++
.../tcp_rcv_toobig_default_truesize.pkt | 118 +++++++++++++++
.../tcp_rcv_wnd_shrink_allowed_truesize.pkt | 49 ++++++
tools/testing/selftests/net/tun.c | 140 ++++++++++++++++-
8 files changed, 650 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c492fda6fc15..2cef62cebe88 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -53,6 +53,7 @@
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/if_vlan.h>
+#include <linux/overflow.h>
#include <linux/crc32.h>
#include <linux/math.h>
#include <linux/nsproxy.h>
@@ -85,8 +86,13 @@
#include "tun_vnet.h"
+struct tun_file;
+
+#define TUNSETTRUESIZE_OLD _IOW('T', 228, unsigned int)
+
static void tun_default_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *cmd);
+static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb);
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
@@ -138,6 +144,7 @@ struct tun_file {
u16 queue_index;
unsigned int ifindex;
};
+ u32 rx_extra_truesize;
struct napi_struct napi;
bool napi_enabled;
bool napi_frags_enabled;
@@ -1817,6 +1824,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
goto free_skb;
}
+ tun_rx_update_truesize(tfile, skb);
switch (tun->flags & TUN_TYPE_MASK) {
case IFF_TUN:
if (tun->flags & IFF_NO_PI) {
@@ -2373,6 +2381,25 @@ static void tun_put_page(struct tun_page *tpage)
__page_frag_cache_drain(tpage->page, tpage->count);
}
+/* Test hook: add the per-queue rx_extra_truesize bias to an ingress skb so
+ * selftests can exercise receive-memory accounting against a scaling_ratio
+ * that drifts after a window was advertised. A zero bias, which is what
+ * tun_chr_open() installs, leaves the skb untouched. The sum saturates at
+ * UINT_MAX instead of wrapping.
+ */
+static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb)
+{
+	unsigned int inflated;
+	u32 bias;
+
+	/* One lockless snapshot of the knob; it may be rewritten concurrently
+	 * by the ioctl path.
+	 */
+	bias = READ_ONCE(tfile->rx_extra_truesize);
+	if (bias) {
+		if (check_add_overflow(skb->truesize, bias, &inflated))
+			inflated = UINT_MAX;
+		skb->truesize = inflated;
+	}
+}
+
static int tun_xdp_one(struct tun_struct *tun,
struct tun_file *tfile,
struct xdp_buff *xdp, int *flush,
@@ -2459,6 +2486,7 @@ static int tun_xdp_one(struct tun_struct *tun,
goto out;
}
+ tun_rx_update_truesize(tfile, skb);
skb->protocol = eth_type_trans(skb, tun->dev);
skb_reset_network_header(skb);
skb_probe_transport_header(skb);
@@ -3045,6 +3073,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
struct tun_struct *tun;
void __user* argp = (void __user*)arg;
unsigned int carrier;
+ unsigned int extra_truesize;
struct ifreq ifr;
kuid_t owner;
kgid_t group;
@@ -3309,6 +3338,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = tun_net_change_carrier(tun->dev, (bool)carrier);
break;
+	/* TUNSETTRUESIZE carries the byte count directly in the ioctl
+	 * argument; TUNSETTRUESIZE_OLD is the legacy encoding in which the
+	 * argument points at an unsigned int in user memory. Either way the
+	 * value is stored in this queue file and applied to later RX skbs.
+	 */
+	case TUNSETTRUESIZE:
+	case TUNSETTRUESIZE_OLD:
+		ret = -EPERM;
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			goto unlock;
+
+		if (cmd == TUNSETTRUESIZE_OLD) {
+			/* A faulting user pointer is an error. Never fall
+			 * back to treating the address itself as the value.
+			 */
+			ret = -EFAULT;
+			if (copy_from_user(&extra_truesize, argp,
+					   sizeof(extra_truesize)))
+				goto unlock;
+		} else {
+			/* The scalar has to fit the u32 per-queue field. */
+			ret = -EINVAL;
+			if (arg > U32_MAX)
+				goto unlock;
+
+			extra_truesize = arg;
+		}
+
+		WRITE_ONCE(tfile->rx_extra_truesize, extra_truesize);
+		netif_info(tun, drv, tun->dev,
+			   "rx extra truesize set to %u\n", extra_truesize);
+		ret = 0;
+		break;
+
case TUNGETDEVNETNS:
ret = -EPERM;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -3348,6 +3411,7 @@ static long tun_chr_compat_ioctl(struct file *file,
case TUNGETSNDBUF:
case TUNSETSNDBUF:
case SIOCGIFHWADDR:
+ case TUNSETTRUESIZE_OLD:
case SIOCSIFHWADDR:
arg = (unsigned long)compat_ptr(arg);
break;
@@ -3408,6 +3472,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
RCU_INIT_POINTER(tfile->tun, NULL);
tfile->flags = 0;
tfile->ifindex = 0;
+ tfile->rx_extra_truesize = 0;
init_waitqueue_head(&tfile->socket.wq.wait);
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 79d53c7a1ebd..4be63efe6540 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,10 @@
#define TUNSETFILTEREBPF _IOR('T', 225, int)
#define TUNSETCARRIER _IOW('T', 226, int)
#define TUNGETDEVNETNS _IO('T', 227)
+/* Test-only: the ioctl argument is a byte count passed by value, not a
+ * pointer. It is added to skb->truesize on RX after TUN allocates an skb.
+ */
+#define TUNSETTRUESIZE _IO('T', 228)
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
new file mode 100644
index 000000000000..1c5550fff509
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+// Run the negative-window / max-advertised-window regression with inflated
+// TUN skb->truesize so scaling_ratio drifts throughout the flow. The sequence
+// checks and drop counters should remain identical to the uninflated case.
+
+--mss=1000
+
+`./defaults.sh`
+
+// Reset the nstat history so the counter checks below only see increments
+// caused by this script.
+ 0 `nstat -n`
+
+// Establish a connection.
+ +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1000000], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+// Three-way handshake; the SYN-ACK is expected to carry wscale 4.
+ +0 < S 0:0(0) win 32792 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 4>
+ +0 < . 1:1(0) ack 1 win 257
+
+ +0 accept(3, ..., ...) = 4
+
+// Put 1040000 bytes into the receive buffer: 16 segments of 65000 bytes,
+// each acknowledged and none of them read yet.
+ +0 < P. 1:65001(65000) ack 1 win 257
+ * > . 1:1(0) ack 65001
+ +0 < P. 65001:130001(65000) ack 1 win 257
+ * > . 1:1(0) ack 130001
+ +0 < P. 130001:195001(65000) ack 1 win 257
+ * > . 1:1(0) ack 195001
+ +0 < P. 195001:260001(65000) ack 1 win 257
+ * > . 1:1(0) ack 260001
+ +0 < P. 260001:325001(65000) ack 1 win 257
+ * > . 1:1(0) ack 325001
+ +0 < P. 325001:390001(65000) ack 1 win 257
+ * > . 1:1(0) ack 390001
+ +0 < P. 390001:455001(65000) ack 1 win 257
+ * > . 1:1(0) ack 455001
+ +0 < P. 455001:520001(65000) ack 1 win 257
+ * > . 1:1(0) ack 520001
+ +0 < P. 520001:585001(65000) ack 1 win 257
+ * > . 1:1(0) ack 585001
+ +0 < P. 585001:650001(65000) ack 1 win 257
+ * > . 1:1(0) ack 650001
+ +0 < P. 650001:715001(65000) ack 1 win 257
+ * > . 1:1(0) ack 715001
+ +0 < P. 715001:780001(65000) ack 1 win 257
+ * > . 1:1(0) ack 780001
+ +0 < P. 780001:845001(65000) ack 1 win 257
+ * > . 1:1(0) ack 845001
+ +0 < P. 845001:910001(65000) ack 1 win 257
+ * > . 1:1(0) ack 910001
+ +0 < P. 910001:975001(65000) ack 1 win 257
+ * > . 1:1(0) ack 975001
+ +0 < P. 975001:1040001(65000) ack 1 win 257
+ * > . 1:1(0) ack 1040001
+
+// Start inflating future TUN skbs only after the baseline sender-visible
+// window has been established, so the negative-window checks below exercise
+// ratio drift without changing the initial max advertised window.
+ +0 `../tun --set-rx-truesize tun0 65536`
+
+// Trigger an extreme memory squeeze by shrinking SO_RCVBUF.
+ +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [16000], 4) = 0
+
+// The next 65000-byte segment is not acknowledged: the ACK stays at 1040001
+// and advertises a zero window.
+ +0 < P. 1040001:1105001(65000) ack 1 win 257
+ * > . 1:1(0) ack 1040001 win 0
+// Check LINUX_MIB_TCPRCVQDROP has been incremented.
+ +0 `nstat -s | grep TcpExtTCPRcvQDrop | grep -q " 1 "`
+
+// RWIN == 0: rcv_wup = 1040001, rcv_wnd = 0, rcv_mwnd_seq > 1105001.
+
+// Accept pure ack with seq in max adv. window.
+ +0 write(4, ..., 1000) = 1000
+ +0 > P. 1:1001(1000) ack 1040001 win 0
+ +0 < . 1105001:1105001(0) ack 1001 win 257
+
+// In order segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW).
+ +0 < P. 1040001:1041001(1000) ack 1001 win 257
+ +0 > . 1001:1001(0) ack 1040001 win 0
+// Ooo partial segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW).
+ +0 < P. 1039001:1041001(2000) ack 1001 win 257
+ +0 > . 1001:1001(0) ack 1040001 win 0 <nop,nop,sack 1039001:1040001>
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented twice.
+ +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 2 "`
+
+// Ooo segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_OVERWINDOW).
+ +0 < P. 1105001:1106001(1000) ack 1001 win 257
+ +0 > . 1001:1001(0) ack 1040001 win 0
+// Ooo segment, beyond max adv. window -> drop (SKB_DROP_REASON_TCP_INVALID_SEQUENCE).
+ +0 < P. 2000001:2001001(1000) ack 1001 win 257
+ +0 > . 1001:1001(0) ack 1040001 win 0
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice.
+ +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 2 "`
+
+// Read all data.
+ +0 read(4, ..., 2000000) = 1040000
+ * > . 1001:1001(0) ack 1040001
+
+// RWIN > 0: rcv_wup = 1040001, 0 < rcv_wnd < 32000, rcv_mwnd_seq > 1105001.
+
+// Accept pure ack with seq in max adv. window, beyond adv. window.
+ +0 write(4, ..., 1000) = 1000
+ +0 > P. 1001:2001(1000) ack 1040001
+ +0 < . 1105001:1105001(0) ack 2001 win 257
+
+// In order segment, in max adv. window, in adv. window -> accept.
+ +0 < P. 1040001:1041001(1000) ack 2001 win 257
+ * > . 2001:2001(0) ack 1041001
+
+// Ooo partial segment, in adv. window -> accept.
+ +0 < P. 1040001:1042001(2000) ack 2001 win 257
+ * > . 2001:2001(0) ack 1042001 <nop,nop,sack 1040001:1041001>
+
+// Ooo segment, in max adv. window, beyond adv. window -> drop.
+ +0 < P. 1105001:1106001(1000) ack 2001 win 257
+ +0 > . 2001:2001(0) ack 1042001
+// Ooo segment, beyond max adv. window, beyond adv. window -> drop.
+ +0 < P. 2000001:2001001(1000) ack 2001 win 257
+ +0 > . 2001:2001(0) ack 1042001
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice more.
+ +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 4 "`
+
+// We are allowed to go beyond the window and buffer with one packet.
+ +0 < P. 1042001:1062001(20000) ack 2001 win 257
+ * > . 2001:2001(0) ack 1062001
+ +0 < P. 1062001:1082001(20000) ack 2001 win 257
+ * > . 2001:2001(0) ack 1082001 win 0
+
+// But not more: in-order segment, in max adv. window -> drop.
+ +0 < P. 1082001:1083001(1000) ack 2001 win 257
+ * > . 2001:2001(0) ack 1082001
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented again.
+ +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 3 "`
+
+// Another ratio drop must not change the final zero-window decision.
+// The injected extra truesize is raised from 65536 to 131072 here.
+ +0 `../tun --set-rx-truesize tun0 131072`
+
+ +0 < P. 1082001:1083001(1000) ack 2001 win 257
+ * > . 2001:2001(0) ack 1082001
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented once more.
+ +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 4 "`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
new file mode 100644
index 000000000000..837ba3633752
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+// Admission of a segment that is larger than the receive buffer: it is not
+// acknowledged while earlier data is still queued, and it is accepted once
+// the receive queue has been drained, without sk_rcvbuf ending up at zero.
+
+--mss=1000
+
+`./defaults.sh`
+
+ 0 `nstat -n`
+
+// Establish a connection.
+ +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [20000], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 win 18980 <mss 1460,nop,wscale 0>
+ +.1 < . 1:1(0) ack 1 win 257
+
+ +0 accept(3, ..., ...) = 4
+
+// Queue 20000 bytes and leave them unread.
+ +0 < P. 1:20001(20000) ack 1 win 257
+ +.04 > . 1:1(0) ack 20001 win 18000
+
+// Shrink SO_RCVBUF while that data is still queued, then offer a 60000-byte
+// segment. The ACK stays at 20001, i.e. the segment was not accepted.
+ +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [12000], 4) = 0
+ +0 < P. 20001:80001(60000) ack 1 win 257
+ +0 > . 1:1(0) ack 20001 win 18000
+
+// Drain the receive queue.
+ +0 read(4, ..., 20000) = 20000
+
+// A too big packet is accepted if the receive queue is empty, but the
+// stronger admission path must not zero the receive buffer while doing so.
+ +0 < P. 20001:80001(60000) ack 1 win 257
+ * > . 1:1(0) ack 80001 win 0
+ +0 %{ assert SK_MEMINFO_RCVBUF > 0, SK_MEMINFO_RCVBUF }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
new file mode 100644
index 000000000000..b2e4950e0b83
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+// NOTE(review): tcp_moderate_rcvbuf=0 presumably keeps receive-buffer
+// autotuning from growing sk_rcvbuf during the warmup below - confirm.
+`./defaults.sh
+sysctl -q net.ipv4.tcp_moderate_rcvbuf=0`
+
+// Establish a connection on the default receive buffer. Leave a large skb in
+// the queue, then deliver another one which still fits the remaining rwnd.
+// We should grow sk_rcvbuf to honor the already-advertised window instead of
+// dropping the packet.
+ +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <...>
+ +.1 < . 1:1(0) ack 1 win 257
+
+ +0 accept(3, ..., ...) = 4
+
+// Exchange enough data to get past the completely fresh-socket case while
+// still keeping the receive buffer at its 128kB default.
+// This is 16 rounds of one 65000-byte segment, each read back immediately.
+ +0 < P. 1:65001(65000) ack 1 win 257
+ * > . 1:1(0) ack 65001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 65001:130001(65000) ack 1 win 257
+ * > . 1:1(0) ack 130001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 130001:195001(65000) ack 1 win 257
+ * > . 1:1(0) ack 195001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 195001:260001(65000) ack 1 win 257
+ * > . 1:1(0) ack 260001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 260001:325001(65000) ack 1 win 257
+ * > . 1:1(0) ack 325001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 325001:390001(65000) ack 1 win 257
+ * > . 1:1(0) ack 390001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 390001:455001(65000) ack 1 win 257
+ * > . 1:1(0) ack 455001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 455001:520001(65000) ack 1 win 257
+ * > . 1:1(0) ack 520001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 520001:585001(65000) ack 1 win 257
+ * > . 1:1(0) ack 585001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 585001:650001(65000) ack 1 win 257
+ * > . 1:1(0) ack 650001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 650001:715001(65000) ack 1 win 257
+ * > . 1:1(0) ack 715001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 715001:780001(65000) ack 1 win 257
+ * > . 1:1(0) ack 780001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 780001:845001(65000) ack 1 win 257
+ * > . 1:1(0) ack 845001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 845001:910001(65000) ack 1 win 257
+ * > . 1:1(0) ack 910001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 910001:975001(65000) ack 1 win 257
+ * > . 1:1(0) ack 975001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 975001:1040001(65000) ack 1 win 257
+ * > . 1:1(0) ack 1040001
+ +0 read(4, ..., 65000) = 65000
+
+// Leave about 60kB queued, then accept another large skb which still fits
+// the rwnd we already exposed to the peer. The regression is the drop; the
+// exact sk_rcvbuf growth path is an implementation detail.
+ +0 < P. 1040001:1102001(62000) ack 1 win 257
+ * > . 1:1(0) ack 1102001
+
+ +0 < P. 1102001:1167001(65000) ack 1 win 257
+ * > . 1:1(0) ack 1167001
+// Both segments must be readable: 62000 + 65000 = 127000 bytes.
+ +0 read(4, ..., 127000) = 127000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
new file mode 100644
index 000000000000..c2ebe11d75f7
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+// NOTE(review): tcp_moderate_rcvbuf=0 presumably keeps receive-buffer
+// autotuning from growing sk_rcvbuf during the warmup below - confirm.
+`./defaults.sh
+sysctl -q net.ipv4.tcp_moderate_rcvbuf=0`
+
+// Establish a connection on the default receive buffer. The warmup traffic
+// keeps the socket in the normal data path without changing its default
+// sk_rcvbuf. Then inflate skb->truesize on future TUN RX packets so the live
+// scaling_ratio drops after we already exposed a larger rwnd to the peer.
+// The follow-up packet should still be admitted, and tcp_clamp_window() should
+// grow sk_rcvbuf to honor the sender-visible window instead of dropping data.
+ +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+ +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <...>
+ +.1 < . 1:1(0) ack 1 win 257
+
+ +0 accept(3, ..., ...) = 4
+
+// Exchange enough data to get past the completely fresh-socket case while
+// still keeping the receive buffer at its initial default.
+// This is 16 rounds of one 65000-byte segment, each read back immediately.
+ +0 < P. 1:65001(65000) ack 1 win 257
+ * > . 1:1(0) ack 65001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 65001:130001(65000) ack 1 win 257
+ * > . 1:1(0) ack 130001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 130001:195001(65000) ack 1 win 257
+ * > . 1:1(0) ack 195001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 195001:260001(65000) ack 1 win 257
+ * > . 1:1(0) ack 260001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 260001:325001(65000) ack 1 win 257
+ * > . 1:1(0) ack 325001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 325001:390001(65000) ack 1 win 257
+ * > . 1:1(0) ack 390001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 390001:455001(65000) ack 1 win 257
+ * > . 1:1(0) ack 455001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 455001:520001(65000) ack 1 win 257
+ * > . 1:1(0) ack 520001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 520001:585001(65000) ack 1 win 257
+ * > . 1:1(0) ack 585001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 585001:650001(65000) ack 1 win 257
+ * > . 1:1(0) ack 650001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 650001:715001(65000) ack 1 win 257
+ * > . 1:1(0) ack 715001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 715001:780001(65000) ack 1 win 257
+ * > . 1:1(0) ack 780001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 780001:845001(65000) ack 1 win 257
+ * > . 1:1(0) ack 845001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 845001:910001(65000) ack 1 win 257
+ * > . 1:1(0) ack 910001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 910001:975001(65000) ack 1 win 257
+ * > . 1:1(0) ack 975001
+ +0 read(4, ..., 65000) = 65000
+
+ +0 < P. 975001:1040001(65000) ack 1 win 257
+ * > . 1:1(0) ack 1040001
+ +0 read(4, ..., 65000) = 65000
+
+// Remember sk_rcvbuf as it is before any truesize inflation.
+ +0 %{ base_rcvbuf = SK_MEMINFO_RCVBUF }%
+
+// Leave about 60kB queued, then make future TUN skbs look more expensive in
+// two steps. Both inflated skbs still fit the already-advertised window and
+// must be admitted, and sk_rcvbuf should keep growing as the live
+// scaling_ratio drops further.
+ +0 < P. 1040001:1102001(62000) ack 1 win 257
+ * > . 1:1(0) ack 1102001
+
+// Step 1: request 4096 extra bytes of truesize per TUN RX skb.
+ +0 `../tun --set-rx-truesize tun0 4096`
+
+ +0 < P. 1102001:1167001(65000) ack 1 win 257
+ * > . 1:1(0) ack 1167001
+ +0 %{ assert SK_MEMINFO_RCVBUF > base_rcvbuf, (base_rcvbuf, SK_MEMINFO_RCVBUF) }%
+ +0 %{ small_rcvbuf = SK_MEMINFO_RCVBUF }%
+
+ +0 < P. 1167001:1229001(62000) ack 1 win 257
+ * > . 1:1(0) ack 1229001
+
+// Step 2: raise the extra truesize to 65536 bytes per TUN RX skb.
+ +0 `../tun --set-rx-truesize tun0 65536`
+
+ +0 < P. 1229001:1294001(65000) ack 1 win 257
+ * > . 1:1(0) ack 1294001
+ +0 %{ assert SK_MEMINFO_RCVBUF > small_rcvbuf, (base_rcvbuf, small_rcvbuf, SK_MEMINFO_RCVBUF) }%
+
+ +0 < P. 1294001:1356001(62000) ack 1 win 257
+ * > . 1:1(0) ack 1356001
+// Five unread segments are queued here (3 x 62000 + 2 x 65000 = 316000
+// bytes); this read consumes the first 254000 of them.
+ +0 read(4, ..., 254000) = 254000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
new file mode 100644
index 000000000000..08da5fddaa12
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+// Run with tcp_shrink_window enabled and tcp_rmem set to
+// "4096 32768 32MiB".
+`./defaults.sh
+sysctl -q net.ipv4.tcp_shrink_window=1
+sysctl -q net.ipv4.tcp_rmem="4096 32768 $((32*1024*1024))"`
+
+ 0 `nstat -n`
+
+// Establish a connection. After the first payload we know the peer has seen a
+// scaled receive window reaching sequence 25361. Inflate later TUN skbs in two
+// steps so the live scaling_ratio drops more than once, then verify that:
+// 1) a segment one byte beyond the max advertised window is still dropped,
+// 2) a segment exactly using the previously advertised max window is still
+// accepted even though the current live ratio no longer matches that
+// original advertisement basis.
+ +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+ +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+ +0 bind(3, ..., ...) = 0
+ +0 listen(3, 1) = 0
+
+// The SYN-ACK is expected to carry wscale 10, so one window unit is 1024 bytes.
+ +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+ +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 10>
+ +0 < . 1:1(0) ack 1 win 257
+
+ +0 accept(3, ..., ...) = 4
+
+ +0 < P. 1:10001(10000) ack 1 win 257
+ * > . 1:1(0) ack 10001 win 15
+
+// Max window seq advertised here is 10001 + 15*1024 = 25361.
+ +0 `../tun --set-rx-truesize tun0 4096`
+
+ +0 < P. 10001:11024(1023) ack 1 win 257
+ * > . 1:1(0) ack 11024
+
+ +0 `../tun --set-rx-truesize tun0 65536`
+
+// Segment beyond the max window stays invalid even after ratio drift.
+// It ends at 25362, one byte past 25361, and the ACK stays at 11024.
+ +0 < P. 11024:25362(14338) ack 1 win 257
+ * > . 1:1(0) ack 11024
+
+// Segment exactly using the max window must still be accepted.
+ +0 < P. 11024:25361(14337) ack 1 win 257
+ * > . 1:1(0) ack 25361
+
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented once.
+ +0 `nstat | grep TcpExtBeyondWindow | grep -q " 1 "`
diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index cf106a49b55e..473992b3784d 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -2,14 +2,17 @@
#define _GNU_SOURCE
+#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
+#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/if_tun.h>
#include <sys/ioctl.h>
+#include <sys/syscall.h>
#include <sys/socket.h>
#include "kselftest_harness.h"
@@ -174,6 +177,135 @@ static int tun_delete(char *dev)
return ip_link_del(dev);
}
+/* Return true when @name consists solely of ASCII digits. An empty string
+ * contains no offending character and therefore also yields true.
+ */
+static bool is_numeric_name(const char *name)
+{
+	const char *p = name;
+
+	while (*p >= '0' && *p <= '9')
+		p++;
+
+	/* All characters were digits iff the scan reached the terminator. */
+	return *p == '\0';
+}
+
+/* Parse @fd_name as a decimal descriptor number and duplicate that descriptor
+ * out of the process referred to by @pidfd via the pidfd_getfd syscall.
+ * Returns the syscall's result, or -1 with errno set to EINVAL when @fd_name
+ * does not parse cleanly or exceeds INT_MAX.
+ */
+static int packetdrill_dup_fd(int pidfd, const char *fd_name)
+{
+	unsigned long target;
+	char *stop;
+
+	errno = 0;
+	target = strtoul(fd_name, &stop, 10);
+
+	/* Conversion error, trailing junk and out-of-range all count as bad. */
+	if (errno != 0 || *stop != '\0' || target > INT_MAX) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	return syscall(SYS_pidfd_getfd, pidfd, (int)target, 0);
+}
+
+/* Locate the TUN queue descriptor that process @pid holds for interface
+ * @ifname and return a duplicate of it in this process.
+ *
+ * Every numeric entry of /proc/@pid/fd is duplicated with
+ * packetdrill_dup_fd() and probed with TUNGETIFF. The first descriptor whose
+ * interface name equals @ifname is returned; the caller must close() it.
+ *
+ * Returns -1 with errno set on failure. When the scan completes without a
+ * match, errno is ENOENT if no candidate reported an error, otherwise the
+ * error of the last failed duplication or TUNGETIFF probe.
+ */
+static int open_packetdrill_tunfd(pid_t pid, const char *ifname)
+{
+	char fd_dir[PATH_MAX];
+	struct dirent *dent;
+	struct ifreq ifr;
+	int saved_errno = ENOENT;
+	int found = -1;
+	int pidfd;
+	DIR *dir;
+
+	snprintf(fd_dir, sizeof(fd_dir), "/proc/%ld/fd", (long)pid);
+
+	pidfd = syscall(SYS_pidfd_open, pid, 0);
+	if (pidfd < 0)
+		return -1;
+
+	dir = opendir(fd_dir);
+	if (!dir) {
+		/* Report opendir()'s error even if close() touches errno. */
+		saved_errno = errno;
+		close(pidfd);
+		errno = saved_errno;
+		return -1;
+	}
+
+	while ((dent = readdir(dir))) {
+		int fd;
+
+		if (!is_numeric_name(dent->d_name))
+			continue;
+
+		/* Reopen via pidfd_getfd() so we duplicate packetdrill's attached
+		 * queue file, instead of opening a fresh /dev/net/tun instance.
+		 */
+		fd = packetdrill_dup_fd(pidfd, dent->d_name);
+		if (fd < 0) {
+			saved_errno = errno;
+			continue;
+		}
+
+		memset(&ifr, 0, sizeof(ifr));
+		if (ioctl(fd, TUNGETIFF, &ifr)) {
+			/* Only a failed probe updates the reported error.
+			 * errno is not cleared by a successful ioctl(), so it
+			 * must not be consulted on a mere name mismatch.
+			 */
+			saved_errno = errno;
+		} else if (!strncmp(ifr.ifr_name, ifname, IFNAMSIZ)) {
+			found = fd;
+			break;
+		}
+
+		close(fd);
+	}
+
+	close(pidfd);
+	closedir(dir);
+	if (found < 0)
+		errno = saved_errno;
+	return found;
+}
+
+/* Set the RX truesize bias on the TUN queue owned by packetdrill.
+ *
+ * @ifname: TUN interface name to look for among packetdrill's descriptors.
+ * @value:  byte count as text; parsed with base auto-detection and required
+ *          to start with a digit and to fit in an unsigned int.
+ *
+ * Packetdrill owns the TUN queue fd, so drive the test ioctl through that
+ * exact file descriptor found under /proc/$PACKETDRILL_PID/fd.
+ *
+ * Returns 0 on success and 1 on any failure, after printing a diagnostic to
+ * stderr.
+ */
+static int packetdrill_set_rx_truesize(const char *ifname, const char *value)
+{
+	char *packetdrill_pid, *end;
+	unsigned long long tmp;
+	unsigned int extra;
+	pid_t pid;
+	int fd;
+
+	packetdrill_pid = getenv("PACKETDRILL_PID");
+	if (!packetdrill_pid || !*packetdrill_pid) {
+		fprintf(stderr, "PACKETDRILL_PID is not set\n");
+		return 1;
+	}
+
+	errno = 0;
+	tmp = strtoull(packetdrill_pid, &end, 10);
+	if (errno || *end || !tmp || tmp > INT_MAX) {
+		fprintf(stderr, "invalid PACKETDRILL_PID: %s\n", packetdrill_pid);
+		return 1;
+	}
+	pid = (pid_t)tmp;
+
+	/* strtoull() accepts an empty string (yielding 0), leading blanks and
+	 * a sign (a negated value wraps around), so insist on a leading digit
+	 * before trusting the conversion.
+	 */
+	if (value[0] < '0' || value[0] > '9') {
+		fprintf(stderr, "invalid truesize value: %s\n", value);
+		return 1;
+	}
+
+	errno = 0;
+	tmp = strtoull(value, &end, 0);
+	if (errno || *end || tmp > UINT_MAX) {
+		fprintf(stderr, "invalid truesize value: %s\n", value);
+		return 1;
+	}
+	extra = (unsigned int)tmp;
+
+	fd = open_packetdrill_tunfd(pid, ifname);
+	if (fd < 0) {
+		perror("open_packetdrill_tunfd");
+		return 1;
+	}
+
+	/* The new ioctl takes the value itself as argument, not a pointer. */
+	if (ioctl(fd, TUNSETTRUESIZE, (unsigned long)extra)) {
+		perror("ioctl(TUNSETTRUESIZE)");
+		close(fd);
+		return 1;
+	}
+
+	close(fd);
+	return 0;
+}
+
static int tun_open(char *dev, const int flags, const int hdrlen,
const int features, const unsigned char *mac_addr)
{
@@ -985,4 +1117,10 @@ XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet);
XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet);
XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet);
-TEST_HARNESS_MAIN
+/* Invoked as "<prog> --set-rx-truesize <ifname> <bytes>", act as the
+ * packetdrill helper and skip the tests; any other command line runs the
+ * kselftest harness.
+ */
+int main(int argc, char **argv)
+{
+	if (argc != 4 || strcmp(argv[1], "--set-rx-truesize") != 0)
+		return test_harness_run(argc, argv);
+
+	return packetdrill_set_rx_truesize(argv[2], argv[3]);
+}
--
2.43.0
next prev parent reply other threads:[~2026-03-14 20:15 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-14 20:13 [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 01/14] tcp: factor receive-memory accounting helpers atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 02/14] tcp: snapshot advertise-time scaling for rcv_wnd atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 03/14] tcp: refresh rcv_wnd snapshots at TCP write sites atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 04/14] tcp: snapshot the maximum advertised receive window atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack atwellwea
2026-03-16 11:04 ` Paolo Abeni
2026-03-16 11:24 ` Paolo Abeni
2026-03-16 11:31 ` Paolo Abeni
2026-03-14 20:13 ` [PATCH net-next v2 06/14] tcp: regrow rcvbuf when scaling_ratio drops after advertisement atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 07/14] tcp: honor the maximum advertised window after live retraction atwellwea
2026-03-16 11:44 ` Paolo Abeni
2026-03-14 20:13 ` [PATCH net-next v2 08/14] tcp: extend TCP_REPAIR_WINDOW for live and max-window snapshots atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 09/14] mptcp: refresh TCP receive-window snapshots on subflows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 10/14] tcp: expose rmem and backlog in tcp and mptcp rcvbuf_grow tracepoints atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 11/14] selftests: tcp_ao: cover legacy, v1, and retracted repair windows atwellwea
2026-03-14 20:13 ` atwellwea [this message]
2026-03-15 1:18 ` [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 13/14] netdevsim: add peer RX truesize support for selftests atwellwea
2026-03-15 1:18 ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 14/14] netdevsim: release pinned PSP ext on drop paths atwellwea
2026-03-15 1:19 ` [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift Jakub Kicinski
2026-03-16 11:09 ` Paolo Abeni
[not found] ` <CAN=sVvyNpkyok_bt8eQSmqc4f7g7QoZBUmRmNRLoFz1HasEzMA@mail.gmail.com>
2026-03-16 17:47 ` Paolo Abeni
2026-03-16 18:03 ` Wesley Atwell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260314201348.1786972-13-atwellwea@gmail.com \
--to=atwellwea@gmail.com \
--cc=0x7f454c46@gmail.com \
--cc=andrew+netdev@lunn.ch \
--cc=corbet@lwn.net \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=geliang@kernel.org \
--cc=horms@kernel.org \
--cc=jasowang@redhat.com \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=martineau@kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=matttbe@kernel.org \
--cc=mhiramat@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=rostedt@goodmis.org \
--cc=skhan@linuxfoundation.org \
--cc=willemdebruijn.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.