Linux Trace Kernel

Linux Trace Kernel
 help / color / mirror / Atom feed

* [PATCH 09/15] lib/bootconfig: increment xbc_node_num after node init succeeds
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

Increment xbc_node_num only after xbc_init_node() succeeds, so that a
failed init does not leave a partially initialized node counted in the
array.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 56fbedc9e725..06e8a79ab472 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -429,9 +429,10 @@ static struct xbc_node * __init xbc_add_node(char *data, uint16_t flag)
 	if (xbc_node_num == XBC_NODE_MAX)
 		return NULL;
 
-	node = &xbc_nodes[xbc_node_num++];
+	node = &xbc_nodes[xbc_node_num];
 	if (xbc_init_node(node, data, flag) < 0)
 		return NULL;
+	xbc_node_num++;
 
 	return node;
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH 06/15] lib/bootconfig: narrow flag parameter type from uint32_t to uint16_t
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

The flag parameter in the node creation helpers only ever carries
XBC_KEY (0) or XBC_VALUE (0x8000), both of which fit in uint16_t.
Using uint16_t matches the width of xbc_node.data where the flag is
ultimately stored.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 45db51bc9cc7..34bdc2d13881 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -408,7 +408,7 @@ const char * __init xbc_node_find_next_key_value(struct xbc_node *root,
 
 /* XBC parse and tree build */
 
-static int __init xbc_init_node(struct xbc_node *node, char *data, uint32_t flag)
+static int __init xbc_init_node(struct xbc_node *node, char *data, uint16_t flag)
 {
 	unsigned long offset = data - xbc_data;
 
@@ -422,7 +422,7 @@ static int __init xbc_init_node(struct xbc_node *node, char *data, uint32_t flag
 	return 0;
 }
 
-static struct xbc_node * __init xbc_add_node(char *data, uint32_t flag)
+static struct xbc_node * __init xbc_add_node(char *data, uint16_t flag)
 {
 	struct xbc_node *node;
 
@@ -452,7 +452,7 @@ static inline __init struct xbc_node *xbc_last_child(struct xbc_node *node)
 	return node;
 }
 
-static struct xbc_node * __init __xbc_add_sibling(char *data, uint32_t flag, bool head)
+static struct xbc_node * __init __xbc_add_sibling(char *data, uint16_t flag, bool head)
 {
 	struct xbc_node *sib, *node = xbc_add_node(data, flag);
 
@@ -480,17 +480,17 @@ static struct xbc_node * __init __xbc_add_sibling(char *data, uint32_t flag, boo
 	return node;
 }
 
-static inline struct xbc_node * __init xbc_add_sibling(char *data, uint32_t flag)
+static inline struct xbc_node * __init xbc_add_sibling(char *data, uint16_t flag)
 {
 	return __xbc_add_sibling(data, flag, false);
 }
 
-static inline struct xbc_node * __init xbc_add_head_sibling(char *data, uint32_t flag)
+static inline struct xbc_node * __init xbc_add_head_sibling(char *data, uint16_t flag)
 {
 	return __xbc_add_sibling(data, flag, true);
 }
 
-static inline __init struct xbc_node *xbc_add_child(char *data, uint32_t flag)
+static inline __init struct xbc_node *xbc_add_child(char *data, uint16_t flag)
 {
 	struct xbc_node *node = xbc_add_sibling(data, flag);
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH 14/15] bootconfig: add __packed definition to tools/bootconfig shim header
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

The tools/bootconfig userspace build includes the main bootconfig.h
via a shim header that defines kernel macros for userspace. Add the
__packed macro so the struct xbc_node declaration works after the
conversion from open-coded __attribute__((__packed__)).

Signed-off-by: Josh Law <objecting@objecting.org>
---
 tools/bootconfig/include/linux/bootconfig.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h
index 6784296a0692..41c50ab95ba5 100644
--- a/tools/bootconfig/include/linux/bootconfig.h
+++ b/tools/bootconfig/include/linux/bootconfig.h
@@ -48,6 +48,7 @@ static inline char *strim(char *s)
 
 #define __init
 #define __initdata
+#define __packed	__attribute__((__packed__))
 
 #include "../../../../include/linux/bootconfig.h"
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH 08/15] lib/bootconfig: fix off-by-one in xbc_verify_tree() next node check
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

Valid node indices are 0 to xbc_node_num-1, so a next value equal to
xbc_node_num is out of bounds. Use >= instead of > to catch this.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 58d6ae297280..56fbedc9e725 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -816,7 +816,7 @@ static int __init xbc_verify_tree(void)
 	}
 
 	for (i = 0; i < xbc_node_num; i++) {
-		if (xbc_nodes[i].next > xbc_node_num) {
+		if (xbc_nodes[i].next >= xbc_node_num) {
 			return xbc_parse_error("No closing brace",
 				xbc_node_get_data(xbc_nodes + i));
 		}
-- 
2.34.1


^ permalink raw reply related

* [PATCH 12/15] bootconfig: constify xbc_calc_checksum() data parameter
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

xbc_calc_checksum() only reads the data buffer, so mark the parameter
as const void * and the internal pointer as const unsigned char *.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 include/linux/bootconfig.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h
index c37e0096c4f1..d78c2b62debf 100644
--- a/include/linux/bootconfig.h
+++ b/include/linux/bootconfig.h
@@ -36,9 +36,9 @@ bool __init cmdline_has_extra_options(void);
  * The checksum will be used with the BOOTCONFIG_MAGIC and the size for
  * embedding the bootconfig in the initrd image.
  */
-static inline __init uint32_t xbc_calc_checksum(void *data, uint32_t size)
+static inline __init uint32_t xbc_calc_checksum(const void *data, uint32_t size)
 {
-	unsigned char *p = data;
+	const unsigned char *p = data;
 	uint32_t ret = 0;
 
 	while (size--)
-- 
2.34.1


^ permalink raw reply related

* [PATCH 10/15] lib/bootconfig: drop redundant memset of xbc_nodes
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

memblock_alloc() already returns zeroed memory, so the explicit memset
in xbc_init() is redundant. Switch the userspace xbc_alloc_mem() from
malloc() to calloc() so both paths return zeroed memory, and remove
the separate memset call.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 06e8a79ab472..fe1053043752 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -71,7 +71,7 @@ static inline void __init xbc_free_mem(void *addr, size_t size, bool early)
 
 static inline void *xbc_alloc_mem(size_t size)
 {
-	return malloc(size);
+	return calloc(1, size);
 }
 
 static inline void xbc_free_mem(void *addr, size_t size, bool early)
@@ -982,7 +982,6 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos)
 		_xbc_exit(true);
 		return -ENOMEM;
 	}
-	memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX);
 
 	ret = xbc_parse_tree();
 	if (!ret)
-- 
2.34.1


^ permalink raw reply related

* [PATCH 05/15] lib/bootconfig: fix inconsistent if/else bracing
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

When one branch of a conditional uses braces, both branches should
use them per kernel coding style.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index e955d2f7e7ca..45db51bc9cc7 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -473,8 +473,9 @@ static struct xbc_node * __init __xbc_add_sibling(char *data, uint32_t flag, boo
 				sib->next = xbc_node_index(node);
 			}
 		}
-	} else
+	} else {
 		xbc_parse_error("Too many nodes", data);
+	}
 
 	return node;
 }
@@ -992,8 +993,9 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos)
 		if (emsg)
 			*emsg = xbc_err_msg;
 		_xbc_exit(true);
-	} else
+	} else {
 		ret = xbc_node_num;
+	}
 
 	return ret;
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH 01/15] lib/bootconfig: add missing __init annotations to static helpers
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

skip_comment() and skip_spaces_until_newline() are static functions
called exclusively from __init code paths but lack the __init
annotation themselves. Add it so their memory can be reclaimed after
init.

Signed-off-by: Josh Law <objecting@objecting.org>
---
 lib/bootconfig.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index b0ef1e74e98a..51fd2299ec0f 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -509,7 +509,7 @@ static inline __init bool xbc_valid_keyword(char *key)
 	return *key == '\0';
 }
 
-static char *skip_comment(char *p)
+static char __init *skip_comment(char *p)
 {
 	char *ret;
 
@@ -522,7 +522,7 @@ static char *skip_comment(char *p)
 	return ret;
 }
 
-static char *skip_spaces_until_newline(char *p)
+static char __init *skip_spaces_until_newline(char *p)
 {
 	while (isspace(*p) && *p != '\n')
 		p++;
-- 
2.34.1


^ permalink raw reply related

* [PATCH 03/15] lib/bootconfig: fix typo "uder" in xbc_node_find_next_leaf()
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

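Fix the misspelling "uder" -> "under" in a comment in
xbc_node_find_next_leaf(). No functional change.

Signed-off-by: Josh Law <objecting@objecting.org>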
---
 lib/bootconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 53aedc042f6e..35091617bca5 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -364,7 +364,7 @@ struct xbc_node * __init xbc_node_find_next_leaf(struct xbc_node *root,
 			node = xbc_node_get_parent(node);
 			if (node == root)
 				return NULL;
-			/* User passed a node which is not uder parent */
+			/* User passed a node which is not under parent */
 			if (WARN_ON(!node))
 				return NULL;
 		}
-- 
2.34.1


^ permalink raw reply related

* [PATCH 04/15] lib/bootconfig: add blank line before xbc_get_info() kerneldoc
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

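Separate the closing #endif from the xbc_get_info() kerneldoc comment
with a blank line. No functional change.

Signed-off-by: Josh Law <objecting@objecting.org>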
---
 lib/bootconfig.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 35091617bca5..e955d2f7e7ca 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -79,6 +79,7 @@ static inline void xbc_free_mem(void *addr, size_t size, bool early)
 	free(addr);
 }
 #endif
+
 /**
  * xbc_get_info() - Get the information of loaded boot config
  * @node_size: A pointer to store the number of nodes.
-- 
2.34.1


^ permalink raw reply related

* [PATCH 00/15] bootconfig: fixes, cleanups, and modernization
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law

This series addresses a collection of issues found during a review of
lib/bootconfig.c and include/linux/bootconfig.h, ranging from a
potential crash to coding style and API modernization.

Bug fixes:
  - Fix off-by-one in xbc_verify_tree() where a next-node index equal
    to xbc_node_num passes the bounds check despite being out of range
    (patch 8).
  - Fix xbc_node_get_data() returning NULL on WARN_ON, which causes
    NULL dereferences in callers that pass the result to strlen() or
    strcmp() without checking. Return an empty string instead so the
    WARN_ON remains non-fatal as intended (patch 15).
  - Move xbc_node_num increment to after xbc_init_node() validation
    so a failed init does not leave a partially initialized node
    counted in the array (patch 9).

Correctness:
  - Add missing __init annotations to skip_comment() and
    skip_spaces_until_newline() so their memory can be reclaimed
    after init (patch 1).
  - Narrow the flag parameter in node creation helpers from uint32_t
    to uint16_t to match the xbc_node.data field width (patch 6).
  - Constify the xbc_calc_checksum() data parameter since it only
    reads the buffer (patch 12).

Cleanups:
  - Fix comment typos (patches 2-3), missing blank line before
    kerneldoc (patch 4), inconsistent if/else bracing (patches 5, 7).
  - Drop redundant memset after memblock_alloc which already returns
    zeroed memory; switch the userspace path from malloc to calloc
    to match (patch 10).

Modernization:
  - Replace open-coded __attribute__((__packed__)) with the __packed
    macro, adding the definition to the tools/bootconfig shim header
    (patches 11, 14).
  - Replace the catch-all linux/kernel.h include with the specific
    headers needed: linux/cache.h, linux/compiler.h, and
    linux/sprintf.h (patch 13).

Build-tested with both the in-kernel build (lib/bootconfig.o,
init/main.o) and the userspace tools/bootconfig build. All 70
tools/bootconfig test cases pass.

Josh Law (15):
  lib/bootconfig: add missing __init annotations to static helpers
  lib/bootconfig: fix typo "initiized" in xbc_root_node() kerneldoc
  lib/bootconfig: fix typo "uder" in xbc_node_find_next_leaf()
  lib/bootconfig: add blank line before xbc_get_info() kerneldoc
  lib/bootconfig: fix inconsistent if/else bracing
  lib/bootconfig: narrow flag parameter type from uint32_t to uint16_t
  lib/bootconfig: fix inconsistent if/else bracing in __xbc_add_key()
  lib/bootconfig: fix off-by-one in xbc_verify_tree() next node check
  lib/bootconfig: increment xbc_node_num after node init succeeds
  lib/bootconfig: drop redundant memset of xbc_nodes
  bootconfig: use __packed macro for struct xbc_node
  bootconfig: constify xbc_calc_checksum() data parameter
  lib/bootconfig: replace linux/kernel.h with specific includes
  bootconfig: add __packed definition to tools/bootconfig shim header
  lib/bootconfig: return empty string instead of NULL from
    xbc_node_get_data()

 include/linux/bootconfig.h                  |  6 +--
 lib/bootconfig.c                            | 51 +++++++++++----------
 tools/bootconfig/include/linux/bootconfig.h |  1 +
 3 files changed, 32 insertions(+), 26 deletions(-)

--
2.34.1


^ permalink raw reply

* [PATCH 02/15] lib/bootconfig: fix typo "initiized" in xbc_root_node() kerneldoc
From: Josh Law @ 2026-03-14 21:45 UTC (permalink / raw)
  To: Masami Hiramatsu, Andrew Morton
  Cc: linux-trace-kernel, linux-kernel, Josh Law
In-Reply-To: <20260314214555.96217-1-objecting@objecting.org>

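Fix the misspelling "initiized" -> "initialized" in the xbc_root_node()
kerneldoc comment. No functional change.

Signed-off-by: Josh Law <objecting@objecting.org>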
---
 lib/bootconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bootconfig.c b/lib/bootconfig.c
index 51fd2299ec0f..53aedc042f6e 100644
--- a/lib/bootconfig.c
+++ b/lib/bootconfig.c
@@ -112,7 +112,7 @@ static int __init xbc_parse_error(const char *msg, const char *p)
  * xbc_root_node() - Get the root node of extended boot config
  *
  * Return the address of root node of extended boot config. If the
- * extended boot config is not initiized, return NULL.
+ * extended boot config is not initialized, return NULL.
  */
 struct xbc_node * __init xbc_root_node(void)
 {
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net-next v4 00/13] devlink: introduce shared devlink instance for PFs on same chip
From: patchwork-bot+netdevbpf @ 2026-03-14 20:20 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, edumazet, kuba, pabeni, horms, donald.hunter,
	corbet, skhan, saeedm, leon, tariqt, mbloch, przemyslaw.kitszel,
	mschmidt, andrew+netdev, rostedt, mhiramat, mathieu.desnoyers,
	chuck.lever, matttbe, cjubran, daniel.zahka, linux-doc,
	linux-rdma, linux-trace-kernel
In-Reply-To: <20260312100407.551173-1-jiri@resnulli.us>

Hello:

This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Thu, 12 Mar 2026 11:03:54 +0100 you wrote:
> From: Jiri Pirko <jiri@nvidia.com>
> 
> Multiple PFs on a network adapter often reside on the same physical
> chip, running a single firmware. Some resources and configurations
> are inherently shared among these PFs - PTP clocks, VF group rates,
> firmware parameters, and others. Today there is no good object in
> the devlink model to attach these chip-wide configuration knobs to.
> Drivers resort to workarounds like pinning shared state to PF0 or
> maintaining ad-hoc internal structures (e.g., ice_adapter) that are
> invisible to userspace.
> 
> [...]

Here is the summary with links:
  - [net-next,v4,01/13] devlink: expose devlink instance index over netlink
    https://git.kernel.org/netdev/net-next/c/68deca0f0f4b
  - [net-next,v4,02/13] devlink: add helpers to get bus_name/dev_name
    https://git.kernel.org/netdev/net-next/c/0f5531879afb
  - [net-next,v4,03/13] devlink: avoid extra iterations when found devlink is not registered
    https://git.kernel.org/netdev/net-next/c/e2e3666fd360
  - [net-next,v4,04/13] devlink: allow to use devlink index as a command handle
    https://git.kernel.org/netdev/net-next/c/d85a8af57da8
  - [net-next,v4,05/13] devlink: support index-based lookup via bus_name/dev_name handle
    https://git.kernel.org/netdev/net-next/c/725d5fdb7b9c
  - [net-next,v4,06/13] devlink: support index-based notification filtering
    https://git.kernel.org/netdev/net-next/c/089aeb4f2218
  - [net-next,v4,07/13] devlink: introduce __devlink_alloc() with dev driver pointer
    https://git.kernel.org/netdev/net-next/c/eb32a6310a7b
  - [net-next,v4,08/13] devlink: add devlink_dev_driver_name() helper and use it in trace events
    https://git.kernel.org/netdev/net-next/c/20b0f383aae7
  - [net-next,v4,09/13] devlink: add devl_warn() helper and use it in port warnings
    https://git.kernel.org/netdev/net-next/c/104733e1303e
  - [net-next,v4,10/13] devlink: allow devlink instance allocation without a backing device
    https://git.kernel.org/netdev/net-next/c/a4c6d53e5fd6
  - [net-next,v4,11/13] devlink: introduce shared devlink instance for PFs on same chip
    https://git.kernel.org/netdev/net-next/c/1850e76b3804
  - [net-next,v4,12/13] documentation: networking: add shared devlink documentation
    https://git.kernel.org/netdev/net-next/c/63fff8c0f702
  - [net-next,v4,13/13] net/mlx5: Add a shared devlink instance for PFs on same chip
    https://git.kernel.org/netdev/net-next/c/2a8c8a03f306

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* [PATCH net-next v2 14/14] netdevsim: release pinned PSP ext on drop paths
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

nsim_do_psp() can leave an extra extension reference pinned until
nsim_psp_handle_ext() reattaches it to a forwarded skb.

Route the drop paths through a common helper and release that extra
reference when __dev_forward_skb() fails and when start_xmit() drops the
skb before it reaches the peer RX path.

This fix is separate from the peer RX truesize test hook itself.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 drivers/net/netdevsim/netdev.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 22238df79b6a..c22513c523d6 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -187,6 +187,15 @@ static int nsim_napi_rx(struct net_device *tx_dev, struct net_device *rx_dev,
 	return NET_RX_SUCCESS;
 }
 
+/* nsim_do_psp() pins an extra extension ref until nsim_psp_handle_ext()
+ * reattaches it to a forwarded skb.
+ */
+static void nsim_psp_ext_put(struct skb_ext *psp_ext)
+{
+	if (psp_ext)
+		__skb_ext_put(psp_ext);
+}
+
 static int nsim_forward_skb(struct net_device *tx_dev,
 			    struct net_device *rx_dev,
 			    struct sk_buff *skb,
@@ -196,8 +205,10 @@ static int nsim_forward_skb(struct net_device *tx_dev,
 	int ret;
 
 	ret = __dev_forward_skb(rx_dev, skb);
-	if (ret)
+	if (ret) {
+		nsim_psp_ext_put(psp_ext);
 		return ret;
+	}
 
 	nsim_psp_handle_ext(skb, psp_ext);
 
@@ -278,11 +289,8 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		 * the synthetic cost; sender-side skb accounting stays put.
 		 */
 		nskb = skb_clone(skb, GFP_ATOMIC);
-		if (!nskb) {
-			if (psp_ext)
-				__skb_ext_put(psp_ext);
+		if (!nskb)
 			goto out_drop_free;
-		}
 
 		consume_skb(skb);
 		skb = nskb;
@@ -303,6 +311,7 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 out_drop_any:
 	dr = SKB_DROP_REASON_NOT_SPECIFIED;
 out_drop_free:
+	nsim_psp_ext_put(psp_ext);
 	kfree_skb_reason(skb, dr);
 out_drop_cnt:
 	rcu_read_unlock();
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 13/14] netdevsim: add peer RX truesize support for selftests
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Add a debugfs-controlled peer RX truesize knob to netdevsim, inflate the
forwarded skb only on the peer RX side, and cover the resulting socket
memory-accounting behavior with a dedicated selftest.

This keeps the synthetic cost out of the sender-side skb geometry while
giving the selftests a second runtime vehicle for the receive-memory
accounting exercised by the TCP rwnd work.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 drivers/net/netdevsim/netdev.c                | 145 +++++-
 drivers/net/netdevsim/netdevsim.h             |   4 +
 .../selftests/drivers/net/netdevsim/Makefile  |   1 +
 .../drivers/net/netdevsim/peer-rx-truesize.sh | 426 ++++++++++++++++++
 4 files changed, 575 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/drivers/net/netdevsim/peer-rx-truesize.sh

diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index 5ec028a00c62..22238df79b6a 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -17,8 +17,10 @@
 #include <linux/etherdevice.h>
 #include <linux/ethtool_netlink.h>
 #include <linux/kernel.h>
+#include <linux/kstrtox.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
+#include <linux/refcount.h>
 #include <linux/slab.h>
 #include <net/netdev_queues.h>
 #include <net/netdev_rx_queue.h>
@@ -37,6 +39,91 @@ MODULE_IMPORT_NS("NETDEV_INTERNAL");
 
 #define NSIM_RING_SIZE		256
 
+struct nsim_rx_truesize {
+	refcount_t refs;
+	u32 value;
+};
+
+static struct nsim_rx_truesize *
+nsim_rx_truesize_get(struct nsim_rx_truesize *rx_truesize)
+{
+	if (!rx_truesize)
+		return NULL;
+
+	if (!refcount_inc_not_zero(&rx_truesize->refs))
+		return NULL;
+
+	return rx_truesize;
+}
+
+static void nsim_rx_truesize_put(struct nsim_rx_truesize *rx_truesize)
+{
+	if (!rx_truesize)
+		return;
+
+	if (refcount_dec_and_test(&rx_truesize->refs))
+		kfree(rx_truesize);
+}
+
+static ssize_t nsim_rx_truesize_read(struct file *file, char __user *user_buf,
+				     size_t count, loff_t *ppos)
+{
+	struct nsim_rx_truesize *rx_truesize = file->private_data;
+	char buf[24];
+	int len;
+
+	len = scnprintf(buf, sizeof(buf), "%u\n",
+			READ_ONCE(rx_truesize->value));
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t nsim_rx_truesize_write(struct file *file,
+				      const char __user *user_buf,
+				      size_t count, loff_t *ppos)
+{
+	struct nsim_rx_truesize *rx_truesize = file->private_data;
+	u32 value;
+	int err;
+
+	err = kstrtou32_from_user(user_buf, count, 0, &value);
+	if (err)
+		return err;
+
+	WRITE_ONCE(rx_truesize->value, value);
+
+	return count;
+}
+
+static int nsim_rx_truesize_open(struct inode *inode, struct file *file)
+{
+	struct nsim_rx_truesize *rx_truesize;
+
+	rx_truesize = nsim_rx_truesize_get(inode->i_private);
+	if (!rx_truesize)
+		return -ENODEV;
+
+	file->private_data = rx_truesize;
+
+	return nonseekable_open(inode, file);
+}
+
+static int nsim_rx_truesize_release(struct inode *inode, struct file *file)
+{
+	nsim_rx_truesize_put(file->private_data);
+
+	return 0;
+}
+
+static const struct file_operations nsim_rx_truesize_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nsim_rx_truesize_open,
+	.read		= nsim_rx_truesize_read,
+	.write		= nsim_rx_truesize_write,
+	.release	= nsim_rx_truesize_release,
+	.llseek		= noop_llseek,
+};
+
 static void nsim_start_peer_tx_queue(struct net_device *dev, struct nsim_rq *rq)
 {
 	struct netdevsim *ns = netdev_priv(dev);
@@ -117,6 +204,28 @@ static int nsim_forward_skb(struct net_device *tx_dev,
 	return nsim_napi_rx(tx_dev, rx_dev, rq, skb);
 }
 
+/* Tests can inflate peer RX skb->truesize to exercise receiver-side TCP
+ * accounting under scaling-ratio drift without perturbing sender-side skb
+ * ownership.
+ */
+static void nsim_rx_update_truesize(struct sk_buff *skb, u32 extra)
+{
+	unsigned int truesize;
+
+	if (!extra)
+		return;
+
+	if (check_add_overflow(skb->truesize, extra, &truesize))
+		truesize = UINT_MAX;
+
+	skb->truesize = truesize;
+}
+
+static u32 nsim_rx_extra_truesize(const struct netdevsim *ns)
+{
+	return READ_ONCE(ns->rx_truesize->value);
+}
+
 static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct netdevsim *ns = netdev_priv(dev);
@@ -125,7 +234,9 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	unsigned int len = skb->len;
 	struct netdevsim *peer_ns;
 	struct netdev_config *cfg;
+	struct sk_buff *nskb;
 	struct nsim_rq *rq;
+	u32 extra;
 	int rxq;
 	int dr;
 
@@ -160,7 +271,24 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	      cfg->hds_thresh > len)))
 		skb_linearize(skb);
 
+	extra = nsim_rx_extra_truesize(peer_ns);
 	skb_tx_timestamp(skb);
+	if (extra) {
+		/* Clone before inflating truesize so only the peer RX path sees
+		 * the synthetic cost; sender-side skb accounting stays put.
+		 */
+		nskb = skb_clone(skb, GFP_ATOMIC);
+		if (!nskb) {
+			if (psp_ext)
+				__skb_ext_put(psp_ext);
+			goto out_drop_free;
+		}
+
+		consume_skb(skb);
+		skb = nskb;
+		nsim_rx_update_truesize(skb, extra);
+	}
+
 	if (unlikely(nsim_forward_skb(dev, peer_dev,
 				      skb, rq, psp_ext) == NET_RX_DROP))
 		goto out_drop_cnt;
@@ -1121,6 +1249,7 @@ struct netdevsim *nsim_create(struct nsim_dev *nsim_dev,
 			      u8 perm_addr[ETH_ALEN])
 {
 	struct net_device *dev;
+	struct nsim_rx_truesize *rx_truesize;
 	struct netdevsim *ns;
 	int err;
 
@@ -1140,6 +1269,13 @@ struct netdevsim *nsim_create(struct nsim_dev *nsim_dev,
 	ns->nsim_bus_dev = nsim_dev->nsim_bus_dev;
 	SET_NETDEV_DEV(dev, &ns->nsim_bus_dev->dev);
 	SET_NETDEV_DEVLINK_PORT(dev, &nsim_dev_port->devlink_port);
+	rx_truesize = kzalloc_obj(*rx_truesize);
+	if (!rx_truesize) {
+		err = -ENOMEM;
+		goto err_free_netdev;
+	}
+	refcount_set(&rx_truesize->refs, 1);
+	ns->rx_truesize = rx_truesize;
 	nsim_ethtool_init(ns);
 	if (nsim_dev_port_is_pf(nsim_dev_port))
 		err = nsim_init_netdevsim(ns);
@@ -1153,21 +1289,27 @@ struct netdevsim *nsim_create(struct nsim_dev *nsim_dev,
 	ns->qr_dfs = debugfs_create_file("queue_reset", 0200,
 					 nsim_dev_port->ddir, ns,
 					 &nsim_qreset_fops);
+	ns->rx_truesize_dfs = debugfs_create_file("rx_extra_truesize", 0600,
+						  nsim_dev_port->ddir,
+						  ns->rx_truesize,
+						  &nsim_rx_truesize_fops);
 	return ns;
 
 err_free_netdev:
+	nsim_rx_truesize_put(ns->rx_truesize);
 	free_netdev(dev);
 	return ERR_PTR(err);
 }
 
 void nsim_destroy(struct netdevsim *ns)
 {
+	struct nsim_rx_truesize *rx_truesize = ns->rx_truesize;
 	struct net_device *dev = ns->netdev;
 	struct netdevsim *peer;
 
+	debugfs_remove(ns->rx_truesize_dfs);
 	debugfs_remove(ns->qr_dfs);
 	debugfs_remove(ns->pp_dfs);
-
 	if (ns->nb.notifier_call)
 		unregister_netdevice_notifier_dev_net(ns->netdev, &ns->nb,
 						      &ns->nn);
@@ -1198,6 +1340,7 @@ void nsim_destroy(struct netdevsim *ns)
 	}
 
 	free_netdev(dev);
+	nsim_rx_truesize_put(rx_truesize);
 }
 
 bool netdev_is_nsim(struct net_device *dev)
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index f767fc8a7505..972ad274060e 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -75,6 +75,8 @@ struct nsim_macsec {
 	u8 nsim_secy_count;
 };
 
+struct nsim_rx_truesize;
+
 struct nsim_ethtool_pauseparam {
 	bool rx;
 	bool tx;
@@ -144,6 +146,8 @@ struct netdevsim {
 	} udp_ports;
 
 	struct page *page;
+	struct nsim_rx_truesize *rx_truesize;
+	struct dentry *rx_truesize_dfs;
 	struct dentry *pp_dfs;
 	struct dentry *qr_dfs;
 
diff --git a/tools/testing/selftests/drivers/net/netdevsim/Makefile b/tools/testing/selftests/drivers/net/netdevsim/Makefile
index 1a228c5430f5..9e9e48d5913b 100644
--- a/tools/testing/selftests/drivers/net/netdevsim/Makefile
+++ b/tools/testing/selftests/drivers/net/netdevsim/Makefile
@@ -14,6 +14,7 @@ TEST_PROGS := \
 	macsec-offload.sh \
 	nexthop.sh \
 	peer.sh \
+	peer-rx-truesize.sh \
 	psample.sh \
 	tc-mq-visibility.sh \
 	udp_tunnel_nic.sh \
diff --git a/tools/testing/selftests/drivers/net/netdevsim/peer-rx-truesize.sh b/tools/testing/selftests/drivers/net/netdevsim/peer-rx-truesize.sh
new file mode 100755
index 000000000000..6d1101d20847
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/peer-rx-truesize.sh
@@ -0,0 +1,426 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+
+set -euo pipefail
+
+lib_dir=$(dirname "$0")/../../../net
+source "$lib_dir"/lib.sh
+
+NSIM_SRV_ID=$((1024 + RANDOM % 1024))
+NSIM_CLI_ID=$((2048 + RANDOM % 1024))
+NSIM_SYS_LINK=/sys/bus/netdevsim/link_device
+SERVER_ADDR=192.0.2.1
+CLIENT_ADDR=192.0.2.2
+RMEM_PORT=12345
+WARM_PORT=12346
+RMEM_QUEUED_LEN=65000
+RMEM_INFLATED_LEN=65000
+RMEM_SMALL_EXTRA=4096
+RMEM_LARGE_EXTRA=65536
+WARM_WARMUP_ROUNDS=16
+WARM_WARMUP_LEN=65000
+WARM_QUEUED_LEN=62000
+WARM_INFLATED_LEN=65000
+WARM_EXTRA=65536
+
+srv_dev=
+cli_dev=
+srv_pid=
+cli_pid=
+srv_fd=
+cli_fd=
+stage_dir=
+CASE_BASE_METRIC=
+CASE_FINAL_METRIC=
+
+cleanup()
+{
+	local rc=$?
+
+	if [ -n "${srv_pid:-}" ]; then
+		kill "${srv_pid}" 2>/dev/null || true
+		wait "${srv_pid}" 2>/dev/null || true
+	fi
+
+	if [ -n "${cli_pid:-}" ]; then
+		kill "${cli_pid}" 2>/dev/null || true
+		wait "${cli_pid}" 2>/dev/null || true
+	fi
+
+	if [ -n "${srv_fd:-}" ]; then
+		eval "exec ${srv_fd}<&-"
+	fi
+
+	if [ -n "${cli_fd:-}" ]; then
+		eval "exec ${cli_fd}<&-"
+	fi
+
+	if [ -d "${stage_dir:-}" ]; then
+		rm -rf "${stage_dir}"
+	fi
+
+	cleanup_netdevsim "${NSIM_SRV_ID}" 2>/dev/null || true
+	cleanup_netdevsim "${NSIM_CLI_ID}" 2>/dev/null || true
+	cleanup_ns "${SRV:-}" "${CLI:-}" 2>/dev/null || true
+
+	exit "${rc}"
+}
+
+trap cleanup EXIT
+
+ensure_debugfs()
+{
+	if mount | grep -q 'on /sys/kernel/debug type debugfs'; then
+		return 0
+	fi
+
+	if ! mount -t debugfs none /sys/kernel/debug >/dev/null 2>&1; then
+		echo "SKIP: failed to mount debugfs"
+		exit "${ksft_skip}"
+	fi
+}
+
+ensure_netdevsim()
+{
+	if [ -w /sys/bus/netdevsim/new_device ]; then
+		return 0
+	fi
+
+	if ! modprobe netdevsim >/dev/null 2>&1; then
+		echo "SKIP: no netdevsim support"
+		exit "${ksft_skip}"
+	fi
+}
+
+create_nsim()
+{
+	local id="$1"
+	local ns="$2"
+	local addr="$3"
+	local dev
+
+	echo "${id}" | ip netns exec "${ns}" tee /sys/bus/netdevsim/new_device >/dev/null
+	udevadm settle
+
+	dev=$(ip netns exec "${ns}" ls /sys/bus/netdevsim/devices/netdevsim"${id}"/net)
+	ip -netns "${ns}" link set dev "${dev}" name "nsim${id}"
+	ip -netns "${ns}" addr add "${addr}/24" dev "nsim${id}"
+	ip -netns "${ns}" link set dev "nsim${id}" up
+
+	echo "nsim${id}"
+}
+
+link_nsim_peers()
+{
+	local srv_ifindex
+	local cli_ifindex
+
+	eval "exec {srv_fd}</var/run/netns/${SRV}"
+	eval "exec {cli_fd}</var/run/netns/${CLI}"
+
+	srv_ifindex=$(ip netns exec "${SRV}" cat /sys/class/net/"${srv_dev}"/ifindex)
+	cli_ifindex=$(ip netns exec "${CLI}" cat /sys/class/net/"${cli_dev}"/ifindex)
+
+	echo "${srv_fd}:${srv_ifindex} ${cli_fd}:${cli_ifindex}" > "${NSIM_SYS_LINK}"
+}
+
+wait_for_file()
+{
+	local path="$1"
+	local i
+
+	for i in $(seq 100); do
+		if [ -e "${path}" ]; then
+			return 0
+		fi
+		sleep 0.1
+	done
+
+	return 1
+}
+
+server_python='
+import array
+import fcntl
+import os
+import socket
+import struct
+import sys
+import time
+
+SO_MEMINFO = 55
+SK_MEMINFO_RMEM_ALLOC = 0
+TCP_MAXSEG = getattr(socket, "TCP_MAXSEG", 2)
+FIONREAD = 0x541B
+POLL_INTERVAL = 0.01
+POLL_TIMEOUT = 20.0
+
+(mode, host, port, warmup_rounds, warmup_len, queued_len, inflated_len,
+ ready_file, result_file) = sys.argv[1:]
+port = int(port)
+warmup_rounds = int(warmup_rounds)
+warmup_len = int(warmup_len)
+queued_len = int(queued_len)
+inflated_len = int(inflated_len)
+
+def queued_bytes(sock):
+    buf = array.array("I", [0])
+    fcntl.ioctl(sock.fileno(), FIONREAD, buf, True)
+    return buf[0]
+
+def wait_for_queued(sock, target):
+    deadline = time.time() + POLL_TIMEOUT
+    while time.time() < deadline:
+        if queued_bytes(sock) >= target:
+            return
+        time.sleep(POLL_INTERVAL)
+    raise SystemExit(f"timed out waiting for {target} queued bytes")
+
+def meminfo(sock):
+    raw = sock.getsockopt(socket.SOL_SOCKET, SO_MEMINFO, 9 * 4)
+    return struct.unpack("=9I", raw)
+
+def wait_for_growth(sock, idx, base):
+    deadline = time.time() + POLL_TIMEOUT
+    while time.time() < deadline:
+        cur = meminfo(sock)[idx]
+        if cur > base:
+            return cur
+        time.sleep(POLL_INTERVAL)
+    raise SystemExit(f"timed out waiting for SO_MEMINFO[{idx}] growth from {base}")
+
+def write_metric(path, value):
+    with open(path, "w", encoding="ascii") as fp:
+        fp.write(f"{value}\n")
+
+def recv_all(sock, total):
+    remaining = total
+    while remaining:
+        chunk = sock.recv(min(65536, remaining))
+        if not chunk:
+            raise SystemExit("unexpected EOF while draining receive data")
+        remaining -= len(chunk)
+
+listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+listener.setsockopt(socket.IPPROTO_TCP, TCP_MAXSEG, 1000)
+listener.bind((host, port))
+listener.listen(1)
+conn, _ = listener.accept()
+
+for _ in range(warmup_rounds):
+    recv_all(conn, warmup_len)
+
+if mode == "rmem_alloc":
+    wait_for_queued(conn, queued_len)
+    base_metric = meminfo(conn)[SK_MEMINFO_RMEM_ALLOC]
+    write_metric(ready_file, base_metric)
+
+    recv_all(conn, queued_len)
+    wait_for_queued(conn, inflated_len)
+    grown_metric = meminfo(conn)[SK_MEMINFO_RMEM_ALLOC]
+    write_metric(result_file, grown_metric)
+elif mode == "rmem_alloc_warm":
+    wait_for_queued(conn, queued_len)
+    base_metric = meminfo(conn)[SK_MEMINFO_RMEM_ALLOC]
+    write_metric(ready_file, base_metric)
+
+    wait_for_queued(conn, queued_len + 1)
+    grown_metric = wait_for_growth(conn, SK_MEMINFO_RMEM_ALLOC, base_metric)
+    write_metric(result_file, grown_metric)
+elif mode == "rmem_alloc_growth":
+    # The growth cases compare against a live socket metric, so wait for
+    # observed growth instead of trusting one instantaneous post-queue sample.
+    wait_for_queued(conn, queued_len)
+    base_metric = meminfo(conn)[SK_MEMINFO_RMEM_ALLOC]
+    write_metric(ready_file, base_metric)
+
+    recv_all(conn, queued_len)
+    wait_for_queued(conn, inflated_len)
+    grown_metric = wait_for_growth(conn, SK_MEMINFO_RMEM_ALLOC, base_metric)
+    write_metric(result_file, grown_metric)
+else:
+    raise SystemExit(f"unknown mode: {mode}")
+'
+
+client_python='
+import os
+import socket
+import sys
+import time
+
+POLL_INTERVAL = 0.01
+POLL_TIMEOUT = 20.0
+
+host, port, warmup_rounds, warmup_len, queued_len, inflated_len, gate_file = sys.argv[1:]
+port = int(port)
+warmup_rounds = int(warmup_rounds)
+warmup_len = int(warmup_len)
+queued_len = int(queued_len)
+inflated_len = int(inflated_len)
+
+def send_all(sock, total):
+    payload = b"a" * min(total, 65536)
+    left = total
+    while left:
+        chunk = payload[: min(len(payload), left)]
+        sent = sock.send(chunk)
+        if sent <= 0:
+            raise SystemExit("short send")
+        left -= sent
+
+def wait_for_file(path):
+    deadline = time.time() + POLL_TIMEOUT
+    while time.time() < deadline:
+        if os.path.exists(path):
+            return
+        time.sleep(POLL_INTERVAL)
+    raise SystemExit(f"timed out waiting for {path}")
+
+cli = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+cli.setsockopt(socket.IPPROTO_TCP, socket.TCP_MAXSEG, 1000)
+cli.connect((host, port))
+for _ in range(warmup_rounds):
+    send_all(cli, warmup_len)
+send_all(cli, queued_len)
+wait_for_file(gate_file)
+send_all(cli, inflated_len)
+cli.close()
+'
+
+read_metric()
+{
+	local path="$1"
+	local value
+
+	if ! read -r value < "${path}"; then
+		echo "FAIL: unable to read metric from ${path}"
+		exit "${ksft_fail}"
+	fi
+
+	printf '%s\n' "${value}"
+}
+
+run_case()
+{
+	local case_id="$1"
+	local mode="$2"
+	local port="$3"
+	local warmups="$4"
+	local warmup_len="$5"
+	local queued_len="$6"
+	local inflated_len="$7"
+	local extra="$8"
+	local label="$9"
+	local ready_file="${stage_dir}/${case_id}.ready"
+	local result_file="${stage_dir}/${case_id}.result"
+	local gate_file="${stage_dir}/${case_id}.gate"
+
+	rm -f "${ready_file}" "${result_file}" "${gate_file}"
+	echo 0 > "${dfs_file}"
+
+	ip netns exec "${SRV}" python3 - "${mode}" "${SERVER_ADDR}" "${port}" \
+		"${warmups}" "${warmup_len}" "${queued_len}" "${inflated_len}" \
+		"${ready_file}" "${result_file}" <<PY &
+${server_python}
+PY
+	srv_pid=$!
+
+	wait_local_port_listen "${SRV}" "${port}" tcp
+
+	ip netns exec "${CLI}" python3 - "${SERVER_ADDR}" "${port}" \
+		"${warmups}" "${warmup_len}" "${queued_len}" "${inflated_len}" \
+		"${gate_file}" <<PY &
+${client_python}
+PY
+	cli_pid=$!
+
+	if ! wait_for_file "${ready_file}"; then
+		echo "FAIL: ${label}: ready marker did not appear"
+		exit "${ksft_fail}"
+	fi
+
+	echo "${extra}" > "${dfs_file}"
+	touch "${gate_file}"
+
+	wait "${cli_pid}"
+	cli_pid=
+	wait "${srv_pid}"
+	srv_pid=
+
+	CASE_BASE_METRIC=$(read_metric "${ready_file}")
+	CASE_FINAL_METRIC=$(read_metric "${result_file}")
+
+	echo "PASS: ${label}"
+}
+
+# This test only proves that injected truesize reaches socket memory
+# accounting. Packetdrill covers the sender-visible rwnd accept/drop logic.
+
+assert_no_growth()
+{
+	local label="$1"
+
+	if [ "${CASE_FINAL_METRIC}" -gt "${CASE_BASE_METRIC}" ]; then
+		echo "FAIL: ${label}: metric grew unexpectedly:" \
+		     "base=${CASE_BASE_METRIC}" \
+		     "after=${CASE_FINAL_METRIC}"
+		exit "${ksft_fail}"
+	fi
+}
+
+assert_growth()
+{
+	local label="$1"
+
+	if [ "${CASE_FINAL_METRIC}" -le "${CASE_BASE_METRIC}" ]; then
+		echo "FAIL: ${label}: metric did not grow:" \
+		     "base=${CASE_BASE_METRIC}" \
+		     "after=${CASE_FINAL_METRIC}"
+		exit "${ksft_fail}"
+	fi
+}
+
+ensure_debugfs
+ensure_netdevsim
+set +u
+setup_ns SRV CLI
+set -u
+
+srv_dev=$(create_nsim "${NSIM_SRV_ID}" "${SRV}" "${SERVER_ADDR}")
+cli_dev=$(create_nsim "${NSIM_CLI_ID}" "${CLI}" "${CLIENT_ADDR}")
+link_nsim_peers
+
+ip netns exec "${SRV}" sysctl -wq net.ipv4.tcp_moderate_rcvbuf=0
+
+stage_dir=$(mktemp -d)
+dfs_file="/sys/kernel/debug/netdevsim/netdevsim${NSIM_SRV_ID}/ports/0/rx_extra_truesize"
+
+run_case "rmem_noop" "rmem_alloc" "${RMEM_PORT}" 0 0 \
+	"${RMEM_QUEUED_LEN}" "${RMEM_INFLATED_LEN}" 0 \
+	"peer rx truesize zero no-op"
+assert_no_growth "peer rx truesize zero no-op"
+
+run_case "rmem_small" "rmem_alloc_growth" "${RMEM_PORT}" 0 0 \
+	"${RMEM_QUEUED_LEN}" "${RMEM_INFLATED_LEN}" "${RMEM_SMALL_EXTRA}" \
+	"peer rx truesize small rmem_alloc"
+assert_growth "peer rx truesize small rmem_alloc"
+small_delta=$((CASE_FINAL_METRIC - CASE_BASE_METRIC))
+
+run_case "rmem_large" "rmem_alloc_growth" "${RMEM_PORT}" 0 0 \
+	"${RMEM_QUEUED_LEN}" "${RMEM_INFLATED_LEN}" "${RMEM_LARGE_EXTRA}" \
+	"peer rx truesize large rmem_alloc"
+assert_growth "peer rx truesize large rmem_alloc"
+large_delta=$((CASE_FINAL_METRIC - CASE_BASE_METRIC))
+
+if [ "${large_delta}" -le "${small_delta}" ]; then
+	echo "FAIL: peer rx truesize stepped rmem_alloc:" \
+	     "small_delta=${small_delta}" \
+	     "large_delta=${large_delta}"
+	exit "${ksft_fail}"
+fi
+
+run_case "rmem_warm" "rmem_alloc_warm" "${WARM_PORT}" "${WARM_WARMUP_ROUNDS}" "${WARM_WARMUP_LEN}" \
+	"${WARM_QUEUED_LEN}" "${WARM_INFLATED_LEN}" "${WARM_EXTRA}" \
+	"peer rx truesize warm rmem_alloc"
+assert_growth "peer rx truesize warm rmem_alloc"
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Add a test-only TUN ioctl that inflates RX skb->truesize, plus the
packetdrill-side helper needed to drive that ioctl through packetdrill's
own TUN queue file descriptor.

Use that plumbing to cover the receive-window regressions where
scaling_ratio drifts after advertisement, alongside the baseline too-big
packetdrill cases that exercise the same sender-visible rwnd accounting
from the non-injected path.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 drivers/net/tun.c                             |  65 ++++++++
 include/uapi/linux/if_tun.h                   |   4 +
 .../tcp_rcv_neg_window_truesize.pkt           | 143 ++++++++++++++++++
 .../net/packetdrill/tcp_rcv_toobig.pkt        |  35 +++++
 .../packetdrill/tcp_rcv_toobig_default.pkt    |  97 ++++++++++++
 .../tcp_rcv_toobig_default_truesize.pkt       | 118 +++++++++++++++
 .../tcp_rcv_wnd_shrink_allowed_truesize.pkt   |  49 ++++++
 tools/testing/selftests/net/tun.c             | 140 ++++++++++++++++-
 8 files changed, 650 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c492fda6fc15..2cef62cebe88 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -53,6 +53,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_tun.h>
 #include <linux/if_vlan.h>
+#include <linux/overflow.h>
 #include <linux/crc32.h>
 #include <linux/math.h>
 #include <linux/nsproxy.h>
@@ -85,8 +86,13 @@
 
 #include "tun_vnet.h"
 
+struct tun_file;
+
+#define TUNSETTRUESIZE_OLD _IOW('T', 228, unsigned int)
+
 static void tun_default_link_ksettings(struct net_device *dev,
 				       struct ethtool_link_ksettings *cmd);
+static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb);
 
 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
@@ -138,6 +144,7 @@ struct tun_file {
 		u16 queue_index;
 		unsigned int ifindex;
 	};
+	u32 rx_extra_truesize;
 	struct napi_struct napi;
 	bool napi_enabled;
 	bool napi_frags_enabled;
@@ -1817,6 +1824,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		goto free_skb;
 	}
 
+	tun_rx_update_truesize(tfile, skb);
 	switch (tun->flags & TUN_TYPE_MASK) {
 	case IFF_TUN:
 		if (tun->flags & IFF_NO_PI) {
@@ -2373,6 +2381,25 @@ static void tun_put_page(struct tun_page *tpage)
 		__page_frag_cache_drain(tpage->page, tpage->count);
 }
 
+/* Tests can inflate skb->truesize on ingress to exercise receive-memory
+ * accounting against a scaling_ratio that drifts after a window was
+ * advertised. The knob is per queue file, defaults to zero, and only changes
+ * behavior when explicitly enabled through the TUN fd.
+ */
+static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb)
+{
+	u32 extra = READ_ONCE(tfile->rx_extra_truesize);
+	unsigned int truesize;
+
+	if (!extra)
+		return;
+
+	if (check_add_overflow(skb->truesize, extra, &truesize))
+		truesize = UINT_MAX;
+
+	skb->truesize = truesize;
+}
+
 static int tun_xdp_one(struct tun_struct *tun,
 		       struct tun_file *tfile,
 		       struct xdp_buff *xdp, int *flush,
@@ -2459,6 +2486,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 		goto out;
 	}
 
+	tun_rx_update_truesize(tfile, skb);
 	skb->protocol = eth_type_trans(skb, tun->dev);
 	skb_reset_network_header(skb);
 	skb_probe_transport_header(skb);
@@ -3045,6 +3073,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	struct tun_struct *tun;
 	void __user* argp = (void __user*)arg;
 	unsigned int carrier;
+	unsigned int extra_truesize;
 	struct ifreq ifr;
 	kuid_t owner;
 	kgid_t group;
@@ -3309,6 +3338,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = tun_net_change_carrier(tun->dev, (bool)carrier);
 		break;
 
+	/* TUNSETTRUESIZE takes the byte count by value (used by the selftest
+	 * helper from packetdrill shell commands). The legacy _OLD form takes
+	 * a pointer and falls back to by-value if the user copy faults.
+	 */
+	case TUNSETTRUESIZE:
+	case TUNSETTRUESIZE_OLD:
+		ret = -EPERM;
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			goto unlock;
+
+		if (cmd == TUNSETTRUESIZE_OLD) {
+			ret = -EFAULT;
+			if (copy_from_user(&extra_truesize, argp,
+					   sizeof(extra_truesize))) {
+				ret = -EINVAL;
+				if (arg > U32_MAX)
+					goto unlock;
+
+				extra_truesize = arg;
+			}
+		} else {
+			ret = -EINVAL;
+			if (arg > U32_MAX)
+				goto unlock;
+
+			extra_truesize = arg;
+		}
+
+		WRITE_ONCE(tfile->rx_extra_truesize, extra_truesize);
+		netif_info(tun, drv, tun->dev,
+			   "rx extra truesize set to %u\n", extra_truesize);
+		ret = 0;
+		break;
+
 	case TUNGETDEVNETNS:
 		ret = -EPERM;
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -3348,6 +3411,7 @@ static long tun_chr_compat_ioctl(struct file *file,
 	case TUNGETSNDBUF:
 	case TUNSETSNDBUF:
 	case SIOCGIFHWADDR:
+	case TUNSETTRUESIZE_OLD:
 	case SIOCSIFHWADDR:
 		arg = (unsigned long)compat_ptr(arg);
 		break;
@@ -3408,6 +3472,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
+	tfile->rx_extra_truesize = 0;
 
 	init_waitqueue_head(&tfile->socket.wq.wait);
 
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 79d53c7a1ebd..4be63efe6540 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,10 @@
 #define TUNSETFILTEREBPF _IOR('T', 225, int)
 #define TUNSETCARRIER _IOW('T', 226, int)
 #define TUNGETDEVNETNS _IO('T', 227)
+/* Test-only: add the given number of bytes (passed by value, not by
+ * pointer) to skb->truesize on RX after TUN allocates an skb.
+ */
+#define TUNSETTRUESIZE _IO('T', 228)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
new file mode 100644
index 000000000000..1c5550fff509
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+// Run the negative-window / max-advertised-window regression, inflating TUN
+// skb->truesize once the baseline data is queued so scaling_ratio drifts. The
+// sequence checks and drop counters should match the uninflated case.
+
+--mss=1000
+
+`./defaults.sh`
+
+    0 `nstat -n`
+
+// Establish a connection.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1000000], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 4>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Put 1040000 bytes into the receive buffer.
+   +0 < P. 1:65001(65000) ack 1 win 257
+    * > .  1:1(0) ack 65001
+   +0 < P. 65001:130001(65000) ack 1 win 257
+    * > .  1:1(0) ack 130001
+   +0 < P. 130001:195001(65000) ack 1 win 257
+    * > .  1:1(0) ack 195001
+   +0 < P. 195001:260001(65000) ack 1 win 257
+    * > .  1:1(0) ack 260001
+   +0 < P. 260001:325001(65000) ack 1 win 257
+    * > .  1:1(0) ack 325001
+   +0 < P. 325001:390001(65000) ack 1 win 257
+    * > .  1:1(0) ack 390001
+   +0 < P. 390001:455001(65000) ack 1 win 257
+    * > .  1:1(0) ack 455001
+   +0 < P. 455001:520001(65000) ack 1 win 257
+    * > .  1:1(0) ack 520001
+   +0 < P. 520001:585001(65000) ack 1 win 257
+    * > .  1:1(0) ack 585001
+   +0 < P. 585001:650001(65000) ack 1 win 257
+    * > .  1:1(0) ack 650001
+   +0 < P. 650001:715001(65000) ack 1 win 257
+    * > .  1:1(0) ack 715001
+   +0 < P. 715001:780001(65000) ack 1 win 257
+    * > .  1:1(0) ack 780001
+   +0 < P. 780001:845001(65000) ack 1 win 257
+    * > .  1:1(0) ack 845001
+   +0 < P. 845001:910001(65000) ack 1 win 257
+    * > .  1:1(0) ack 910001
+   +0 < P. 910001:975001(65000) ack 1 win 257
+    * > .  1:1(0) ack 975001
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+    * > .  1:1(0) ack 1040001
+
+// Start inflating future TUN skbs only after the baseline sender-visible
+// window has been established, so the negative-window checks below exercise
+// ratio drift without changing the initial max advertised window.
+   +0 `../tun --set-rx-truesize tun0 65536`
+
+// Trigger an extreme memory squeeze by shrinking SO_RCVBUF.
+   +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [16000], 4) = 0
+
+   +0 < P. 1040001:1105001(65000) ack 1 win 257
+    * > .  1:1(0) ack 1040001 win 0
+// Check LINUX_MIB_TCPRCVQDROP has been incremented.
+   +0 `nstat -s | grep TcpExtTCPRcvQDrop | grep -q " 1 "`
+
+// RWIN == 0: rcv_wup = 1040001, rcv_wnd = 0, rcv_mwnd_seq > 1105001.
+
+// Accept pure ack with seq in max adv. window.
+   +0 write(4, ..., 1000) = 1000
+   +0 > P. 1:1001(1000) ack 1040001 win 0
+   +0 < .  1105001:1105001(0) ack 1001 win 257
+
+// In order segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW).
+   +0 < P. 1040001:1041001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Ooo partial segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW).
+   +0 < P. 1039001:1041001(2000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0 <nop,nop,sack 1039001:1040001>
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented twice.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 2 "`
+
+// Ooo segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_OVERWINDOW).
+   +0 < P. 1105001:1106001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Ooo segment, beyond max adv. window -> drop (SKB_DROP_REASON_TCP_INVALID_SEQUENCE).
+   +0 < P. 2000001:2001001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice.
+   +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 2 "`
+
+// Read all data.
+   +0 read(4, ..., 2000000) = 1040000
+    * > .  1001:1001(0) ack 1040001
+
+// RWIN > 0: rcv_wup = 1040001, 0 < rcv_wnd < 32000, rcv_mwnd_seq > 1105001.
+
+// Accept pure ack with seq in max adv. window, beyond adv. window.
+   +0 write(4, ..., 1000) = 1000
+   +0 > P.  1001:2001(1000) ack 1040001
+   +0 < . 1105001:1105001(0) ack 2001 win 257
+
+// In order segment, in max adv. window, in adv. window -> accept.
+   +0 < P. 1040001:1041001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1041001
+
+// Ooo partial segment, in adv. window -> accept.
+   +0 < P. 1040001:1042001(2000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1042001 <nop,nop,sack 1040001:1041001>
+
+// Ooo segment, in max adv. window, beyond adv. window -> drop.
+   +0 < P. 1105001:1106001(1000) ack 2001 win 257
+   +0 > .  2001:2001(0) ack 1042001
+// Ooo segment, beyond max adv. window, beyond adv. window -> drop.
+   +0 < P. 2000001:2001001(1000) ack 2001 win 257
+   +0 > .  2001:2001(0) ack 1042001
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice more.
+   +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 4 "`
+
+// We are allowed to go beyond the window and buffer with one packet.
+   +0 < P. 1042001:1062001(20000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1062001
+   +0 < P. 1062001:1082001(20000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001 win 0
+
+// But not more: in-order segment, in max adv. window -> drop.
+   +0 < P. 1082001:1083001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented again.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 3 "`
+
+// Another ratio drop must not change the final zero-window decision.
+   +0 `../tun --set-rx-truesize tun0 131072`
+
+   +0 < P. 1082001:1083001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented once more.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 4 "`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
new file mode 100644
index 000000000000..837ba3633752
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh`
+
+    0 `nstat -n`
+
+// Establish a connection.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [20000], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 win 18980 <mss 1460,nop,wscale 0>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 < P. 1:20001(20000) ack 1 win 257
+ +.04 > .  1:1(0) ack 20001 win 18000
+
+   +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [12000], 4) = 0
+   +0 < P. 20001:80001(60000) ack 1 win 257
+   +0 > .  1:1(0) ack 20001 win 18000
+
+   +0 read(4, ..., 20000) = 20000
+
+// A too big packet is accepted if the receive queue is empty, but the
+// stronger admission path must not zero the receive buffer while doing so.
+   +0 < P. 20001:80001(60000) ack 1 win 257
+    * > .  1:1(0) ack 80001 win 0
+   +0 %{ assert SK_MEMINFO_RCVBUF > 0, SK_MEMINFO_RCVBUF }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
new file mode 100644
index 000000000000..b2e4950e0b83
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_moderate_rcvbuf=0`
+
+// Establish a connection on the default receive buffer. Leave a large skb in
+// the queue, then deliver another one which still fits the remaining rwnd.
+// We should grow sk_rcvbuf to honor the already-advertised window instead of
+// dropping the packet.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <...>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Exchange enough data to get past the completely fresh-socket case while
+// still keeping the receive buffer at its 128kB default.
+   +0 < P. 1:65001(65000) ack 1 win 257
+   * > .  1:1(0) ack 65001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 65001:130001(65000) ack 1 win 257
+   * > .  1:1(0) ack 130001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 130001:195001(65000) ack 1 win 257
+   * > .  1:1(0) ack 195001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 195001:260001(65000) ack 1 win 257
+   * > .  1:1(0) ack 260001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 260001:325001(65000) ack 1 win 257
+   * > .  1:1(0) ack 325001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 325001:390001(65000) ack 1 win 257
+   * > .  1:1(0) ack 390001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 390001:455001(65000) ack 1 win 257
+   * > .  1:1(0) ack 455001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 455001:520001(65000) ack 1 win 257
+   * > .  1:1(0) ack 520001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 520001:585001(65000) ack 1 win 257
+   * > .  1:1(0) ack 585001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 585001:650001(65000) ack 1 win 257
+   * > .  1:1(0) ack 650001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 650001:715001(65000) ack 1 win 257
+   * > .  1:1(0) ack 715001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 715001:780001(65000) ack 1 win 257
+   * > .  1:1(0) ack 780001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 780001:845001(65000) ack 1 win 257
+   * > .  1:1(0) ack 845001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 845001:910001(65000) ack 1 win 257
+   * > .  1:1(0) ack 910001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 910001:975001(65000) ack 1 win 257
+   * > .  1:1(0) ack 975001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1040001
+   +0 read(4, ..., 65000) = 65000
+
+// Leave about 60kB queued, then accept another large skb which still fits
+// the rwnd we already exposed to the peer. The regression is the drop; the
+// exact sk_rcvbuf growth path is an implementation detail.
+   +0 < P. 1040001:1102001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1102001
+
+   +0 < P. 1102001:1167001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1167001
+   +0 read(4, ..., 127000) = 127000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
new file mode 100644
index 000000000000..c2ebe11d75f7
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_moderate_rcvbuf=0`
+
+// Establish a connection on the default receive buffer. The warmup traffic
+// keeps the socket in the normal data path without changing its default
+// sk_rcvbuf. Then inflate skb->truesize on future TUN RX packets so the live
+// scaling_ratio drops after we already exposed a larger rwnd to the peer.
+// The follow-up packet should still be admitted, and tcp_clamp_window() should
+// grow sk_rcvbuf to honor the sender-visible window instead of dropping data.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <...>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Exchange enough data to get past the completely fresh-socket case while
+// still keeping the receive buffer at its initial default.
+   +0 < P. 1:65001(65000) ack 1 win 257
+   * > .  1:1(0) ack 65001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 65001:130001(65000) ack 1 win 257
+   * > .  1:1(0) ack 130001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 130001:195001(65000) ack 1 win 257
+   * > .  1:1(0) ack 195001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 195001:260001(65000) ack 1 win 257
+   * > .  1:1(0) ack 260001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 260001:325001(65000) ack 1 win 257
+   * > .  1:1(0) ack 325001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 325001:390001(65000) ack 1 win 257
+   * > .  1:1(0) ack 390001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 390001:455001(65000) ack 1 win 257
+   * > .  1:1(0) ack 455001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 455001:520001(65000) ack 1 win 257
+   * > .  1:1(0) ack 520001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 520001:585001(65000) ack 1 win 257
+   * > .  1:1(0) ack 585001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 585001:650001(65000) ack 1 win 257
+   * > .  1:1(0) ack 650001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 650001:715001(65000) ack 1 win 257
+   * > .  1:1(0) ack 715001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 715001:780001(65000) ack 1 win 257
+   * > .  1:1(0) ack 780001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 780001:845001(65000) ack 1 win 257
+   * > .  1:1(0) ack 845001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 845001:910001(65000) ack 1 win 257
+   * > .  1:1(0) ack 910001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 910001:975001(65000) ack 1 win 257
+   * > .  1:1(0) ack 975001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1040001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 %{ base_rcvbuf = SK_MEMINFO_RCVBUF }%
+
+// Leave about 60kB queued, then make future TUN skbs look more expensive in
+// two steps. Both inflated skbs still fit the already-advertised window and
+// must be admitted, and sk_rcvbuf should keep growing as the live
+// scaling_ratio drops further.
+   +0 < P. 1040001:1102001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1102001
+
+   +0 `../tun --set-rx-truesize tun0 4096`
+
+   +0 < P. 1102001:1167001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1167001
+   +0 %{ assert SK_MEMINFO_RCVBUF > base_rcvbuf, (base_rcvbuf, SK_MEMINFO_RCVBUF) }%
+   +0 %{ small_rcvbuf = SK_MEMINFO_RCVBUF }%
+
+   +0 < P. 1167001:1229001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1229001
+
+   +0 `../tun --set-rx-truesize tun0 65536`
+
+   +0 < P. 1229001:1294001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1294001
+   +0 %{ assert SK_MEMINFO_RCVBUF > small_rcvbuf, (base_rcvbuf, small_rcvbuf, SK_MEMINFO_RCVBUF) }%
+
+   +0 < P. 1294001:1356001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1356001
+   +0 read(4, ..., 254000) = 254000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
new file mode 100644
index 000000000000..08da5fddaa12
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_shrink_window=1
+sysctl -q net.ipv4.tcp_rmem="4096 32768 $((32*1024*1024))"`
+
+   0 `nstat -n`
+
+// Establish a connection. After the first payload we know the peer has seen a
+// scaled receive window reaching sequence 25361. Inflate later TUN skbs in two
+// steps so the live scaling_ratio drops more than once, then verify that:
+//   1) a segment one byte beyond the max advertised window is still dropped,
+//   2) a segment exactly using the previously advertised max window is still
+//      accepted even though the current live ratio no longer matches that
+//      original advertisement basis.
+  +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+  +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+  +0 bind(3, ..., ...) = 0
+  +0 listen(3, 1) = 0
+
+  +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+  +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 10>
+  +0 < . 1:1(0) ack 1 win 257
+
+  +0 accept(3, ..., ...) = 4
+
+  +0 < P. 1:10001(10000) ack 1 win 257
+   * > .  1:1(0) ack 10001 win 15
+
+// Max window seq advertised here is 10001 + 15*1024 = 25361.
+  +0 `../tun --set-rx-truesize tun0 4096`
+
+  +0 < P. 10001:11024(1023) ack 1 win 257
+   * > .  1:1(0) ack 11024
+
+  +0 `../tun --set-rx-truesize tun0 65536`
+
+// Segment beyond the max window stays invalid even after ratio drift.
+  +0 < P. 11024:25362(14338) ack 1 win 257
+   * > .  1:1(0) ack 11024
+
+// Segment exactly using the max window must still be accepted.
+  +0 < P. 11024:25361(14337) ack 1 win 257
+   * > .  1:1(0) ack 25361
+
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented once.
+  +0 `nstat | grep TcpExtBeyondWindow | grep -q " 1 "`
diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index cf106a49b55e..473992b3784d 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -2,14 +2,17 @@
 
 #define _GNU_SOURCE
 
+#include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <linux/if_tun.h>
 #include <sys/ioctl.h>
+#include <sys/syscall.h>
 #include <sys/socket.h>
 
 #include "kselftest_harness.h"
@@ -174,6 +177,135 @@ static int tun_delete(char *dev)
 	return ip_link_del(dev);
 }
 
+static bool is_numeric_name(const char *name)
+{
+	for (; *name; name++) {
+		if (*name < '0' || *name > '9')
+			return false;
+	}
+
+	return true;
+}
+
+static int packetdrill_dup_fd(int pidfd, const char *fd_name)
+{
+	char *end;
+	unsigned long tmp;
+
+	errno = 0;
+	tmp = strtoul(fd_name, &end, 10);
+	if (errno || *end || tmp > INT_MAX) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	return syscall(SYS_pidfd_getfd, pidfd, (int)tmp, 0);
+}
+
+static int open_packetdrill_tunfd(pid_t pid, const char *ifname)
+{
+	char fd_dir[PATH_MAX];
+	struct dirent *dent;
+	struct ifreq ifr = {};
+	int pidfd;
+	int saved_errno = ENOENT;
+	DIR *dir;
+
+	snprintf(fd_dir, sizeof(fd_dir), "/proc/%ld/fd", (long)pid);
+
+	pidfd = syscall(SYS_pidfd_open, pid, 0);
+	if (pidfd < 0)
+		return -1;
+
+	dir = opendir(fd_dir);
+	if (!dir) {
+		close(pidfd);
+		return -1;
+	}
+
+	while ((dent = readdir(dir))) {
+		int fd;
+
+		if (!is_numeric_name(dent->d_name))
+			continue;
+
+		/* Reopen via pidfd_getfd() so we duplicate packetdrill's attached
+		 * queue file, instead of opening a fresh /dev/net/tun instance.
+		 */
+		fd = packetdrill_dup_fd(pidfd, dent->d_name);
+		if (fd < 0) {
+			saved_errno = errno;
+			continue;
+		}
+
+		memset(&ifr, 0, sizeof(ifr));
+		if (!ioctl(fd, TUNGETIFF, &ifr) &&
+		    !strncmp(ifr.ifr_name, ifname, IFNAMSIZ)) {
+			close(pidfd);
+			closedir(dir);
+			return fd;
+		}
+
+		if (errno)
+			saved_errno = errno;
+		close(fd);
+	}
+
+	close(pidfd);
+	closedir(dir);
+	errno = saved_errno;
+	return -1;
+}
+
+/* Packetdrill owns the TUN queue fd, so drive the test ioctl through a
+ * pidfd_getfd() duplicate of it, located via /proc/$PACKETDRILL_PID/fd.
+ */
+static int packetdrill_set_rx_truesize(const char *ifname, const char *value)
+{
+	char *packetdrill_pid, *end;
+	unsigned long long tmp;
+	unsigned int extra;
+	pid_t pid;
+	int fd;
+
+	packetdrill_pid = getenv("PACKETDRILL_PID");
+	if (!packetdrill_pid || !*packetdrill_pid) {
+		fprintf(stderr, "PACKETDRILL_PID is not set\n");
+		return 1;
+	}
+
+	errno = 0;
+	tmp = strtoull(packetdrill_pid, &end, 10);
+	if (errno || *end || !tmp || tmp > INT_MAX) {
+		fprintf(stderr, "invalid PACKETDRILL_PID: %s\n", packetdrill_pid);
+		return 1;
+	}
+	pid = (pid_t)tmp;
+
+	errno = 0;
+	tmp = strtoull(value, &end, 0);
+	if (errno || *end || tmp > UINT_MAX) {
+		fprintf(stderr, "invalid truesize value: %s\n", value);
+		return 1;
+	}
+	extra = (unsigned int)tmp;
+
+	fd = open_packetdrill_tunfd(pid, ifname);
+	if (fd < 0) {
+		perror("open_packetdrill_tunfd");
+		return 1;
+	}
+
+	if (ioctl(fd, TUNSETTRUESIZE, (unsigned long)extra)) {
+		perror("ioctl(TUNSETTRUESIZE)");
+		close(fd);
+		return 1;
+	}
+
+	close(fd);
+	return 0;
+}
+
 static int tun_open(char *dev, const int flags, const int hdrlen,
 		    const int features, const unsigned char *mac_addr)
 {
@@ -985,4 +1117,10 @@ XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet);
 XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet);
 XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet);
 
-TEST_HARNESS_MAIN
+int main(int argc, char **argv)
+{
+	if (argc == 4 && !strcmp(argv[1], "--set-rx-truesize"))
+		return packetdrill_set_rx_truesize(argv[2], argv[3]);
+
+	return test_harness_run(argc, argv);
+}
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 11/14] selftests: tcp_ao: cover legacy, v1, and retracted repair windows
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Extend the tcp_ao repair selftests to exercise the legacy, v1, and
current TCP_REPAIR_WINDOW layouts, plus a synthesized retracted-window
image that preserves a larger historical right edge.

These tests validate both the append-only ABI contract and the restore-
time rebuilding of any snapshot state older userspace could not save.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 .../testing/selftests/net/tcp_ao/lib/aolib.h  |  83 +++++++-
 .../testing/selftests/net/tcp_ao/lib/repair.c |  18 +-
 .../selftests/net/tcp_ao/self-connect.c       | 201 +++++++++++++++++-
 3 files changed, 279 insertions(+), 23 deletions(-)

diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h
index ebb2899c12fe..ef08db831457 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h
+++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h
@@ -13,6 +13,7 @@
 #include <linux/snmp.h>
 #include <linux/tcp.h>
 #include <netinet/in.h>
+#include <stddef.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <stdlib.h>
@@ -671,17 +672,55 @@ struct tcp_sock_state {
 	int timestamp;
 };
 
-extern void __test_sock_checkpoint(int sk, struct tcp_sock_state *state,
-				   void *addr, size_t addr_size);
+/* Legacy userspace stops before the snapshot field and therefore exercises
+ * the kernel's unknown-snapshot fallback path.
+ */
+static inline socklen_t test_tcp_repair_window_legacy_size(void)
+{
+	return offsetof(struct tcp_repair_window, rcv_wnd_scaling_ratio);
+}
+
+static inline socklen_t test_tcp_repair_window_v1_size(void)
+{
+	return offsetof(struct tcp_repair_window, rcv_mwnd_seq);
+}
+
+static inline socklen_t test_tcp_repair_window_exact_size(void)
+{
+	return sizeof(struct tcp_repair_window);
+}
+
+void __test_sock_checkpoint_opt(int sk, struct tcp_sock_state *state,
+				socklen_t trw_len,
+				void *addr, size_t addr_size);
 static inline void test_sock_checkpoint(int sk, struct tcp_sock_state *state,
 					sockaddr_af *saddr)
 {
-	__test_sock_checkpoint(sk, state, saddr, sizeof(*saddr));
+	__test_sock_checkpoint_opt(sk, state, test_tcp_repair_window_exact_size(),
+				   saddr, sizeof(*saddr));
+}
+
+static inline void test_sock_checkpoint_legacy(int sk,
+					       struct tcp_sock_state *state,
+					       sockaddr_af *saddr)
+{
+	__test_sock_checkpoint_opt(sk, state, test_tcp_repair_window_legacy_size(),
+				   saddr, sizeof(*saddr));
+}
+
+static inline void test_sock_checkpoint_v1(int sk,
+					   struct tcp_sock_state *state,
+					   sockaddr_af *saddr)
+{
+	__test_sock_checkpoint_opt(sk, state, test_tcp_repair_window_v1_size(),
+				   saddr, sizeof(*saddr));
 }
 extern void test_ao_checkpoint(int sk, struct tcp_ao_repair *state);
-extern void __test_sock_restore(int sk, const char *device,
-				struct tcp_sock_state *state,
-				void *saddr, void *daddr, size_t addr_size);
+void __test_sock_restore_opt(int sk, const char *device,
+			     struct tcp_sock_state *state,
+			     socklen_t trw_len,
+			     void *saddr, void *daddr,
+			     size_t addr_size);
 static inline void test_sock_restore(int sk, struct tcp_sock_state *state,
 				     sockaddr_af *saddr,
 				     const union tcp_addr daddr,
@@ -690,7 +729,37 @@ static inline void test_sock_restore(int sk, struct tcp_sock_state *state,
 	sockaddr_af addr;
 
 	tcp_addr_to_sockaddr_in(&addr, &daddr, htons(dport));
-	__test_sock_restore(sk, veth_name, state, saddr, &addr, sizeof(addr));
+	__test_sock_restore_opt(sk, veth_name, state,
+				test_tcp_repair_window_exact_size(),
+				saddr, &addr, sizeof(addr));
+}
+
+static inline void test_sock_restore_legacy(int sk,
+					    struct tcp_sock_state *state,
+					    sockaddr_af *saddr,
+					    const union tcp_addr daddr,
+					    unsigned int dport)
+{
+	sockaddr_af addr;
+
+	tcp_addr_to_sockaddr_in(&addr, &daddr, htons(dport));
+	__test_sock_restore_opt(sk, veth_name, state,
+				test_tcp_repair_window_legacy_size(),
+				saddr, &addr, sizeof(addr));
+}
+
+static inline void test_sock_restore_v1(int sk,
+					struct tcp_sock_state *state,
+					sockaddr_af *saddr,
+					const union tcp_addr daddr,
+					unsigned int dport)
+{
+	sockaddr_af addr;
+
+	tcp_addr_to_sockaddr_in(&addr, &daddr, htons(dport));
+	__test_sock_restore_opt(sk, veth_name, state,
+				test_tcp_repair_window_v1_size(),
+				saddr, &addr, sizeof(addr));
 }
 extern void test_ao_restore(int sk, struct tcp_ao_repair *state);
 extern void test_sock_state_free(struct tcp_sock_state *state);
diff --git a/tools/testing/selftests/net/tcp_ao/lib/repair.c b/tools/testing/selftests/net/tcp_ao/lib/repair.c
index 9893b3ba69f5..befbd0f72db5 100644
--- a/tools/testing/selftests/net/tcp_ao/lib/repair.c
+++ b/tools/testing/selftests/net/tcp_ao/lib/repair.c
@@ -66,8 +66,9 @@ static void test_sock_checkpoint_queue(int sk, int queue, int qlen,
 		test_error("recv(%d): %d", qlen, ret);
 }
 
-void __test_sock_checkpoint(int sk, struct tcp_sock_state *state,
-			    void *addr, size_t addr_size)
+void __test_sock_checkpoint_opt(int sk, struct tcp_sock_state *state,
+				socklen_t trw_len,
+				void *addr, size_t addr_size)
 {
 	socklen_t len = sizeof(state->info);
 	int ret;
@@ -82,9 +83,9 @@ void __test_sock_checkpoint(int sk, struct tcp_sock_state *state,
 	if (getsockname(sk, addr, &len) || len != addr_size)
 		test_error("getsockname(): %d", (int)len);
 
-	len = sizeof(state->trw);
+	len = trw_len;
 	ret = getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &state->trw, &len);
-	if (ret || len != sizeof(state->trw))
+	if (ret || len != trw_len)
 		test_error("getsockopt(TCP_REPAIR_WINDOW): %d", (int)len);
 
 	if (ioctl(sk, SIOCOUTQ, &state->outq_len))
@@ -160,9 +161,10 @@ static void test_sock_restore_queue(int sk, int queue, void *buf, int len)
 	} while (len > 0);
 }
 
-void __test_sock_restore(int sk, const char *device,
-			 struct tcp_sock_state *state,
-			 void *saddr, void *daddr, size_t addr_size)
+void __test_sock_restore_opt(int sk, const char *device,
+			     struct tcp_sock_state *state,
+			     socklen_t trw_len,
+			     void *saddr, void *daddr, size_t addr_size)
 {
 	struct tcp_repair_opt opts[4];
 	unsigned int opt_nr = 0;
@@ -215,7 +217,7 @@ void __test_sock_restore(int sk, const char *device,
 	}
 	test_sock_restore_queue(sk, TCP_RECV_QUEUE, state->in.buf, state->inq_len);
 	test_sock_restore_queue(sk, TCP_SEND_QUEUE, state->out.buf, state->outq_len);
-	if (setsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &state->trw, sizeof(state->trw)))
+	if (setsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &state->trw, trw_len))
 		test_error("setsockopt(TCP_REPAIR_WINDOW)");
 }
 
diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c
index 2c73bea698a6..a7c0f2edd351 100644
--- a/tools/testing/selftests/net/tcp_ao/self-connect.c
+++ b/tools/testing/selftests/net/tcp_ao/self-connect.c
@@ -4,6 +4,14 @@
 #include "aolib.h"
 
 static union tcp_addr local_addr;
+static bool checked_repair_window_lens;
+
+enum repair_window_mode {
+	REPAIR_WINDOW_CURRENT,
+	REPAIR_WINDOW_LEGACY,
+	REPAIR_WINDOW_V1,
+	REPAIR_WINDOW_RETRACTED,
+};
 
 static void __setup_lo_intf(const char *lo_intf,
 			    const char *addr_str, uint8_t prefix)
@@ -30,8 +38,157 @@ static void setup_lo_intf(const char *lo_intf)
 #endif
 }
 
+/* The repair ABI accepts the legacy, v1, and current layouts. */
+static void test_repair_window_len_contract(int sk)
+{
+	struct tcp_repair_window trw = {};
+	socklen_t len = test_tcp_repair_window_exact_size();
+	socklen_t v1_len = test_tcp_repair_window_v1_size();
+	socklen_t bad_len = test_tcp_repair_window_legacy_size() + 1;
+	int ret;
+
+	if (checked_repair_window_lens)
+		return;
+
+	checked_repair_window_lens = true;
+
+	ret = getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &trw, &len);
+	if (ret || len != test_tcp_repair_window_exact_size())
+		test_error("getsockopt(TCP_REPAIR_WINDOW): %d", (int)len);
+
+	len = v1_len;
+	ret = getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &trw, &len);
+	if (ret || len != v1_len)
+		test_fail("repair-window get accepts v1 len");
+	else
+		test_ok("repair-window get accepts v1 len");
+
+	len = bad_len;
+	ret = getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &trw, &len);
+	if (ret == 0 || errno != EINVAL)
+		test_fail("repair-window get rejects invalid len");
+	else
+		test_ok("repair-window get rejects invalid len");
+
+	ret = setsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &trw, bad_len);
+	if (ret == 0 || errno != EINVAL)
+		test_fail("repair-window set rejects invalid len");
+	else
+		test_ok("repair-window set rejects invalid len");
+
+	ret = setsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &trw, v1_len + 1);
+	if (ret == 0 || errno != EINVAL)
+		test_fail("repair-window set rejects invalid v1+1 len");
+	else
+		test_ok("repair-window set rejects invalid v1+1 len");
+}
+
+static void test_retracted_repair_window_state(int sk,
+					       struct tcp_sock_state *img)
+{
+	struct tcp_repair_window trw = {};
+	socklen_t len = sizeof(trw);
+	int ret;
+
+	ret = getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &trw, &len);
+	if (ret || len != sizeof(trw))
+		test_error("getsockopt(TCP_REPAIR_WINDOW): %d", (int)len);
+
+	if (trw.rcv_mwnd_seq != img->trw.rcv_mwnd_seq ||
+	    trw.rcv_mwnd_scaling_ratio != img->trw.rcv_mwnd_scaling_ratio ||
+	    trw.rcv_wnd != img->trw.rcv_wnd ||
+	    trw.rcv_wup != img->trw.rcv_wup ||
+	    trw.rcv_wnd_scaling_ratio != img->trw.rcv_wnd_scaling_ratio)
+		test_fail("repair-window restore preserves retracted state");
+	else
+		test_ok("repair-window restore preserves retracted state");
+}
+
+static void test_v1_repair_window_state(int sk, struct tcp_sock_state *img)
+{
+	struct tcp_repair_window trw = {};
+	socklen_t len = sizeof(trw);
+	__u32 max_right = img->trw.rcv_wup + img->trw.rcv_wnd;
+	int ret;
+
+	ret = getsockopt(sk, SOL_TCP, TCP_REPAIR_WINDOW, &trw, &len);
+	if (ret || len != sizeof(trw))
+		test_error("getsockopt(TCP_REPAIR_WINDOW): %d", (int)len);
+
+	if (trw.rcv_mwnd_seq != max_right ||
+	    trw.rcv_mwnd_scaling_ratio != img->trw.rcv_wnd_scaling_ratio ||
+	    trw.rcv_wnd != img->trw.rcv_wnd ||
+	    trw.rcv_wup != img->trw.rcv_wup ||
+	    trw.rcv_wnd_scaling_ratio != img->trw.rcv_wnd_scaling_ratio)
+		test_fail("repair-window v1 restore rebuilds max-window state");
+	else
+		test_ok("repair-window v1 restore rebuilds max-window state");
+}
+
+/* Synthesize a repair image whose live rwnd was retracted after a larger
+ * right edge had already been advertised, so restore testing can validate
+ * snapshot preservation without depending on the live receive path.
+ */
+static bool make_retracted_repair_window_state(struct tcp_sock_state *img)
+{
+	__u32 gran = 1U << img->info.tcpi_rcv_wscale;
+	__u32 max_right;
+	__u32 shrink;
+
+	if (!(img->info.tcpi_options & TCPI_OPT_WSCALE))
+		return false;
+
+	max_right = img->trw.rcv_wup + img->trw.rcv_wnd;
+	shrink = img->trw.rcv_wnd / 4;
+	if (shrink < gran)
+		shrink = gran;
+	if (shrink >= img->trw.rcv_wnd)
+		shrink = img->trw.rcv_wnd >> 1;
+	if (shrink == 0 || shrink >= img->trw.rcv_wnd)
+		return false;
+
+	img->trw.rcv_wnd -= shrink;
+	img->trw.rcv_mwnd_seq = max_right;
+	img->trw.rcv_mwnd_scaling_ratio = img->trw.rcv_wnd_scaling_ratio;
+	return true;
+}
+
+static socklen_t repair_window_len(enum repair_window_mode mode)
+{
+	switch (mode) {
+	case REPAIR_WINDOW_LEGACY:
+		return test_tcp_repair_window_legacy_size();
+	case REPAIR_WINDOW_V1:
+		return test_tcp_repair_window_v1_size();
+	case REPAIR_WINDOW_CURRENT:
+	case REPAIR_WINDOW_RETRACTED:
+		return test_tcp_repair_window_exact_size();
+	}
+
+	return test_tcp_repair_window_exact_size();
+}
+
+static void test_sock_checkpoint_mode(enum repair_window_mode mode, int sk,
+				      struct tcp_sock_state *img,
+				      sockaddr_af *addr)
+{
+	switch (mode) {
+	case REPAIR_WINDOW_LEGACY:
+		test_sock_checkpoint_legacy(sk, img, addr);
+		break;
+	case REPAIR_WINDOW_V1:
+		test_sock_checkpoint_v1(sk, img, addr);
+		break;
+	case REPAIR_WINDOW_CURRENT:
+	case REPAIR_WINDOW_RETRACTED:
+		test_sock_checkpoint(sk, img, addr);
+		break;
+	}
+}
+
 static void tcp_self_connect(const char *tst, unsigned int port,
-			     bool different_keyids, bool check_restore)
+			     bool different_keyids, bool check_restore,
+			     enum repair_window_mode repair_window_mode)
 {
 	struct tcp_counters before, after;
 	uint64_t before_aogood, after_aogood;
@@ -109,7 +266,16 @@ static void tcp_self_connect(const char *tst, unsigned int port,
 	}
 
 	test_enable_repair(sk);
-	test_sock_checkpoint(sk, &img, &addr);
+	test_repair_window_len_contract(sk);
+	test_sock_checkpoint_mode(repair_window_mode, sk, &img, &addr);
+	if (repair_window_mode == REPAIR_WINDOW_RETRACTED &&
+	    !make_retracted_repair_window_state(&img)) {
+		test_sock_state_free(&img);
+		netstat_free(ns_before);
+		close(sk);
+		test_skip("%s: no scaled repair window to retract", tst);
+		return;
+	}
 #ifdef IPV6_TEST
 	addr.sin6_port = htons(port + 1);
 #else
@@ -123,7 +289,9 @@ static void tcp_self_connect(const char *tst, unsigned int port,
 		test_error("socket()");
 
 	test_enable_repair(sk);
-	__test_sock_restore(sk, "lo", &img, &addr, &addr, sizeof(addr));
+	__test_sock_restore_opt(sk, "lo", &img,
+				repair_window_len(repair_window_mode),
+				&addr, &addr, sizeof(addr));
 	if (different_keyids) {
 		if (test_add_repaired_key(sk, DEFAULT_TEST_PASSWORD, 0,
 					  local_addr, -1, 7, 5))
@@ -137,6 +305,10 @@ static void tcp_self_connect(const char *tst, unsigned int port,
 			test_error("setsockopt(TCP_AO_ADD_KEY)");
 	}
 	test_ao_restore(sk, &ao_img);
+	if (repair_window_mode == REPAIR_WINDOW_V1)
+		test_v1_repair_window_state(sk, &img);
+	if (repair_window_mode == REPAIR_WINDOW_RETRACTED)
+		test_retracted_repair_window_state(sk, &img);
 	test_disable_repair(sk);
 	test_sock_state_free(&img);
 	if (test_client_verify(sk, 100, nr_packets)) {
@@ -165,20 +337,33 @@ static void *client_fn(void *arg)
 
 	setup_lo_intf("lo");
 
-	tcp_self_connect("self-connect(same keyids)", port++, false, false);
+	tcp_self_connect("self-connect(same keyids)", port++, false, false,
+			 REPAIR_WINDOW_CURRENT);
 
 	/* expecting rnext to change based on the first segment RNext != Current */
 	trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr,
 			      port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1);
-	tcp_self_connect("self-connect(different keyids)", port++, true, false);
-	tcp_self_connect("self-connect(restore)", port, false, true);
+	tcp_self_connect("self-connect(different keyids)", port++, true, false,
+			 REPAIR_WINDOW_CURRENT);
+	tcp_self_connect("self-connect(restore)", port, false, true,
+			 REPAIR_WINDOW_CURRENT);
+	port += 2; /* restore test restores over different port */
+	tcp_self_connect("self-connect(restore, legacy repair window)", port,
+			 false, true, REPAIR_WINDOW_LEGACY);
+	port += 2; /* restore test restores over different port */
+	tcp_self_connect("self-connect(restore, v1 repair window)", port,
+			 false, true, REPAIR_WINDOW_V1);
+	port += 2; /* restore test restores over different port */
+	tcp_self_connect("self-connect(restore, retracted repair window)", port,
+			 false, true, REPAIR_WINDOW_RETRACTED);
 	port += 2; /* restore test restores over different port */
 	trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr,
 			      port, port, 0, -1, -1, -1, -1, -1, 7, 5, -1);
 	/* intentionally on restore they are added to the socket in different order */
 	trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, local_addr, local_addr,
 			      port + 1, port + 1, 0, -1, -1, -1, -1, -1, 5, 7, -1);
-	tcp_self_connect("self-connect(restore, different keyids)", port, true, true);
+	tcp_self_connect("self-connect(restore, different keyids)",
+			 port, true, true, REPAIR_WINDOW_CURRENT);
 	port += 2; /* restore test restores over different port */
 
 	return NULL;
@@ -186,6 +371,6 @@ static void *client_fn(void *arg)
 
 int main(int argc, char *argv[])
 {
-	test_init(5, client_fn, NULL);
+	test_init(14, client_fn, NULL);
 	return 0;
 }
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 10/14] tcp: expose rmem and backlog in tcp and mptcp rcvbuf_grow tracepoints
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Extend the tcp_rcvbuf_grow and mptcp_rcvbuf_grow tracepoints with the
live receive-memory allocation and backlog occupancy that now drive the
window-growth decisions in this series.

That makes it easier to inspect sender-visible rwnd state against the
actual hard receive-memory inputs.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 include/trace/events/mptcp.h | 11 +++++++----
 include/trace/events/tcp.h   | 12 +++++++-----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h
index 269d949b2025..167970e8e0a5 100644
--- a/include/trace/events/mptcp.h
+++ b/include/trace/events/mptcp.h
@@ -199,6 +199,8 @@ TRACE_EVENT(mptcp_rcvbuf_grow,
 		__field(__u32, inq)
 		__field(__u32, space)
 		__field(__u32, ooo_space)
+		__field(__u32, rmem_alloc)
+		__field(__u32, backlog_len)
 		__field(__u32, rcvbuf)
 		__field(__u32, rcv_wnd)
 		__field(__u8, scaling_ratio)
@@ -228,6 +230,8 @@ TRACE_EVENT(mptcp_rcvbuf_grow,
 				     MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq -
 				     msk->ack_seq;
 
+		__entry->rmem_alloc = tcp_rmem_used(sk);
+		__entry->backlog_len = READ_ONCE(msk->backlog_len);
 		__entry->rcvbuf = sk->sk_rcvbuf;
 		__entry->rcv_wnd = atomic64_read(&msk->rcv_wnd_sent) -
 				   msk->ack_seq;
@@ -248,12 +252,11 @@ TRACE_EVENT(mptcp_rcvbuf_grow,
 		__entry->skaddr = sk;
 	),
 
-	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u "
-		  "rcvbuf=%u rcv_wnd=%u family=%d sport=%hu dport=%hu saddr=%pI4 "
-		  "daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skaddr=%p",
+	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rmem_alloc=%u backlog_len=%u rcvbuf=%u rcv_wnd=%u family=%d sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skaddr=%p",
 		  __entry->time, __entry->rtt_us, __entry->copied,
 		  __entry->inq, __entry->space, __entry->ooo_space,
-		  __entry->scaling_ratio, __entry->rcvbuf, __entry->rcv_wnd,
+		  __entry->scaling_ratio, __entry->rmem_alloc,
+		  __entry->backlog_len, __entry->rcvbuf, __entry->rcv_wnd,
 		  __entry->family, __entry->sport, __entry->dport,
 		  __entry->saddr, __entry->daddr, __entry->saddr_v6,
 		  __entry->daddr_v6, __entry->skaddr)
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index f155f95cdb6e..92d0bd6be0ba 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -217,6 +217,8 @@ TRACE_EVENT(tcp_rcvbuf_grow,
 		__field(__u32, inq)
 		__field(__u32, space)
 		__field(__u32, ooo_space)
+		__field(__u32, rmem_alloc)
+		__field(__u32, backlog_len)
 		__field(__u32, rcvbuf)
 		__field(__u32, rcv_ssthresh)
 		__field(__u32, window_clamp)
@@ -247,6 +249,8 @@ TRACE_EVENT(tcp_rcvbuf_grow,
 				     TCP_SKB_CB(tp->ooo_last_skb)->end_seq -
 				     tp->rcv_nxt;
 
+		__entry->rmem_alloc = tcp_rmem_used(sk);
+		__entry->backlog_len = READ_ONCE(sk->sk_backlog.len);
 		__entry->rcvbuf = sk->sk_rcvbuf;
 		__entry->rcv_ssthresh = tp->rcv_ssthresh;
 		__entry->window_clamp = tp->window_clamp;
@@ -269,13 +273,11 @@ TRACE_EVENT(tcp_rcvbuf_grow,
 		__entry->sock_cookie = sock_gen_cookie(sk);
 	),
 
-	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
-		  "rcv_ssthresh=%u window_clamp=%u rcv_wnd=%u "
-		  "family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
-		  "saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
+	TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rmem_alloc=%u backlog_len=%u rcvbuf=%u rcv_ssthresh=%u window_clamp=%u rcv_wnd=%u family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
 		  __entry->time, __entry->rtt_us, __entry->copied,
 		  __entry->inq, __entry->space, __entry->ooo_space,
-		  __entry->scaling_ratio, __entry->rcvbuf,
+		  __entry->scaling_ratio, __entry->rmem_alloc,
+		  __entry->backlog_len, __entry->rcvbuf,
 		  __entry->rcv_ssthresh, __entry->window_clamp,
 		  __entry->rcv_wnd,
 		  show_family_name(__entry->family),
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 09/14] mptcp: refresh TCP receive-window snapshots on subflows
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

When MPTCP resynchronizes the per-subflow TCP shadow window from the
mptcp-level receive state, refresh the live rwnd snapshot and the
remembered maximum-window snapshot along with it.

That keeps subflow TCP bookkeeping aligned with the sender-visible
window state tracked in the core TCP patches.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 net/mptcp/options.c  | 14 +++++++++-----
 net/mptcp/protocol.h | 14 +++++++++++---
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 8a1c5698983c..64cd637484a4 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1073,9 +1073,12 @@ static void rwin_update(struct mptcp_sock *msk, struct sock *ssk,
 		return;
 
 	/* Some other subflow grew the mptcp-level rwin since rcv_wup,
-	 * resync.
+	 * resync. Keep the TCP shadow window in its advertised u32 domain
+	 * and refresh the advertise-time scaling snapshot while doing so.
 	 */
-	tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent;
+	tcp_set_rcv_wnd(tp, min_t(u64, (u64)tp->rcv_wnd +
+				  (mptcp_rcv_wnd - subflow->rcv_wnd_sent),
+				  U32_MAX));
 	tcp_update_max_rcv_wnd_seq(tp);
 	subflow->rcv_wnd_sent = mptcp_rcv_wnd;
 }
@@ -1335,12 +1338,13 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th)
 	if (rcv_wnd_new != rcv_wnd_old) {
 raise_win:
 		/* The msk-level rcv wnd is after the tcp level one,
-		 * sync the latter.
+		 * sync the latter and refresh its advertise-time scaling
+		 * snapshot.
 		 */
 		rcv_wnd_new = rcv_wnd_old;
 		win = rcv_wnd_old - ack_seq;
-		new_win = min_t(u64, win, U32_MAX);
-		tp->rcv_wnd = new_win;
+		tcp_set_rcv_wnd(tp, min_t(u64, win, U32_MAX));
+		new_win = tp->rcv_wnd;
 		tcp_update_max_rcv_wnd_seq(tp);
 
 		/* Make sure we do not exceed the maximum possible
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 0bd1ee860316..4ea95c9c0c7a 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -408,11 +408,19 @@ static inline int mptcp_space_from_win(const struct sock *sk, int win)
 	return __tcp_space_from_win(mptcp_sk(sk)->scaling_ratio, win);
 }
 
+/* MPTCP exposes window space from the mptcp-level receive queue, so it tracks
+ * a separate backlog counter from the subflow backlog embedded in struct sock.
+ */
+static inline int mptcp_rwnd_avail(const struct sock *sk)
+{
+	return READ_ONCE(sk->sk_rcvbuf) -
+	       READ_ONCE(mptcp_sk(sk)->backlog_len) -
+	       tcp_rmem_used(sk);
+}
+
 static inline int __mptcp_space(const struct sock *sk)
 {
-	return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) -
-				    READ_ONCE(mptcp_sk(sk)->backlog_len) -
-				    sk_rmem_alloc_get(sk));
+	return mptcp_win_from_space(sk, mptcp_rwnd_avail(sk));
 }
 
 static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 08/14] tcp: extend TCP_REPAIR_WINDOW for live and max-window snapshots
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Extend TCP_REPAIR_WINDOW so repair and restore can round-trip both the
live rwnd snapshot and the remembered maximum sender-visible window.

Keep the ABI append-only by accepting the legacy and v1 prefix lengths on
both get and set, rebuilding any missing max-window state from the live
window when older userspace restores a socket.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 include/net/tcp.h        | 13 +++----
 include/uapi/linux/tcp.h |  8 +++++
 net/ipv4/tcp.c           | 73 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 81 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5b479ad44f89..12e62fea2aaf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1766,13 +1766,14 @@ static inline bool tcp_space_from_wnd_snapshot(u8 scaling_ratio, int win,
 }
 
 /* Rebuild hard receive-memory units for data already covered by tp->rcv_wnd if
- * the advertise-time basis is known.
+ * the advertise-time basis is known. Legacy TCP_REPAIR restores can only
+ * recover tp->rcv_wnd itself; callers must fall back when the snapshot is
+ * unknown.
  */
 static inline bool tcp_space_from_rcv_wnd(const struct tcp_sock *tp, int win,
 					  int *space)
 {
-	return tcp_space_from_wnd_snapshot(tp->rcv_wnd_scaling_ratio, win,
-					   space);
+	return tcp_space_from_wnd_snapshot(tp->rcv_wnd_scaling_ratio, win, space);
 }
 
 /* Same as tcp_space_from_rcv_wnd(), but for the remembered maximum
@@ -1800,9 +1801,9 @@ static inline void tcp_scaling_ratio_init(struct sock *sk)
 }
 
 /* tp->rcv_wnd is paired with the scaling_ratio that was in force when that
- * window was last advertised. Callers can leave a zero snapshot when the
- * advertise-time basis is unknown and refresh the pair on the next local
- * window update.
+ * window was last advertised. Legacy TCP_REPAIR restores can only recover the
+ * window value itself and use a zero snapshot until a fresh local window
+ * advertisement refreshes the pair.
  */
 static inline void tcp_set_rcv_wnd_snapshot(struct tcp_sock *tp, u32 win,
 					    u8 scaling_ratio)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 03772dd4d399..564a77f69130 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -152,6 +152,11 @@ struct tcp_repair_opt {
 	__u32	opt_val;
 };
 
+/* Append-only repair ABI.
+ * Older userspace may pass a prefix that ends after rcv_wup (legacy) or
+ * after rcv_wnd_scaling_ratio (v1). The kernel accepts those prefix
+ * lengths and rebuilds any missing receive-window snapshot state on restore.
+ */
 struct tcp_repair_window {
 	__u32	snd_wl1;
 	__u32	snd_wnd;
@@ -159,6 +164,9 @@ struct tcp_repair_window {
 
 	__u32	rcv_wnd;
 	__u32	rcv_wup;
+	__u32	rcv_wnd_scaling_ratio;  /* 0 means live-window basis unknown */
+	__u32	rcv_mwnd_seq;
+	__u32	rcv_mwnd_scaling_ratio; /* 0 means max-window basis unknown */
 };
 
 enum {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 66706dbb90f5..39a1265876ea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3533,17 +3533,31 @@ static inline bool tcp_can_repair_sock(const struct sock *sk)
 		(sk->sk_state != TCP_LISTEN);
 }
 
+/* Keep accepting the pre-extension TCP_REPAIR_WINDOW layout so legacy
+ * userspace can restore sockets without fabricating a snapshot basis.
+ */
+static inline int tcp_repair_window_legacy_size(void)
+{
+	return offsetof(struct tcp_repair_window, rcv_wnd_scaling_ratio);
+}
+
+static inline int tcp_repair_window_v1_size(void)
+{
+	return offsetof(struct tcp_repair_window, rcv_mwnd_seq);
+}
+
 static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
 {
-	struct tcp_repair_window opt;
+	struct tcp_repair_window opt = {};
 
 	if (!tp->repair)
 		return -EPERM;
 
-	if (len != sizeof(opt))
+	if (len != tcp_repair_window_legacy_size() &&
+	    len != tcp_repair_window_v1_size() && len != sizeof(opt))
 		return -EINVAL;
 
-	if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
+	if (copy_from_sockptr(&opt, optbuf, len))
 		return -EFAULT;
 
 	if (opt.max_window < opt.snd_wnd)
@@ -3559,9 +3573,47 @@ static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
 	tp->snd_wnd	= opt.snd_wnd;
 	tp->max_window	= opt.max_window;
 
-	tp->rcv_wnd	= opt.rcv_wnd;
+	if (len == tcp_repair_window_legacy_size()) {
+		/* Legacy repair UAPI has no advertise-time basis for tp->rcv_wnd.
+		 * Mark the snapshot unknown until a fresh local advertisement
+		 * re-establishes the pair.
+		 */
+		tcp_set_rcv_wnd_unknown(tp, opt.rcv_wnd);
+		tp->rcv_wup	= opt.rcv_wup;
+		tcp_init_max_rcv_wnd_seq(tp);
+		return 0;
+	}
+
+	if (opt.rcv_wnd_scaling_ratio > U8_MAX)
+		return -EINVAL;
+
+	tcp_set_rcv_wnd_snapshot(tp, opt.rcv_wnd, opt.rcv_wnd_scaling_ratio);
 	tp->rcv_wup	= opt.rcv_wup;
-	tp->rcv_mwnd_seq = opt.rcv_wup + opt.rcv_wnd;
+
+	if (len == tcp_repair_window_v1_size()) {
+		/* v1 repair can restore the live-window snapshot, but not a
+		 * retracted max-window snapshot. Rebuild it from the live pair
+		 * until a fresh local advertisement updates it again.
+		 */
+		tcp_init_max_rcv_wnd_seq(tp);
+		return 0;
+	}
+
+	if (opt.rcv_mwnd_scaling_ratio > U8_MAX)
+		return -EINVAL;
+
+	/* Userspace may repair sequence-space values after checkpoint without
+	 * also rebasing the remembered max advertised right edge. If the exact
+	 * snapshot no longer covers the restored live window, treat it like
+	 * v1 and rebuild the max-window side from the live pair.
+	 */
+	if (after(opt.rcv_wup + opt.rcv_wnd, opt.rcv_mwnd_seq)) {
+		tcp_init_max_rcv_wnd_seq(tp);
+		return 0;
+	}
+
+	tp->rcv_mwnd_seq = opt.rcv_mwnd_seq;
+	tp->rcv_mwnd_scaling_ratio = opt.rcv_mwnd_scaling_ratio;
 
 	return 0;
 }
@@ -4650,12 +4702,16 @@ int do_tcp_getsockopt(struct sock *sk, int level,
 		break;
 
 	case TCP_REPAIR_WINDOW: {
-		struct tcp_repair_window opt;
+		struct tcp_repair_window opt = {};
 
 		if (copy_from_sockptr(&len, optlen, sizeof(int)))
 			return -EFAULT;
 
-		if (len != sizeof(opt))
+		/* Mirror the accepted set-side prefix lengths so checkpoint
+		 * tools can round-trip exactly the layout version they know.
+		 */
+		if (len != tcp_repair_window_legacy_size() &&
+		    len != tcp_repair_window_v1_size() && len != sizeof(opt))
 			return -EINVAL;
 
 		if (!tp->repair)
@@ -4666,6 +4722,9 @@ int do_tcp_getsockopt(struct sock *sk, int level,
 		opt.max_window	= tp->max_window;
 		opt.rcv_wnd	= tp->rcv_wnd;
 		opt.rcv_wup	= tp->rcv_wup;
+		opt.rcv_wnd_scaling_ratio = tp->rcv_wnd_scaling_ratio;
+		opt.rcv_mwnd_seq = tp->rcv_mwnd_seq;
+		opt.rcv_mwnd_scaling_ratio = tp->rcv_mwnd_scaling_ratio;
 
 		if (copy_to_sockptr(optval, &opt, len))
 			return -EFAULT;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 07/14] tcp: honor the maximum advertised window after live retraction
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

If receive-side accounting retracts the live rwnd below a larger
sender-visible window that was already advertised, allow one in-order
skb within that historical bound to have its receive-buffer backing
repaired (by regrowing sk_rcvbuf) and reach the normal receive path.

Hard receive-memory admission is still enforced through the existing
prune and collapse path. The rescue only changes how data already
inside sender-visible sequence space is classified and backed.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 net/ipv4/tcp_input.c | 92 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 86 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d76e4e4c0e57..4b9309c37e99 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5376,24 +5376,86 @@ static void tcp_ofo_queue(struct sock *sk)
 static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
 static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
 
+/* Sequence checks run against the sender-visible receive window before this
+ * point. If later receive-side accounting retracts the live receive window
+ * below the maximum right edge we already advertised, allow one in-order skb
+ * which still fits inside that sender-visible bound to reach the normal
+ * receive queue path.
+ *
+ * Keep receive-memory admission itself on the legacy hard-cap path so prune
+ * and collapse behavior stay aligned with the established retracted-window
+ * handling.
+ */
+static bool tcp_skb_in_retracted_window(const struct tcp_sock *tp,
+					const struct sk_buff *skb)
+{
+	u32 live_end = tp->rcv_nxt + tcp_receive_window(tp);
+	u32 max_end = tp->rcv_nxt + tcp_max_receive_window(tp);
+
+	return after(max_end, live_end) &&
+	       after(TCP_SKB_CB(skb)->end_seq, live_end) &&
+	       !after(TCP_SKB_CB(skb)->end_seq, max_end);
+}
+
 static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
 {
-	unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
+	return tcp_rmem_used(sk) <= READ_ONCE(sk->sk_rcvbuf);
+}
+
+/* Caller already established that @skb extends into the retracted-but-still-
+ * valid sender-visible window. For in-order progress, regrow sk_rcvbuf before
+ * falling into prune/forced-mem handling.
+ *
+ * This path intentionally repairs backing for one in-order skb that is already
+ * within sender-visible sequence space, rather than treating it like ordinary
+ * receive-buffer autotuning.
+ *
+ * Keep this rescue bounded to the span accepted by this skb instead of the
+ * full historical tp->rcv_mwnd_seq. However, never request less than
+ * skb->truesize, because sk_rmem_schedule() still charges hard memory, not
+ * sender-visible window bytes.
+ */
+static void tcp_try_grow_retracted_skb(struct sock *sk,
+				       const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int needed = skb->truesize;
+	int span_space;
+	u32 span_win;
+
+	if (TCP_SKB_CB(skb)->seq != tp->rcv_nxt)
+		return;
+
+	span_win = TCP_SKB_CB(skb)->end_seq - tp->rcv_nxt;
+	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+		span_win--;
+
+	if (tcp_space_from_rcv_mwnd(tp, span_win, &span_space))
+		needed = max_t(int, needed, span_space);
 
-	return rmem <= sk->sk_rcvbuf;
+	tcp_try_grow_rcvbuf(sk, needed);
 }
 
+/* Sender-visible window rescue does not relax hard receive-memory admission.
+ * If growth did not make room, fall back to the established prune/collapse
+ * path.
+ */
 static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
 				 unsigned int size)
 {
-	if (!tcp_can_ingest(sk, skb) ||
-	    !sk_rmem_schedule(sk, skb, size)) {
+	bool can_ingest = tcp_can_ingest(sk, skb);
+	bool scheduled = can_ingest && sk_rmem_schedule(sk, skb, size);
+
+	if (!scheduled) {
+		int pruned = tcp_prune_queue(sk, skb);
 
-		if (tcp_prune_queue(sk, skb) < 0)
+		if (pruned < 0)
 			return -1;
 
 		while (!sk_rmem_schedule(sk, skb, size)) {
-			if (!tcp_prune_ofo_queue(sk, skb))
+			bool pruned_ofo = tcp_prune_ofo_queue(sk, skb);
+
+			if (!pruned_ofo)
 				return -1;
 		}
 	}
@@ -5629,6 +5691,7 @@ void tcp_data_ready(struct sock *sk)
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	bool retracted;
 	enum skb_drop_reason reason;
 	bool fragstolen;
 	int eaten;
@@ -5647,6 +5710,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	}
 	tcp_cleanup_skb(skb);
 	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
+	retracted = skb->len && tcp_skb_in_retracted_window(tp, skb);
 
 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	tp->rx_opt.dsack = 0;
@@ -5667,6 +5731,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			    (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
 				goto queue_and_out;
 
+			if (retracted)
+				goto queue_and_out;
+
 			reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
 			goto out_of_window;
@@ -5674,7 +5741,20 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 
 		/* Ok. In sequence. In window. */
 queue_and_out:
+		if (unlikely(retracted))
+			tcp_try_grow_retracted_skb(sk, skb);
+
 		if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+			/* If the live rwnd collapsed to zero while rescuing an
+			 * skb that still fit in sender-visible sequence space,
+			 * report zero-window rather than generic proto-mem.
+			 */
+			if (unlikely(!tcp_receive_window(tp) && retracted)) {
+				reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
+				NET_INC_STATS(sock_net(sk),
+					      LINUX_MIB_TCPZEROWINDOWDROP);
+				goto out_of_window;
+			}
 			/* TODO: maybe ratelimit these WIN 0 ACK ? */
 			inet_csk(sk)->icsk_ack.pending |=
 					(ICSK_ACK_NOMEM | ICSK_ACK_NOW);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 06/14] tcp: regrow rcvbuf when scaling_ratio drops after advertisement
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

When tcp_measure_rcv_mss() lowers scaling_ratio after a window was
already advertised, grow sk_rcvbuf so the remaining live sender-visible
window still has matching hard receive-memory backing.

This repairs the live advertised window only. Retracted-window rescue is
handled separately in a later patch.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 net/ipv4/tcp_input.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 32256519a085..d76e4e4c0e57 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -221,6 +221,31 @@ static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff
 	rcu_read_unlock();
 }
 
+/* If scaling_ratio drops after we already advertised tp->rcv_wnd, grow
+ * sk_rcvbuf so the remaining live window still maps back to hard memory
+ * units under the old advertise-time basis.
+ */
+static void tcp_try_grow_advertised_window(struct sock *sk,
+					   const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int needed;
+
+	/* Keep this repair aligned with tcp_rcvbuf_grow(): do not adjust
+	 * receive-buffer backing for not-yet-accepted or orphaned sockets.
+	 */
+	if (!tcp_rcvbuf_grow_allowed(sk))
+		return;
+
+	if (!tcp_receive_window(tp))
+		return;
+
+	if (!tcp_space_from_rcv_wnd(tp, tcp_receive_window(tp), &needed))
+		return;
+
+	tcp_try_grow_rcvbuf(sk, needed);
+}
+
 /* Adapt the MSS value used to make delayed ack decision to the
  * real world.
  */
@@ -251,6 +276,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 			if (old_ratio != tcp_sk(sk)->scaling_ratio) {
 				struct tcp_sock *tp = tcp_sk(sk);
 
+				tcp_try_grow_advertised_window(sk, skb);
 				val = tcp_win_from_space(sk, sk->sk_rcvbuf);
 				tcp_set_window_clamp(sk, val);
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
more sender-visible window than the current hard receive-memory backing
can cover.

The new helper keeps backlog and memory-pressure limits in the same
units as the rest of the receive path, while __tcp_select_window()
backs any rounding slack before advertising it.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 include/net/tcp.h     | 12 ++++++++++++
 net/ipv4/tcp_input.c  | 36 ++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_output.c | 15 +++++++++++++--
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fc22ab6b80d5..5b479ad44f89 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
 enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
 void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
 void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
 void tcp_rcv_space_adjust(struct sock *sk);
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
 void tcp_twsk_destructor(struct sock *sk);
@@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
 	return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
 }
 
+/* Passive children clone the listener's sk_socket until accept() grafts
+ * their own struct socket, so only sockets that point back to themselves
+ * should autotune receive-buffer backing.
+ */
+static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
+{
+	struct socket *sock = READ_ONCE(sk->sk_socket);
+
+	return sock && READ_ONCE(sock->sk) == sk;
+}
+
 /* Note: caller must be prepared to deal with negative returns */
 static inline int tcp_space(const struct sock *sk)
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 352f814a4ff6..32256519a085 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk)
 				    (u32)TCP_INIT_CWND * tp->advmss);
 }
 
+/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed
+ * bytes beyond sk_rmem_alloc while preserving sender-visible headroom
+ * already consumed by sk_backlog.len.
+ */
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
+{
+	struct net *net = sock_net(sk);
+	int backlog;
+	int rmem2;
+	int target;
+
+	needed = max(needed, 0);
+	backlog = READ_ONCE(sk->sk_backlog.len);
+	target = tcp_rmem_used(sk) + backlog + needed;
+
+	if (target <= READ_ONCE(sk->sk_rcvbuf))
+		return true;
+
+	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+	if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
+	    tcp_under_memory_pressure(sk) ||
+	    sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+		return false;
+
+	WRITE_ONCE(sk->sk_rcvbuf,
+		   min_t(int, rmem2,
+			 max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
+
+	return target <= READ_ONCE(sk->sk_rcvbuf);
+}
+
 /* 4. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk)
 {
@@ -785,14 +817,14 @@ static void tcp_clamp_window(struct sock *sk)
 	icsk->icsk_ack.quick = 0;
 	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
 
-	if (sk->sk_rcvbuf < rmem2 &&
+	if (READ_ONCE(sk->sk_rcvbuf) < rmem2 &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
 	    !tcp_under_memory_pressure(sk) &&
 	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
 		WRITE_ONCE(sk->sk_rcvbuf,
 			   min(atomic_read(&sk->sk_rmem_alloc), rmem2));
 	}
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 57a2a6daaad3..53781cf591d2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk)
 	 * scaled window will not line up with the MSS boundary anyway.
 	 */
 	if (tp->rx_opt.rcv_wscale) {
+		int rcv_wscale = 1 << tp->rx_opt.rcv_wscale;
+
 		window = free_space;
 
 		/* Advertise enough space so that it won't get scaled away.
-		 * Import case: prevent zero window announcement if
+		 * Important case: prevent zero-window announcement if
 		 * 1<<rcv_wscale > mss.
 		 */
-		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
+		window = ALIGN(window, rcv_wscale);
+
+		/* Back any scale-quantization slack before we expose it.
+		 * Otherwise tcp_can_ingest() can reject data which is still
+		 * within the sender-visible window.
+		 */
+		if (window > free_space &&
+		    (!tcp_rcvbuf_grow_allowed(sk) ||
+		     !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window))))
+			window = round_down(free_space, rcv_wscale);
 	} else {
 		window = tp->rcv_wnd;
 		/* Get the largest window that is a nice multiple of mss.
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 04/14] tcp: snapshot the maximum advertised receive window
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Track the maximum sender-visible receive-window right edge separately
from the live rwnd, along with the scaling basis that was in force when
that larger window was advertised.

This gives later admission and restore paths enough information to
reason about retracted windows without losing the original sender-
visible bound.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 .../networking/net_cachelines/tcp_sock.rst    |  1 +
 include/linux/tcp.h                           |  1 +
 include/net/tcp.h                             | 21 ++++++++++++++++++-
 net/ipv4/tcp.c                                |  1 +
 net/ipv4/tcp_fastopen.c                       |  2 +-
 net/ipv4/tcp_input.c                          |  4 ++--
 net/ipv4/tcp_minisocks.c                      |  2 +-
 net/ipv4/tcp_output.c                         |  2 +-
 8 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
index 09ece1c59c2d..d58a3b1eb55d 100644
--- a/Documentation/networking/net_cachelines/tcp_sock.rst
+++ b/Documentation/networking/net_cachelines/tcp_sock.rst
@@ -11,6 +11,7 @@ Type                          Name                    fastpath_tx_access  fastpa
 struct inet_connection_sock   inet_conn
 u16                           tcp_header_len          read_mostly         read_mostly         tcp_bound_to_half_wnd,tcp_current_mss(tx);tcp_rcv_established(rx)
 u16                           gso_segs                read_mostly                             tcp_xmit_size_goal
+u8                            rcv_mwnd_scaling_ratio  read_write          read_mostly         tcp_init_max_rcv_wnd_seq,tcp_update_max_rcv_wnd_seq,tcp_repair_set_window,do_tcp_getsockopt
 u8                            rcv_wnd_scaling_ratio   read_write          read_mostly         tcp_set_rcv_wnd,tcp_can_ingest,tcp_repair_set_window,do_tcp_getsockopt
 __be32                        pred_flags              read_write          read_mostly         tcp_select_window(tx);tcp_rcv_established(rx)
 u64                           bytes_received                              read_write          tcp_rcv_nxt_update(rx)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 2ace563d59d6..e5d7a65ac439 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -297,6 +297,7 @@ struct tcp_sock {
 		est_ecnfield:2,/* ECN field for AccECN delivered estimates */
 		accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
 		prev_ecnfield:2; /* ECN bits from the previous segment */
+	u8	rcv_mwnd_scaling_ratio; /* 0 if unknown, else tp->rcv_mwnd_seq basis */
 	u8	rcv_wnd_scaling_ratio; /* 0 if unknown, else tp->rcv_wnd basis */
 	__be32	pred_flags;
 	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6fa7cdb0979e..fc22ab6b80d5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -947,13 +947,21 @@ static inline u32 tcp_max_receive_window(const struct tcp_sock *tp)
 	return (u32) win;
 }
 
+static inline void tcp_init_max_rcv_wnd_seq(struct tcp_sock *tp)
+{
+	tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd;
+	tp->rcv_mwnd_scaling_ratio = tp->rcv_wnd_scaling_ratio;
+}
+
 /* Check if we need to update the maximum receive window sequence number */
 static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp)
 {
 	u32 wre = tp->rcv_wup + tp->rcv_wnd;
 
-	if (after(wre, tp->rcv_mwnd_seq))
+	if (after(wre, tp->rcv_mwnd_seq)) {
 		tp->rcv_mwnd_seq = wre;
+		tp->rcv_mwnd_scaling_ratio = tp->rcv_wnd_scaling_ratio;
+	}
 }
 
 /* Choose a new window, without checks for shrinking, and without
@@ -1766,6 +1774,16 @@ static inline bool tcp_space_from_rcv_wnd(const struct tcp_sock *tp, int win,
 					   space);
 }
 
+/* Same as tcp_space_from_rcv_wnd(), but for the remembered maximum
+ * sender-visible receive window.
+ */
+static inline bool tcp_space_from_rcv_mwnd(const struct tcp_sock *tp, int win,
+					   int *space)
+{
+	return tcp_space_from_wnd_snapshot(tp->rcv_mwnd_scaling_ratio, win,
+					   space);
+}
+
 /* Assume a 50% default for skb->len/skb->truesize ratio.
  * This may be adjusted later in tcp_measure_rcv_mss().
  */
@@ -1776,6 +1794,7 @@ static inline void tcp_scaling_ratio_init(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
+	tp->rcv_mwnd_scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
 	tp->rcv_wnd_scaling_ratio = TCP_DEFAULT_SCALING_RATIO;
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0383ee8d3b78..66706dbb90f5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5275,6 +5275,7 @@ static void __init tcp_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_scaling_ratio);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd_scaling_ratio);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_seq);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 4e389d609f91..56113cf2a165 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -377,7 +377,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 
 	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
 	tp->rcv_wup = tp->rcv_nxt;
-	tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd;
+	tcp_init_max_rcv_wnd_seq(tp);
 	/* tcp_conn_request() is sending the SYNACK,
 	 * and queues the child into listener accept queue.
 	 */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b8e65e31255e..352f814a4ff6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6902,7 +6902,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		 */
 		WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
 		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
-		tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd;
+		tcp_init_max_rcv_wnd_seq(tp);
 
 		/* RFC1323: The window in SYN & SYN/ACK segments is
 		 * never scaled.
@@ -7015,7 +7015,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
 		WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
 		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
-		tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd;
+		tcp_init_max_rcv_wnd_seq(tp);
 
 		/* RFC1323: The window in SYN & SYN/ACK segments is
 		 * never scaled.
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1c02c9cd13fe..85bd9580caf9 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -604,7 +604,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->window_clamp = req->rsk_window_clamp;
 	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
 	tcp_set_rcv_wnd(newtp, req->rsk_rcv_wnd);
-	newtp->rcv_mwnd_seq = newtp->rcv_wup + req->rsk_rcv_wnd;
+	tcp_init_max_rcv_wnd_seq(newtp);
 	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
 	if (newtp->rx_opt.wscale_ok) {
 		newtp->rx_opt.snd_wscale = ireq->snd_wscale;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0b082726d7c4..57a2a6daaad3 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -4171,7 +4171,7 @@ static void tcp_connect_init(struct sock *sk)
 	else
 		tp->rcv_tstamp = tcp_jiffies32;
 	tp->rcv_wup = tp->rcv_nxt;
-	tp->rcv_mwnd_seq = tp->rcv_nxt + tp->rcv_wnd;
+	tcp_init_max_rcv_wnd_seq(tp);
 	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
 
 	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 03/14] tcp: refresh rcv_wnd snapshots at TCP write sites
From: atwellwea @ 2026-03-14 20:13 UTC (permalink / raw)
  To: netdev, davem, kuba, pabeni, edumazet, ncardwell
  Cc: linux-kernel, linux-api, linux-doc, linux-kselftest,
	linux-trace-kernel, mptcp, dsahern, horms, kuniyu, andrew+netdev,
	willemdebruijn.kernel, jasowang, skhan, corbet, matttbe,
	martineau, geliang, rostedt, mhiramat, mathieu.desnoyers,
	0x7f454c46
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>

From: Wesley Atwell <atwellwea@gmail.com>

Refresh the live rwnd snapshot whenever TCP updates tp->rcv_wnd at the
normal write sites, including child setup, tcp_select_window(), and the
initial connect-time window selection.

This keeps the live sender-visible window paired with the scaling basis
that was actually advertised.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 net/ipv4/tcp_minisocks.c | 2 +-
 net/ipv4/tcp_output.c    | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d350d794a959..1c02c9cd13fe 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -603,7 +603,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->rx_opt.sack_ok = ireq->sack_ok;
 	newtp->window_clamp = req->rsk_window_clamp;
 	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
-	newtp->rcv_wnd = req->rsk_rcv_wnd;
+	tcp_set_rcv_wnd(newtp, req->rsk_rcv_wnd);
 	newtp->rcv_mwnd_seq = newtp->rcv_wup + req->rsk_rcv_wnd;
 	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
 	if (newtp->rx_opt.wscale_ok) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 35c3b0ab5a0c..0b082726d7c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -291,7 +291,7 @@ static u16 tcp_select_window(struct sock *sk)
 	 */
 	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) {
 		tp->pred_flags = 0;
-		tp->rcv_wnd = 0;
+		tcp_set_rcv_wnd(tp, 0);
 		tp->rcv_wup = tp->rcv_nxt;
 		tcp_update_max_rcv_wnd_seq(tp);
 		return 0;
@@ -315,7 +315,7 @@ static u16 tcp_select_window(struct sock *sk)
 		}
 	}
 
-	tp->rcv_wnd = new_win;
+	tcp_set_rcv_wnd(tp, new_win);
 	tp->rcv_wup = tp->rcv_nxt;
 	tcp_update_max_rcv_wnd_seq(tp);
 
@@ -4148,6 +4148,10 @@ static void tcp_connect_init(struct sock *sk)
 				  READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
 				  &rcv_wscale,
 				  rcv_wnd);
+	/* tcp_select_initial_window() filled tp->rcv_wnd through its out-param,
+	 * so snapshot the scaling_ratio we will use for that initial rwnd.
+	 */
+	tcp_set_rcv_wnd(tp, tp->rcv_wnd);
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
 	tp->rcv_ssthresh = tp->rcv_wnd;
-- 
2.43.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox