From: atwellwea@gmail.com
To: netdev@vger.kernel.org, davem@davemloft.net, kuba@kernel.org,
pabeni@redhat.com, edumazet@google.com, ncardwell@google.com
Cc: linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
linux-trace-kernel@vger.kernel.org, mptcp@lists.linux.dev,
dsahern@kernel.org, horms@kernel.org, kuniyu@google.com,
andrew+netdev@lunn.ch, willemdebruijn.kernel@gmail.com,
jasowang@redhat.com, skhan@linuxfoundation.org, corbet@lwn.net,
matttbe@kernel.org, martineau@kernel.org, geliang@kernel.org,
rostedt@goodmis.org, mhiramat@kernel.org,
mathieu.desnoyers@efficios.com, 0x7f454c46@gmail.com
Subject: [PATCH net-next v2 08/14] tcp: extend TCP_REPAIR_WINDOW for live and max-window snapshots
Date: Sat, 14 Mar 2026 14:13:42 -0600 [thread overview]
Message-ID: <20260314201348.1786972-9-atwellwea@gmail.com> (raw)
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>
From: Wesley Atwell <atwellwea@gmail.com>
Extend TCP_REPAIR_WINDOW so repair and restore can round-trip both the
live rwnd snapshot and the remembered maximum sender-visible window.
Keep the ABI append-only by accepting the legacy and v1 prefix lengths on
both get and set. When older userspace restores a socket, a legacy-length
restore marks the live-window snapshot basis unknown, and both legacy and
v1 restores rebuild the missing max-window state from the live window.
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
include/net/tcp.h | 13 +++----
include/uapi/linux/tcp.h | 8 +++++
net/ipv4/tcp.c | 73 ++++++++++++++++++++++++++++++++++++----
3 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5b479ad44f89..12e62fea2aaf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1766,13 +1766,14 @@ static inline bool tcp_space_from_wnd_snapshot(u8 scaling_ratio, int win,
}
/* Rebuild hard receive-memory units for data already covered by tp->rcv_wnd if
- * the advertise-time basis is known.
+ * the advertise-time basis is known. Legacy TCP_REPAIR restores can only
+ * recover tp->rcv_wnd itself; callers must fall back when the snapshot is
+ * unknown.
*/
static inline bool tcp_space_from_rcv_wnd(const struct tcp_sock *tp, int win,
int *space)
{
- return tcp_space_from_wnd_snapshot(tp->rcv_wnd_scaling_ratio, win,
- space);
+ return tcp_space_from_wnd_snapshot(tp->rcv_wnd_scaling_ratio, win, space);
}
/* Same as tcp_space_from_rcv_wnd(), but for the remembered maximum
@@ -1800,9 +1801,9 @@ static inline void tcp_scaling_ratio_init(struct sock *sk)
}
/* tp->rcv_wnd is paired with the scaling_ratio that was in force when that
- * window was last advertised. Callers can leave a zero snapshot when the
- * advertise-time basis is unknown and refresh the pair on the next local
- * window update.
+ * window was last advertised. Legacy TCP_REPAIR restores can only recover the
+ * window value itself and use a zero snapshot until a fresh local window
+ * advertisement refreshes the pair.
*/
static inline void tcp_set_rcv_wnd_snapshot(struct tcp_sock *tp, u32 win,
u8 scaling_ratio)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 03772dd4d399..564a77f69130 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -152,6 +152,11 @@ struct tcp_repair_opt {
__u32 opt_val;
};
+/* Append-only repair ABI.
+ * Older userspace may pass a layout that ends after rcv_wup (legacy) or
+ * after rcv_wnd_scaling_ratio (v1). The kernel accepts those prefix
+ * lengths and rebuilds any missing receive-window snapshot state on restore.
+ */
struct tcp_repair_window {
__u32 snd_wl1;
__u32 snd_wnd;
@@ -159,6 +164,9 @@ struct tcp_repair_window {
__u32 rcv_wnd;
__u32 rcv_wup;
+ __u32 rcv_wnd_scaling_ratio; /* 0 means live-window basis unknown */
+ __u32 rcv_mwnd_seq;
+ __u32 rcv_mwnd_scaling_ratio; /* 0 means max-window basis unknown */
};
enum {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 66706dbb90f5..39a1265876ea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3533,17 +3533,31 @@ static inline bool tcp_can_repair_sock(const struct sock *sk)
(sk->sk_state != TCP_LISTEN);
}
+/* Keep accepting the pre-extension TCP_REPAIR_WINDOW layout so legacy
+ * userspace can restore sockets without fabricating a snapshot basis.
+ */
+static inline int tcp_repair_window_legacy_size(void)
+{
+ return offsetof(struct tcp_repair_window, rcv_wnd_scaling_ratio);
+}
+
+static inline int tcp_repair_window_v1_size(void)
+{
+ return offsetof(struct tcp_repair_window, rcv_mwnd_seq);
+}
+
static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
{
- struct tcp_repair_window opt;
+ struct tcp_repair_window opt = {};
if (!tp->repair)
return -EPERM;
- if (len != sizeof(opt))
+ if (len != tcp_repair_window_legacy_size() &&
+ len != tcp_repair_window_v1_size() && len != sizeof(opt))
return -EINVAL;
- if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
+ if (copy_from_sockptr(&opt, optbuf, len))
return -EFAULT;
if (opt.max_window < opt.snd_wnd)
@@ -3559,9 +3573,47 @@ static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
tp->snd_wnd = opt.snd_wnd;
tp->max_window = opt.max_window;
- tp->rcv_wnd = opt.rcv_wnd;
+ if (len == tcp_repair_window_legacy_size()) {
+ /* Legacy repair UAPI has no advertise-time basis for tp->rcv_wnd.
+ * Mark the snapshot unknown until a fresh local advertisement
+ * re-establishes the pair.
+ */
+ tcp_set_rcv_wnd_unknown(tp, opt.rcv_wnd);
+ tp->rcv_wup = opt.rcv_wup;
+ tcp_init_max_rcv_wnd_seq(tp);
+ return 0;
+ }
+
+ if (opt.rcv_wnd_scaling_ratio > U8_MAX)
+ return -EINVAL;
+
+ tcp_set_rcv_wnd_snapshot(tp, opt.rcv_wnd, opt.rcv_wnd_scaling_ratio);
tp->rcv_wup = opt.rcv_wup;
- tp->rcv_mwnd_seq = opt.rcv_wup + opt.rcv_wnd;
+
+ if (len == tcp_repair_window_v1_size()) {
+ /* v1 repair can restore the live-window snapshot, but not a
+ * retracted max-window snapshot. Rebuild it from the live pair
+ * until a fresh local advertisement updates it again.
+ */
+ tcp_init_max_rcv_wnd_seq(tp);
+ return 0;
+ }
+
+ if (opt.rcv_mwnd_scaling_ratio > U8_MAX)
+ return -EINVAL;
+
+ /* Userspace may repair sequence-space values after checkpoint without
+ * also rebasing the remembered max advertised right edge. If the exact
+ * snapshot no longer covers the restored live window, treat it like
+ * v1 and rebuild the max-window side from the live pair.
+ */
+ if (after(opt.rcv_wup + opt.rcv_wnd, opt.rcv_mwnd_seq)) {
+ tcp_init_max_rcv_wnd_seq(tp);
+ return 0;
+ }
+
+ tp->rcv_mwnd_seq = opt.rcv_mwnd_seq;
+ tp->rcv_mwnd_scaling_ratio = opt.rcv_mwnd_scaling_ratio;
return 0;
}
@@ -4650,12 +4702,16 @@ int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_REPAIR_WINDOW: {
- struct tcp_repair_window opt;
+ struct tcp_repair_window opt = {};
if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- if (len != sizeof(opt))
+ /* Mirror the accepted set-side prefix lengths so checkpoint
+ * tools can round-trip exactly the layout version they know.
+ */
+ if (len != tcp_repair_window_legacy_size() &&
+ len != tcp_repair_window_v1_size() && len != sizeof(opt))
return -EINVAL;
if (!tp->repair)
@@ -4666,6 +4722,9 @@ int do_tcp_getsockopt(struct sock *sk, int level,
opt.max_window = tp->max_window;
opt.rcv_wnd = tp->rcv_wnd;
opt.rcv_wup = tp->rcv_wup;
+ opt.rcv_wnd_scaling_ratio = tp->rcv_wnd_scaling_ratio;
+ opt.rcv_mwnd_seq = tp->rcv_mwnd_seq;
+ opt.rcv_mwnd_scaling_ratio = tp->rcv_mwnd_scaling_ratio;
if (copy_to_sockptr(optval, &opt, len))
return -EFAULT;
--
2.43.0
next prev parent reply other threads:[~2026-03-14 20:14 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-14 20:13 [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 01/14] tcp: factor receive-memory accounting helpers atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 02/14] tcp: snapshot advertise-time scaling for rcv_wnd atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 03/14] tcp: refresh rcv_wnd snapshots at TCP write sites atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 04/14] tcp: snapshot the maximum advertised receive window atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack atwellwea
2026-03-16 11:04 ` Paolo Abeni
2026-03-16 11:24 ` Paolo Abeni
2026-03-16 11:31 ` Paolo Abeni
2026-03-14 20:13 ` [PATCH net-next v2 06/14] tcp: regrow rcvbuf when scaling_ratio drops after advertisement atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 07/14] tcp: honor the maximum advertised window after live retraction atwellwea
2026-03-16 11:44 ` Paolo Abeni
2026-03-14 20:13 ` atwellwea [this message]
2026-03-14 20:13 ` [PATCH net-next v2 09/14] mptcp: refresh TCP receive-window snapshots on subflows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 10/14] tcp: expose rmem and backlog in tcp and mptcp rcvbuf_grow tracepoints atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 11/14] selftests: tcp_ao: cover legacy, v1, and retracted repair windows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests atwellwea
2026-03-15 1:18 ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 13/14] netdevsim: add peer RX truesize support for selftests atwellwea
2026-03-15 1:18 ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 14/14] netdevsim: release pinned PSP ext on drop paths atwellwea
2026-03-15 1:19 ` [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift Jakub Kicinski
2026-03-16 11:09 ` Paolo Abeni
[not found] ` <CAN=sVvyNpkyok_bt8eQSmqc4f7g7QoZBUmRmNRLoFz1HasEzMA@mail.gmail.com>
2026-03-16 17:47 ` Paolo Abeni
2026-03-16 18:03 ` Wesley Atwell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260314201348.1786972-9-atwellwea@gmail.com \
--to=atwellwea@gmail.com \
--cc=0x7f454c46@gmail.com \
--cc=andrew+netdev@lunn.ch \
--cc=corbet@lwn.net \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=geliang@kernel.org \
--cc=horms@kernel.org \
--cc=jasowang@redhat.com \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=martineau@kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=matttbe@kernel.org \
--cc=mhiramat@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=rostedt@goodmis.org \
--cc=skhan@linuxfoundation.org \
--cc=willemdebruijn.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox