From: atwellwea@gmail.com
To: netdev@vger.kernel.org, davem@davemloft.net, kuba@kernel.org,
pabeni@redhat.com, edumazet@google.com, ncardwell@google.com
Cc: linux-kernel@vger.kernel.org, linux-api@vger.kernel.org,
linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org,
linux-trace-kernel@vger.kernel.org, mptcp@lists.linux.dev,
dsahern@kernel.org, horms@kernel.org, kuniyu@google.com,
andrew+netdev@lunn.ch, willemdebruijn.kernel@gmail.com,
jasowang@redhat.com, skhan@linuxfoundation.org, corbet@lwn.net,
matttbe@kernel.org, martineau@kernel.org, geliang@kernel.org,
rostedt@goodmis.org, mhiramat@kernel.org,
mathieu.desnoyers@efficios.com, 0x7f454c46@gmail.com
Subject: [PATCH net-next v2 08/14] tcp: extend TCP_REPAIR_WINDOW for live and max-window snapshots
Date: Sat, 14 Mar 2026 14:13:42 -0600 [thread overview]
Message-ID: <20260314201348.1786972-9-atwellwea@gmail.com> (raw)
In-Reply-To: <20260314201348.1786972-1-atwellwea@gmail.com>
From: Wesley Atwell <atwellwea@gmail.com>
Extend TCP_REPAIR_WINDOW so repair and restore can round-trip both the
live rwnd snapshot and the remembered maximum sender-visible window.
Keep the ABI append-only by accepting the legacy and v1 prefix lengths on
both get and set. When older userspace restores a socket, a legacy-length
restore marks the live-window basis unknown, and both legacy and v1
restores rebuild the missing max-window state from the live window.
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
include/net/tcp.h | 13 +++----
include/uapi/linux/tcp.h | 8 +++++
net/ipv4/tcp.c | 73 ++++++++++++++++++++++++++++++++++++----
3 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5b479ad44f89..12e62fea2aaf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1766,13 +1766,14 @@ static inline bool tcp_space_from_wnd_snapshot(u8 scaling_ratio, int win,
}
/* Rebuild hard receive-memory units for data already covered by tp->rcv_wnd if
- * the advertise-time basis is known.
+ * the advertise-time basis is known. Legacy TCP_REPAIR restores can only
+ * recover tp->rcv_wnd itself; callers must fall back when the snapshot is
+ * unknown.
*/
static inline bool tcp_space_from_rcv_wnd(const struct tcp_sock *tp, int win,
int *space)
{
- return tcp_space_from_wnd_snapshot(tp->rcv_wnd_scaling_ratio, win,
- space);
+ return tcp_space_from_wnd_snapshot(tp->rcv_wnd_scaling_ratio, win, space);
}
/* Same as tcp_space_from_rcv_wnd(), but for the remembered maximum
@@ -1800,9 +1801,9 @@ static inline void tcp_scaling_ratio_init(struct sock *sk)
}
/* tp->rcv_wnd is paired with the scaling_ratio that was in force when that
- * window was last advertised. Callers can leave a zero snapshot when the
- * advertise-time basis is unknown and refresh the pair on the next local
- * window update.
+ * window was last advertised. Legacy TCP_REPAIR restores can only recover the
+ * window value itself and use a zero snapshot until a fresh local window
+ * advertisement refreshes the pair.
*/
static inline void tcp_set_rcv_wnd_snapshot(struct tcp_sock *tp, u32 win,
u8 scaling_ratio)
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 03772dd4d399..564a77f69130 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -152,6 +152,11 @@ struct tcp_repair_opt {
__u32 opt_val;
};
+/* Append-only repair ABI.
+ * Older userspace may stop at rcv_wup or rcv_wnd_scaling_ratio.
+ * The kernel accepts those prefix lengths and rebuilds any missing
+ * receive-window snapshot state on restore.
+ */
struct tcp_repair_window {
__u32 snd_wl1;
__u32 snd_wnd;
@@ -159,6 +164,9 @@ struct tcp_repair_window {
__u32 rcv_wnd;
__u32 rcv_wup;
+ __u32 rcv_wnd_scaling_ratio; /* 0 means live-window basis unknown */
+ __u32 rcv_mwnd_seq;
+ __u32 rcv_mwnd_scaling_ratio; /* 0 means max-window basis unknown */
};
enum {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 66706dbb90f5..39a1265876ea 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3533,17 +3533,31 @@ static inline bool tcp_can_repair_sock(const struct sock *sk)
(sk->sk_state != TCP_LISTEN);
}
+/* Keep accepting the pre-extension TCP_REPAIR_WINDOW layout so legacy
+ * userspace can restore sockets without fabricating a snapshot basis.
+ */
+static inline int tcp_repair_window_legacy_size(void)
+{
+ return offsetof(struct tcp_repair_window, rcv_wnd_scaling_ratio);
+}
+
+static inline int tcp_repair_window_v1_size(void)
+{
+ return offsetof(struct tcp_repair_window, rcv_mwnd_seq);
+}
+
static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
{
- struct tcp_repair_window opt;
+ struct tcp_repair_window opt = {};
if (!tp->repair)
return -EPERM;
- if (len != sizeof(opt))
+ if (len != tcp_repair_window_legacy_size() &&
+ len != tcp_repair_window_v1_size() && len != sizeof(opt))
return -EINVAL;
- if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
+ if (copy_from_sockptr(&opt, optbuf, len))
return -EFAULT;
if (opt.max_window < opt.snd_wnd)
@@ -3559,9 +3573,47 @@ static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
tp->snd_wnd = opt.snd_wnd;
tp->max_window = opt.max_window;
- tp->rcv_wnd = opt.rcv_wnd;
+ if (len == tcp_repair_window_legacy_size()) {
+ /* Legacy repair UAPI has no advertise-time basis for tp->rcv_wnd.
+ * Mark the snapshot unknown until a fresh local advertisement
+ * re-establishes the pair.
+ */
+ tcp_set_rcv_wnd_unknown(tp, opt.rcv_wnd);
+ tp->rcv_wup = opt.rcv_wup;
+ tcp_init_max_rcv_wnd_seq(tp);
+ return 0;
+ }
+
+ if (opt.rcv_wnd_scaling_ratio > U8_MAX)
+ return -EINVAL;
+
+ tcp_set_rcv_wnd_snapshot(tp, opt.rcv_wnd, opt.rcv_wnd_scaling_ratio);
tp->rcv_wup = opt.rcv_wup;
- tp->rcv_mwnd_seq = opt.rcv_wup + opt.rcv_wnd;
+
+ if (len == tcp_repair_window_v1_size()) {
+ /* v1 repair can restore the live-window snapshot, but not a
+ * retracted max-window snapshot. Rebuild it from the live pair
+ * until a fresh local advertisement updates it again.
+ */
+ tcp_init_max_rcv_wnd_seq(tp);
+ return 0;
+ }
+
+ if (opt.rcv_mwnd_scaling_ratio > U8_MAX)
+ return -EINVAL;
+
+ /* Userspace may repair sequence-space values after checkpoint without
+ * also rebasing the remembered max advertised right edge. If the exact
+ * snapshot no longer covers the restored live window, treat it like
+ * v1 and rebuild the max-window side from the live pair.
+ */
+ if (after(opt.rcv_wup + opt.rcv_wnd, opt.rcv_mwnd_seq)) {
+ tcp_init_max_rcv_wnd_seq(tp);
+ return 0;
+ }
+
+ tp->rcv_mwnd_seq = opt.rcv_mwnd_seq;
+ tp->rcv_mwnd_scaling_ratio = opt.rcv_mwnd_scaling_ratio;
return 0;
}
@@ -4650,12 +4702,16 @@ int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_REPAIR_WINDOW: {
- struct tcp_repair_window opt;
+ struct tcp_repair_window opt = {};
if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- if (len != sizeof(opt))
+ /* Mirror the accepted set-side prefix lengths so checkpoint
+ * tools can round-trip exactly the layout version they know.
+ */
+ if (len != tcp_repair_window_legacy_size() &&
+ len != tcp_repair_window_v1_size() && len != sizeof(opt))
return -EINVAL;
if (!tp->repair)
@@ -4666,6 +4722,9 @@ int do_tcp_getsockopt(struct sock *sk, int level,
opt.max_window = tp->max_window;
opt.rcv_wnd = tp->rcv_wnd;
opt.rcv_wup = tp->rcv_wup;
+ opt.rcv_wnd_scaling_ratio = tp->rcv_wnd_scaling_ratio;
+ opt.rcv_mwnd_seq = tp->rcv_mwnd_seq;
+ opt.rcv_mwnd_scaling_ratio = tp->rcv_mwnd_scaling_ratio;
if (copy_to_sockptr(optval, &opt, len))
return -EFAULT;
--
2.43.0
next prev parent reply other threads:[~2026-03-14 20:14 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-14 20:13 [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 01/14] tcp: factor receive-memory accounting helpers atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 02/14] tcp: snapshot advertise-time scaling for rcv_wnd atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 03/14] tcp: refresh rcv_wnd snapshots at TCP write sites atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 04/14] tcp: snapshot the maximum advertised receive window atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 05/14] tcp: grow rcvbuf to back scaled-window quantization slack atwellwea
2026-03-16 11:04 ` Paolo Abeni
2026-03-16 11:24 ` Paolo Abeni
2026-03-16 11:31 ` Paolo Abeni
2026-03-14 20:13 ` [PATCH net-next v2 06/14] tcp: regrow rcvbuf when scaling_ratio drops after advertisement atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 07/14] tcp: honor the maximum advertised window after live retraction atwellwea
2026-03-16 11:44 ` Paolo Abeni
2026-03-14 20:13 ` atwellwea [this message]
2026-03-14 20:13 ` [PATCH net-next v2 09/14] mptcp: refresh TCP receive-window snapshots on subflows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 10/14] tcp: expose rmem and backlog in tcp and mptcp rcvbuf_grow tracepoints atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 11/14] selftests: tcp_ao: cover legacy, v1, and retracted repair windows atwellwea
2026-03-14 20:13 ` [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests atwellwea
2026-03-15 1:18 ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 13/14] netdevsim: add peer RX truesize support for selftests atwellwea
2026-03-15 1:18 ` Jakub Kicinski
2026-03-14 20:13 ` [PATCH net-next v2 14/14] netdevsim: release pinned PSP ext on drop paths atwellwea
2026-03-15 1:19 ` [PATCH net-next v2 00/14] tcp: preserve receive-window accounting across ratio drift Jakub Kicinski
2026-03-16 11:09 ` Paolo Abeni
[not found] ` <CAN=sVvyNpkyok_bt8eQSmqc4f7g7QoZBUmRmNRLoFz1HasEzMA@mail.gmail.com>
2026-03-16 17:47 ` Paolo Abeni
2026-03-16 18:03 ` Wesley Atwell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260314201348.1786972-9-atwellwea@gmail.com \
--to=atwellwea@gmail.com \
--cc=0x7f454c46@gmail.com \
--cc=andrew+netdev@lunn.ch \
--cc=corbet@lwn.net \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=geliang@kernel.org \
--cc=horms@kernel.org \
--cc=jasowang@redhat.com \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=linux-api@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=linux-trace-kernel@vger.kernel.org \
--cc=martineau@kernel.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=matttbe@kernel.org \
--cc=mhiramat@kernel.org \
--cc=mptcp@lists.linux.dev \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=rostedt@goodmis.org \
--cc=skhan@linuxfoundation.org \
--cc=willemdebruijn.kernel@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.