From: Marek Majkowski <marek@cloudflare.com>
To: netdev@vger.kernel.org
Cc: bpf@vger.kernel.org, kernel-team@cloudflare.com,
ivan@cloudflare.com, edumazet@google.com, davem@davemloft.net,
kuba@kernel.org, pabeni@redhat.com, ast@kernel.org,
daniel@iogearbox.net, andrii@kernel.org,
Marek Majkowski <marek@cloudflare.com>
Subject: [PATCH net-next 1/2] RTAX_INITRWND should be able to bring the rcv_ssthresh above 64KiB
Date: Thu, 21 Jul 2022 17:10:40 +0200 [thread overview]
Message-ID: <20220721151041.1215017-2-marek@cloudflare.com> (raw)
In-Reply-To: <20220721151041.1215017-1-marek@cloudflare.com>
We already support the RTAX_INITRWND / initrwnd path attribute:
$ ip route change local 127.0.0.0/8 dev lo initrwnd 1024
Normally, however, the initial advertised receive window is limited to
64KiB by rcv_ssthresh, regardless of initrwnd. This patch changes
that, bumping rcv_ssthresh up to a value derived from initrwnd. This
allows for larger initial advertised receive windows, which is useful
for specific types of TCP flows: big BDP ones, where there is a lot of
data to send immediately after the flow is established.
There are three places where we initialize sockets:
- tcp_output:tcp_connect_init
- tcp_minisocks:tcp_openreq_init_rwin
- syncookies
In the first two we already have a call to `tcp_rwnd_init_bpf` and
`dst_metric(RTAX_INITRWND)` which retrieve the bpf/path initrwnd
attribute. We use this value to bring `rcv_ssthresh` up, potentially
above the traditional 64KiB.
With a higher initial `rcv_ssthresh` the receiver will open the receive
window more aggressively, which can help large BDP flows, i.e. flows
with both high throughput and high latency.
This patch does not cover the syncookies case.
Signed-off-by: Marek Majkowski <marek@cloudflare.com>
---
include/net/inet_sock.h | 1 +
net/ipv4/tcp_minisocks.c | 8 ++++++--
net/ipv4/tcp_output.c | 10 ++++++++--
3 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index daead5fb389a..bc68c9b70942 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -89,6 +89,7 @@ struct inet_request_sock {
no_srccheck: 1,
smc_ok : 1;
u32 ir_mark;
+ u32 rcv_ssthresh;
union {
struct ip_options_rcu __rcu *ireq_opt;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6854bb1fb32b..89ba2a30a012 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -360,6 +360,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
u32 window_clamp;
__u8 rcv_wscale;
u32 rcv_wnd;
+ int adj_mss;
int mss;
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -378,15 +379,18 @@ void tcp_openreq_init_rwin(struct request_sock *req,
else if (full_space < rcv_wnd * mss)
full_space = rcv_wnd * mss;
+ adj_mss = mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(sk_listener, full_space,
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ adj_mss,
&req->rsk_rcv_wnd,
&req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
rcv_wnd);
ireq->rcv_wscale = rcv_wscale;
+ ireq->rcv_ssthresh = max(req->rsk_rcv_wnd, rcv_wnd * adj_mss);
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);
@@ -502,7 +506,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
newtp->rx_opt.sack_ok = ireq->sack_ok;
newtp->window_clamp = req->rsk_window_clamp;
- newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_ssthresh = ireq->rcv_ssthresh;
newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 18c913a2347a..0f2d4174ea59 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3642,6 +3642,7 @@ static void tcp_connect_init(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
u32 rcv_wnd;
+ u32 mss;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3679,8 +3680,10 @@ static void tcp_connect_init(struct sock *sk)
if (rcv_wnd == 0)
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+ mss = tp->advmss - (tp->rx_opt.ts_recent_stamp ?
+ tp->tcp_header_len - sizeof(struct tcphdr) : 0);
tcp_select_initial_window(sk, tcp_full_space(sk),
- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+ mss,
&tp->rcv_wnd,
&tp->window_clamp,
sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
@@ -3688,7 +3691,10 @@ static void tcp_connect_init(struct sock *sk)
rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
- tp->rcv_ssthresh = tp->rcv_wnd;
+ if (rcv_wnd)
+ tp->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * mss);
+ else
+ tp->rcv_ssthresh = tp->rcv_wnd;
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
--
2.25.1
next prev parent reply other threads:[~2022-07-21 15:11 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-07-21 15:10 [PATCH net-next 0/2] RTAX_INITRWND should be able to bring the rcv_ssthresh above 64KiB Marek Majkowski
2022-07-21 15:10 ` Marek Majkowski [this message]
2022-07-22 9:23 ` [PATCH net-next 1/2] " Eric Dumazet
2022-07-27 11:19 ` Marek Majkowski
2022-07-27 12:54 ` Eric Dumazet
2022-07-21 15:10 ` [PATCH net-next 2/2] Tests for RTAX_INITRWND Marek Majkowski
2022-07-22 1:54 ` [PATCH net-next 0/2] RTAX_INITRWND should be able to bring the rcv_ssthresh above 64KiB Jakub Kicinski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220721151041.1215017-2-marek@cloudflare.com \
--to=marek@cloudflare.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=ivan@cloudflare.com \
--cc=kernel-team@cloudflare.com \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.