From: Marek Majkowski <marek@cloudflare.com>
To: netdev@vger.kernel.org
Cc: bpf@vger.kernel.org, kernel-team@cloudflare.com,
ivan@cloudflare.com, edumazet@google.com, davem@davemloft.net,
kuba@kernel.org, pabeni@redhat.com, ast@kernel.org,
daniel@iogearbox.net, andrii@kernel.org, brakmo@fb.com,
Marek Majkowski <marek@cloudflare.com>
Subject: [PATCH net-next v2 1/2] RTAX_INITRWND should be able to set the rcv_ssthresh above 64KiB
Date: Fri, 29 Jul 2022 16:39:34 +0200 [thread overview]
Message-ID: <20220729143935.2432743-2-marek@cloudflare.com> (raw)
In-Reply-To: <20220729143935.2432743-1-marek@cloudflare.com>
There are three places where we initialize sockets:
- tcp_output:tcp_connect_init
- tcp_minisocks:tcp_openreq_init_rwin
- syncookies
In the first two we already have calls to `tcp_rwnd_init_bpf` and
`dst_metric(RTAX_INITRWND)`, which retrieve the bpf/path initrwnd
attribute. We use this value to bring `rcv_ssthresh` up, potentially
above the traditional 64KiB.
With a higher initial `rcv_ssthresh` the receiver will open the receive
window more aggressively, which can improve large-BDP flows, i.e. flows
with high throughput and high latency.
This patch does not cover the syncookies case.
Signed-off-by: Marek Majkowski <marek@cloudflare.com>
---
include/linux/tcp.h | 1 +
net/ipv4/tcp_minisocks.c | 9 +++++++--
net/ipv4/tcp_output.c | 7 +++++--
3 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a9fbe22732c3..c7a8c71536f8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -164,6 +164,7 @@ struct tcp_request_sock {
* FastOpen it's the seq#
* after data-in-SYN.
*/
+ u32 rcv_ssthresh;
u8 syn_tos;
};
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb95d88497ae..8e5a3bd9a55b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -355,11 +355,13 @@ void tcp_openreq_init_rwin(struct request_sock *req,
const struct dst_entry *dst)
{
struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk_listener);
int full_space = tcp_full_space(sk_listener);
u32 window_clamp;
__u8 rcv_wscale;
u32 rcv_wnd;
+ int adj_mss;
int mss;
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -377,16 +379,19 @@ void tcp_openreq_init_rwin(struct request_sock *req,
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
else if (full_space < rcv_wnd * mss)
full_space = rcv_wnd * mss;
+ adj_mss = mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(sk_listener, full_space,
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ adj_mss,
&req->rsk_rcv_wnd,
&req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
rcv_wnd);
ireq->rcv_wscale = rcv_wscale;
+ treq->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * adj_mss);
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);
@@ -502,7 +507,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
newtp->rx_opt.sack_ok = ireq->sack_ok;
newtp->window_clamp = req->rsk_window_clamp;
- newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_ssthresh = treq->rcv_ssthresh;
newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 78b654ff421b..56f22d5da3a7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3649,6 +3649,7 @@ static void tcp_connect_init(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
u32 rcv_wnd;
+ u32 adj_mss;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3686,8 +3687,10 @@ static void tcp_connect_init(struct sock *sk)
if (rcv_wnd == 0)
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+ adj_mss = tp->advmss - (tp->rx_opt.ts_recent_stamp ?
+ tp->tcp_header_len - sizeof(struct tcphdr) : 0);
tcp_select_initial_window(sk, tcp_full_space(sk),
- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+ adj_mss,
&tp->rcv_wnd,
&tp->window_clamp,
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
@@ -3695,7 +3698,7 @@ static void tcp_connect_init(struct sock *sk)
rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
- tp->rcv_ssthresh = tp->rcv_wnd;
+ tp->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * adj_mss);
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
--
2.25.1
next prev parent reply other threads:[~2022-07-29 14:40 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-07-29 14:39 [PATCH net-next v2 0/2] RTAX_INITRWND should be able to bring the rcv_ssthresh above 64KiB Marek Majkowski
2022-07-29 14:39 ` Marek Majkowski [this message]
2022-08-11 2:36 ` b75edfb063: divide_error:#[##] kernel test robot
2022-07-29 14:39 ` [PATCH net-next v2 2/2] Tests for RTAX_INITRWND Marek Majkowski
2022-07-29 15:12 ` [PATCH net-next v2 0/2] RTAX_INITRWND should be able to bring the rcv_ssthresh above 64KiB Marek Majkowski
2022-08-01 22:24 ` Jakub Kicinski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220729143935.2432743-2-marek@cloudflare.com \
--to=marek@cloudflare.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=brakmo@fb.com \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=ivan@cloudflare.com \
--cc=kernel-team@cloudflare.com \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).