From: Haiyang Zhang <haiyangz@microsoft.com>
To: linux-hyperv@vger.kernel.org, netdev@vger.kernel.org
Cc: haiyangz@microsoft.com, kys@microsoft.com, davem@davemloft.net,
edumazet@google.com, kuba@kernel.org, pabeni@redhat.com,
corbet@lwn.net, dsahern@kernel.org, ncardwell@google.com,
ycheng@google.com, kuniyu@amazon.com, morleyd@google.com,
mfreemon@cloudflare.com, mubashirq@google.com,
linux-doc@vger.kernel.org, weiwan@google.com,
linux-kernel@vger.kernel.org
Subject: [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl
Date: Tue, 10 Oct 2023 12:23:30 -0700 [thread overview]
Message-ID: <1696965810-8315-1-git-send-email-haiyangz@microsoft.com> (raw)
TCP pingpong threshold is 1 by default. But some applications, like SQL DB
may prefer a higher pingpong threshold to activate delayed acks in quick
ack mode for better performance.
The pingpong threshold and related code were changed to 3 in the year
2019 in:
commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
And reverted to 1 in the year 2022 in:
commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
There is no single value that fits all applications.
Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
optimal performance based on the application needs.
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
v2: Make it per-namesapce setting, and other updates suggested by Neal Cardwell,
and Kuniyuki Iwashima.
---
Documentation/networking/ip-sysctl.rst | 8 ++++++++
include/net/inet_connection_sock.h | 16 ++++++++++++----
include/net/netns/ipv4.h | 1 +
net/ipv4/sysctl_net_ipv4.c | 8 ++++++++
net/ipv4/tcp_ipv4.c | 2 ++
net/ipv4/tcp_output.c | 4 ++--
6 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 5bfa1837968c..c0308b65dc2f 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1183,6 +1183,14 @@ tcp_plb_cong_thresh - INTEGER
Default: 128
+tcp_pingpong_thresh - INTEGER
+ TCP pingpong threshold is 1 by default, but some application may need a
+ higher threshold for optimal performance.
+
+ Possible Values: 1 - 255
+
+ Default: 1
+
UDP variables
=============
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5d2fcc137b88..0182f27bce40 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -325,11 +325,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
-#define TCP_PINGPONG_THRESH 1
-
static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
{
- inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH;
+ inet_csk(sk)->icsk_ack.pingpong =
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
}
static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
@@ -339,7 +338,16 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
- return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
+ return inet_csk(sk)->icsk_ack.pingpong >=
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
+}
+
+static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ack.pingpong < U8_MAX)
+ icsk->icsk_ack.pingpong++;
}
static inline bool inet_csk_has_ulp(const struct sock *sk)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d96d05b08819..9f1b3eb9473e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -191,6 +191,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_plb_rehash_rounds;
u8 sysctl_tcp_plb_suspend_rto_sec;
int sysctl_tcp_plb_cong_thresh;
+ u8 sysctl_tcp_pingpong_thresh;
int sysctl_udp_wmem_min;
int sysctl_udp_rmem_min;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e7f024d93572..f63a545a7374 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1498,6 +1498,14 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+ {
+ .procname = "tcp_pingpong_thresh",
+ .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
{ }
};
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a441740616d7..f603ad9307af 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3288,6 +3288,8 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
net->ipv4.sysctl_tcp_shrink_window = 0;
+ net->ipv4.sysctl_tcp_pingpong_thresh = 1;
+
return 0;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8885552dff8e..5736a736b59c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -170,10 +170,10 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
tp->lsndtime = now;
/* If it is a reply for ato after last received
- * packet, enter pingpong mode.
+ * packet, increase pingpong count.
*/
if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- inet_csk_enter_pingpong_mode(sk);
+ inet_csk_inc_pingpong_cnt(sk);
}
/* Account for an ACK we sent. */
--
2.25.1
next reply other threads:[~2023-10-10 19:24 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-10-10 19:23 Haiyang Zhang [this message]
2023-10-10 20:06 ` [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl Neal Cardwell
2023-10-10 21:05 ` Haiyang Zhang
2023-10-10 20:11 ` Kuniyuki Iwashima
2023-10-10 21:09 ` Haiyang Zhang
2023-10-10 22:14 ` Stephen Hemminger
2023-10-10 22:27 ` Yuchung Cheng
2023-10-10 22:59 ` Haiyang Zhang
2023-10-11 2:15 ` Stephen Hemminger
2023-10-11 18:49 ` Haiyang Zhang
2023-10-11 18:57 ` Eric Dumazet
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1696965810-8315-1-git-send-email-haiyangz@microsoft.com \
--to=haiyangz@microsoft.com \
--cc=corbet@lwn.net \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=kuniyu@amazon.com \
--cc=kys@microsoft.com \
--cc=linux-doc@vger.kernel.org \
--cc=linux-hyperv@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mfreemon@cloudflare.com \
--cc=morleyd@google.com \
--cc=mubashirq@google.com \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=weiwan@google.com \
--cc=ycheng@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).