linux-doc.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Kuniyuki Iwashima <kuniyu@amazon.com>
To: <haiyangz@microsoft.com>
Cc: <corbet@lwn.net>, <davem@davemloft.net>, <dsahern@kernel.org>,
	<edumazet@google.com>, <kuba@kernel.org>, <kuniyu@amazon.com>,
	<kys@microsoft.com>, <linux-doc@vger.kernel.org>,
	<linux-hyperv@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
	<mfreemon@cloudflare.com>, <morleyd@google.com>,
	<mubashirq@google.com>, <ncardwell@google.com>,
	<netdev@vger.kernel.org>, <pabeni@redhat.com>,
	<weiwan@google.com>, <ycheng@google.com>
Subject: Re: [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl
Date: Tue, 10 Oct 2023 13:11:54 -0700	[thread overview]
Message-ID: <20231010201154.31898-1-kuniyu@amazon.com> (raw)
In-Reply-To: <1696965810-8315-1-git-send-email-haiyangz@microsoft.com>

From: Haiyang Zhang <haiyangz@microsoft.com>
Date: Tue, 10 Oct 2023 12:23:30 -0700
> TCP pingpong threshold is 1 by default. But some applications, like SQL DB,
> may prefer a higher pingpong threshold to activate delayed acks in quick
> ack mode for better performance.
> 
> The pingpong threshold and related code were changed to 3 in the year
> 2019 in:
>   commit 4a41f453bedf ("tcp: change pingpong threshold to 3")
> And reverted to 1 in the year 2022 in:
>   commit 4d8f24eeedc5 ("Revert "tcp: change pingpong threshold to 3"")
> 
> There is no single value that fits all applications.
> Add net.ipv4.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
> optimal performance based on the application needs.
> 
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> ---
> v2: Make it a per-namespace setting, and other updates suggested by Neal Cardwell
> and Kuniyuki Iwashima.
> 
> ---
>  Documentation/networking/ip-sysctl.rst |  8 ++++++++
>  include/net/inet_connection_sock.h     | 16 ++++++++++++----
>  include/net/netns/ipv4.h               |  1 +
>  net/ipv4/sysctl_net_ipv4.c             |  8 ++++++++
>  net/ipv4/tcp_ipv4.c                    |  2 ++
>  net/ipv4/tcp_output.c                  |  4 ++--
>  6 files changed, 33 insertions(+), 6 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 5bfa1837968c..c0308b65dc2f 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -1183,6 +1183,14 @@ tcp_plb_cong_thresh - INTEGER
>  
>  	Default: 128
>  
> +tcp_pingpong_thresh - INTEGER
> +	TCP pingpong threshold is 1 by default, but some applications may need a
> +	higher threshold for optimal performance.
> +
> +	Possible Values: 1 - 255
> +
> +	Default: 1
> +
>  UDP variables
>  =============
>  
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index 5d2fcc137b88..0182f27bce40 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -325,11 +325,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
>  
>  struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
>  
> -#define TCP_PINGPONG_THRESH	1
> -
>  static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
>  {
> -	inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH;
> +	inet_csk(sk)->icsk_ack.pingpong =
> +		READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
>  }
>  
>  static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
> @@ -339,7 +338,16 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
>  
>  static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
>  {
> -	return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
> +	return inet_csk(sk)->icsk_ack.pingpong >=
> +	       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pingpong_thresh);
> +}
> +
> +static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
> +{
> +	struct inet_connection_sock *icsk = inet_csk(sk);
> +
> +	if (icsk->icsk_ack.pingpong < U8_MAX)
> +		icsk->icsk_ack.pingpong++;
>  }
>  
>  static inline bool inet_csk_has_ulp(const struct sock *sk)
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index d96d05b08819..9f1b3eb9473e 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -191,6 +191,7 @@ struct netns_ipv4 {
>  	u8 sysctl_tcp_plb_rehash_rounds;
>  	u8 sysctl_tcp_plb_suspend_rto_sec;
>  	int sysctl_tcp_plb_cong_thresh;
> +	u8 sysctl_tcp_pingpong_thresh;
>  
>  	int sysctl_udp_wmem_min;
>  	int sysctl_udp_rmem_min;

Maybe a hole after sysctl_tcp_backlog_ack_defer is a good place
to put a new TCP knob.

After sysctl_tcp_plb_cong_thresh, we can fill a 1-byte hole, but the
cacheline seems cold for TCP.

$ pahole -C netns_ipv4 vmlinux
struct netns_ipv4 {
...
	u8                         sysctl_tcp_backlog_ack_defer; /*   402     1 */

	/* XXX 1 byte hole, try to pack */

	int                        sysctl_tcp_reordering; /*   404     4 */
...
	int                        sysctl_tcp_plb_cong_thresh; /*   572     4 */
	/* --- cacheline 9 boundary (576 bytes) --- */
	int                        sysctl_udp_wmem_min;  /*   576     4 */
	int                        sysctl_udp_rmem_min;  /*   580     4 */
	u8                         sysctl_fib_notify_on_flag_change; /*   584     1 */
	u8                         sysctl_tcp_syn_linear_timeouts; /*   585     1 */
	u8                         sysctl_igmp_llm_reports; /*   586     1 */

	/* XXX 1 byte hole, try to pack */
...


> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index e7f024d93572..f63a545a7374 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -1498,6 +1498,14 @@ static struct ctl_table ipv4_net_table[] = {
>  		.extra1		= SYSCTL_ZERO,
>  		.extra2		= SYSCTL_ONE,
>  	},
> +	{
> +		.procname	= "tcp_pingpong_thresh",
> +		.data		= &init_net.ipv4.sysctl_tcp_pingpong_thresh,
> +		.maxlen		= sizeof(u8),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dou8vec_minmax,
> +		.extra1		= SYSCTL_ONE,
> +	},
>  	{ }
>  };
>  
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index a441740616d7..f603ad9307af 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -3288,6 +3288,8 @@ static int __net_init tcp_sk_init(struct net *net)
>  	net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
>  	net->ipv4.sysctl_tcp_shrink_window = 0;
>  
> +	net->ipv4.sysctl_tcp_pingpong_thresh = 1;
> +
>  	return 0;
>  }
>  
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 8885552dff8e..5736a736b59c 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -170,10 +170,10 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
>  	tp->lsndtime = now;
>  
>  	/* If it is a reply for ato after last received
> -	 * packet, enter pingpong mode.
> +	 * packet, increase pingpong count.
>  	 */
>  	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
> -		inet_csk_enter_pingpong_mode(sk);
> +		inet_csk_inc_pingpong_cnt(sk);
>  }
>  
>  /* Account for an ACK we sent. */
> -- 
> 2.25.1


  parent reply	other threads:[~2023-10-10 20:12 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-10 19:23 [PATCH net-next,v2] tcp: Set pingpong threshold via sysctl Haiyang Zhang
2023-10-10 20:06 ` Neal Cardwell
2023-10-10 21:05   ` Haiyang Zhang
2023-10-10 20:11 ` Kuniyuki Iwashima [this message]
2023-10-10 21:09   ` Haiyang Zhang
2023-10-10 22:14 ` Stephen Hemminger
2023-10-10 22:27   ` Yuchung Cheng
2023-10-10 22:59     ` Haiyang Zhang
2023-10-11  2:15       ` Stephen Hemminger
2023-10-11 18:49         ` Haiyang Zhang
2023-10-11 18:57           ` Eric Dumazet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231010201154.31898-1-kuniyu@amazon.com \
    --to=kuniyu@amazon.com \
    --cc=corbet@lwn.net \
    --cc=davem@davemloft.net \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=haiyangz@microsoft.com \
    --cc=kuba@kernel.org \
    --cc=kys@microsoft.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-hyperv@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mfreemon@cloudflare.com \
    --cc=morleyd@google.com \
    --cc=mubashirq@google.com \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=weiwan@google.com \
    --cc=ycheng@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).