Re: [PATCH net-next] tcp: refine tcp_prune_ofo_queue() logic

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Kuniyuki Iwashima <kuniyu@amazon.com>
To: <edumazet@google.com>
Cc: <davem@davemloft.net>, <eric.dumazet@gmail.com>,
	<kuba@kernel.org>, <ncardwell@google.com>,
	<netdev@vger.kernel.org>, <pabeni@redhat.com>,
	<soheil@google.com>, <kuniyu@amazon.com>
Subject: Re: [PATCH net-next] tcp: refine tcp_prune_ofo_queue() logic
Date: Tue, 1 Nov 2022 09:54:12 -0700	[thread overview]
Message-ID: <20221101165412.25234-1-kuniyu@amazon.com> (raw)
In-Reply-To: <20221101035234.3910189-1-edumazet@google.com>

From:   Eric Dumazet <edumazet@google.com>
Date:   Tue,  1 Nov 2022 03:52:34 +0000
> After commits 36a6503fedda ("tcp: refine tcp_prune_ofo_queue()
> to not drop all packets") and 72cd43ba64fc1
> ("tcp: free batches of packets in tcp_prune_ofo_queue()")
> tcp_prune_ofo_queue() drops a fraction of ooo queue,
> to make room for incoming packet.
> 
> However it makes no sense to drop packets that are
> before the incoming packet, in sequence space.
> 
> In order to recover from packet losses faster,
> it makes more sense to only drop ooo packets
> which are after the incoming packet.
> 
> Tested:
> packetdrill test:
>    0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
>    +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
>    +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [3800], 4) = 0
>    +0 bind(3, ..., ...) = 0
>    +0 listen(3, 1) = 0
> 
>    +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
>    +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 0>
>   +.1 < . 1:1(0) ack 1 win 1024
>    +0 accept(3, ..., ...) = 4
> 
>  +.01 < . 200:300(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 200:300>
> 
>  +.01 < . 400:500(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 400:500 200:300>
> 
>  +.01 < . 600:700(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 600:700 400:500 200:300>
> 
>  +.01 < . 800:900(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 800:900 600:700 400:500 200:300>
> 
>  +.01 < . 1000:1100(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 1000:1100 800:900 600:700 400:500>
> 
>  +.01 < . 1200:1300(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> 
> // this packet is dropped because we have no room left.
>  +.01 < . 1400:1500(100) ack 1 win 1024
> 
>  +.01 < . 1:200(199) ack 1 win 1024
> // Make sure kernel did not drop 200:300 sequence
>    +0 > . 1:1(0) ack 300 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> // Make room, since our RCVBUF is very small
>    +0 read(4, ..., 299) = 299
> 
>  +.01 < . 300:400(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 500 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> 
>  +.01 < . 500:600(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 700 <nop,nop, sack 1200:1300 1000:1100 800:900>
> 
>    +0 read(4, ..., 400) = 400
> 
>  +.01 < . 700:800(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 900 <nop,nop, sack 1200:1300 1000:1100>
> 
>  +.01 < . 900:1000(100) ack 1 win 1024
>    +0 > . 1:1(0) ack 1100 <nop,nop, sack 1200:1300>
> 
>  +.01 < . 1100:1200(100) ack 1 win 1024
> // This checks that 1200:1300 has not been removed from ooo queue
>    +0 > . 1:1(0) ack 1300
> 
> Suggested-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Acked-by: Kuniyuki Iwashima <kuniyu@amazon.com>

Sounds good!

Thank you.


> ---
>  net/ipv4/tcp_input.c | 51 +++++++++++++++++++++++++++-----------------
>  1 file changed, 31 insertions(+), 20 deletions(-)
> 
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 0640453fce54b6daae0861d948f3db075830daf6..d764b5854dfcc865207b5eb749c29013ef18bdbc 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4764,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk)
>  	}
>  }
>  
> -static bool tcp_prune_ofo_queue(struct sock *sk);
> -static int tcp_prune_queue(struct sock *sk);
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
>  
>  static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
>  				 unsigned int size)
> @@ -4773,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
>  	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
>  	    !sk_rmem_schedule(sk, skb, size)) {
>  
> -		if (tcp_prune_queue(sk) < 0)
> +		if (tcp_prune_queue(sk, skb) < 0)
>  			return -1;
>  
>  		while (!sk_rmem_schedule(sk, skb, size)) {
> -			if (!tcp_prune_ofo_queue(sk))
> +			if (!tcp_prune_ofo_queue(sk, skb))
>  				return -1;
>  		}
>  	}
> @@ -5329,6 +5329,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>   * Clean the out-of-order queue to make room.
>   * We drop high sequences packets to :
>   * 1) Let a chance for holes to be filled.
> + *    This means we do not drop packets from ooo queue if their sequence
> + *    is before incoming packet sequence.
>   * 2) not add too big latencies if thousands of packets sit there.
>   *    (But if application shrinks SO_RCVBUF, we could still end up
>   *     freeing whole queue here)
> @@ -5336,24 +5338,31 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
>   *
>   * Return true if queue has shrunk.
>   */
> -static bool tcp_prune_ofo_queue(struct sock *sk)
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
>  {
>  	struct tcp_sock *tp = tcp_sk(sk);
>  	struct rb_node *node, *prev;
> +	bool pruned = false;
>  	int goal;
>  
>  	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
>  		return false;
>  
> -	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
>  	goal = sk->sk_rcvbuf >> 3;
>  	node = &tp->ooo_last_skb->rbnode;
> +
>  	do {
> +		struct sk_buff *skb = rb_to_skb(node);
> +
> +		/* If incoming skb would land last in ofo queue, stop pruning. */
> +		if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
> +			break;
> +		pruned = true;
>  		prev = rb_prev(node);
>  		rb_erase(node, &tp->out_of_order_queue);
> -		goal -= rb_to_skb(node)->truesize;
> -		tcp_drop_reason(sk, rb_to_skb(node),
> -				SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> +		goal -= skb->truesize;
> +		tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> +		tp->ooo_last_skb = rb_to_skb(prev);
>  		if (!prev || goal <= 0) {
>  			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
>  			    !tcp_under_memory_pressure(sk))
> @@ -5362,16 +5371,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
>  		}
>  		node = prev;
>  	} while (node);
> -	tp->ooo_last_skb = rb_to_skb(prev);
>  
> -	/* Reset SACK state.  A conforming SACK implementation will
> -	 * do the same at a timeout based retransmit.  When a connection
> -	 * is in a sad state like this, we care only about integrity
> -	 * of the connection not performance.
> -	 */
> -	if (tp->rx_opt.sack_ok)
> -		tcp_sack_reset(&tp->rx_opt);
> -	return true;
> +	if (pruned) {
> +		NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> +		/* Reset SACK state.  A conforming SACK implementation will
> +		 * do the same at a timeout based retransmit.  When a connection
> +		 * is in a sad state like this, we care only about integrity
> +		 * of the connection not performance.
> +		 */
> +		if (tp->rx_opt.sack_ok)
> +			tcp_sack_reset(&tp->rx_opt);
> +	}
> +	return pruned;
>  }
>  
>  /* Reduce allocated memory if we can, trying to get
> @@ -5381,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
>   * until the socket owning process reads some of the data
>   * to stabilize the situation.
>   */
> -static int tcp_prune_queue(struct sock *sk)
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
>  {
>  	struct tcp_sock *tp = tcp_sk(sk);
>  
> @@ -5408,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk)
>  	/* Collapsing did not help, destructive actions follow.
>  	 * This must not ever occur. */
>  
> -	tcp_prune_ofo_queue(sk);
> +	tcp_prune_ofo_queue(sk, in_skb);
>  
>  	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
>  		return 0;
> -- 
> 2.38.1.273.g43a17bfeac-goog

next prev parent reply	other threads:[~2022-11-01 16:54 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-01  3:52 [PATCH net-next] tcp: refine tcp_prune_ofo_queue() logic Eric Dumazet
2022-11-01  4:18 ` Soheil Hassas Yeganeh
2022-11-01 16:54 ` Kuniyuki Iwashima [this message]
2022-11-02  4:30 ` patchwork-bot+netdevbpf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221101165412.25234-1-kuniyu@amazon.com \
    --to=kuniyu@amazon.com \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=eric.dumazet@gmail.com \
    --cc=kuba@kernel.org \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=soheil@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).