From: Kuniyuki Iwashima <kuniyu@amazon.com>
To: <edumazet@google.com>
Cc: <davem@davemloft.net>, <eric.dumazet@gmail.com>,
<kuba@kernel.org>, <ncardwell@google.com>,
<netdev@vger.kernel.org>, <pabeni@redhat.com>,
<soheil@google.com>, <kuniyu@amazon.com>
Subject: Re: [PATCH net-next] tcp: refine tcp_prune_ofo_queue() logic
Date: Tue, 1 Nov 2022 09:54:12 -0700 [thread overview]
Message-ID: <20221101165412.25234-1-kuniyu@amazon.com> (raw)
In-Reply-To: <20221101035234.3910189-1-edumazet@google.com>
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 1 Nov 2022 03:52:34 +0000
> After commits 36a6503fedda ("tcp: refine tcp_prune_ofo_queue()
> to not drop all packets") and 72cd43ba64fc1
> ("tcp: free batches of packets in tcp_prune_ofo_queue()")
> tcp_prune_ofo_queue() drops a fraction of ooo queue,
> to make room for incoming packet.
>
> However it makes no sense to drop packets that are
> before the incoming packet, in sequence space.
>
> In order to recover from packet losses faster,
> it makes more sense to only drop ooo packets
> which are after the incoming packet.
>
> Tested:
> packetdrill test:
> 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
> +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
> +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [3800], 4) = 0
> +0 bind(3, ..., ...) = 0
> +0 listen(3, 1) = 0
>
> +0 < S 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
> +0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 0>
> +.1 < . 1:1(0) ack 1 win 1024
> +0 accept(3, ..., ...) = 4
>
> +.01 < . 200:300(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 200:300>
>
> +.01 < . 400:500(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 400:500 200:300>
>
> +.01 < . 600:700(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 600:700 400:500 200:300>
>
> +.01 < . 800:900(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 800:900 600:700 400:500 200:300>
>
> +.01 < . 1000:1100(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 1000:1100 800:900 600:700 400:500>
>
> +.01 < . 1200:1300(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
>
> // this packet is dropped because we have no room left.
> +.01 < . 1400:1500(100) ack 1 win 1024
>
> +.01 < . 1:200(199) ack 1 win 1024
> // Make sure kernel did not drop 200:300 sequence
> +0 > . 1:1(0) ack 300 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
> // Make room, since our RCVBUF is very small
> +0 read(4, ..., 299) = 299
>
> +.01 < . 300:400(100) ack 1 win 1024
> +0 > . 1:1(0) ack 500 <nop,nop, sack 1200:1300 1000:1100 800:900 600:700>
>
> +.01 < . 500:600(100) ack 1 win 1024
> +0 > . 1:1(0) ack 700 <nop,nop, sack 1200:1300 1000:1100 800:900>
>
> +0 read(4, ..., 400) = 400
>
> +.01 < . 700:800(100) ack 1 win 1024
> +0 > . 1:1(0) ack 900 <nop,nop, sack 1200:1300 1000:1100>
>
> +.01 < . 900:1000(100) ack 1 win 1024
> +0 > . 1:1(0) ack 1100 <nop,nop, sack 1200:1300>
>
> +.01 < . 1100:1200(100) ack 1 win 1024
> // This checks that 1200:1300 has not been removed from ooo queue
> +0 > . 1:1(0) ack 1300
>
> Suggested-by: Jakub Kicinski <kuba@kernel.org>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Sounds good!
Thank you.
> ---
> net/ipv4/tcp_input.c | 51 +++++++++++++++++++++++++++-----------------
> 1 file changed, 31 insertions(+), 20 deletions(-)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 0640453fce54b6daae0861d948f3db075830daf6..d764b5854dfcc865207b5eb749c29013ef18bdbc 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4764,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk)
> }
> }
>
> -static bool tcp_prune_ofo_queue(struct sock *sk);
> -static int tcp_prune_queue(struct sock *sk);
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
>
> static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
> unsigned int size)
> @@ -4773,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
> if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
> !sk_rmem_schedule(sk, skb, size)) {
>
> - if (tcp_prune_queue(sk) < 0)
> + if (tcp_prune_queue(sk, skb) < 0)
> return -1;
>
> while (!sk_rmem_schedule(sk, skb, size)) {
> - if (!tcp_prune_ofo_queue(sk))
> + if (!tcp_prune_ofo_queue(sk, skb))
> return -1;
> }
> }
> @@ -5329,6 +5329,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
> * Clean the out-of-order queue to make room.
> * We drop high sequences packets to :
> * 1) Let a chance for holes to be filled.
> + * This means we do not drop packets from ooo queue if their sequence
> + * is before incoming packet sequence.
> * 2) not add too big latencies if thousands of packets sit there.
> * (But if application shrinks SO_RCVBUF, we could still end up
> * freeing whole queue here)
> @@ -5336,24 +5338,31 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
> *
> * Return true if queue has shrunk.
> */
> -static bool tcp_prune_ofo_queue(struct sock *sk)
> +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
> {
> struct tcp_sock *tp = tcp_sk(sk);
> struct rb_node *node, *prev;
> + bool pruned = false;
> int goal;
>
> if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
> return false;
>
> - NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> goal = sk->sk_rcvbuf >> 3;
> node = &tp->ooo_last_skb->rbnode;
> +
> do {
> + struct sk_buff *skb = rb_to_skb(node);
> +
> + /* If incoming skb would land last in ofo queue, stop pruning. */
> + if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
> + break;
> + pruned = true;
> prev = rb_prev(node);
> rb_erase(node, &tp->out_of_order_queue);
> - goal -= rb_to_skb(node)->truesize;
> - tcp_drop_reason(sk, rb_to_skb(node),
> - SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> + goal -= skb->truesize;
> + tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
> + tp->ooo_last_skb = rb_to_skb(prev);
> if (!prev || goal <= 0) {
> if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
> !tcp_under_memory_pressure(sk))
> @@ -5362,16 +5371,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
> }
> node = prev;
> } while (node);
> - tp->ooo_last_skb = rb_to_skb(prev);
>
> - /* Reset SACK state. A conforming SACK implementation will
> - * do the same at a timeout based retransmit. When a connection
> - * is in a sad state like this, we care only about integrity
> - * of the connection not performance.
> - */
> - if (tp->rx_opt.sack_ok)
> - tcp_sack_reset(&tp->rx_opt);
> - return true;
> + if (pruned) {
> + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
> + /* Reset SACK state. A conforming SACK implementation will
> + * do the same at a timeout based retransmit. When a connection
> + * is in a sad state like this, we care only about integrity
> + * of the connection not performance.
> + */
> + if (tp->rx_opt.sack_ok)
> + tcp_sack_reset(&tp->rx_opt);
> + }
> + return pruned;
> }
>
> /* Reduce allocated memory if we can, trying to get
> @@ -5381,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
> * until the socket owning process reads some of the data
> * to stabilize the situation.
> */
> -static int tcp_prune_queue(struct sock *sk)
> +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
> {
> struct tcp_sock *tp = tcp_sk(sk);
>
> @@ -5408,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk)
> /* Collapsing did not help, destructive actions follow.
> * This must not ever occur. */
>
> - tcp_prune_ofo_queue(sk);
> + tcp_prune_ofo_queue(sk, in_skb);
>
> if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
> return 0;
> --
> 2.38.1.273.g43a17bfeac-goog
next prev parent reply other threads:[~2022-11-01 16:54 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-11-01 3:52 [PATCH net-next] tcp: refine tcp_prune_ofo_queue() logic Eric Dumazet
2022-11-01 4:18 ` Soheil Hassas Yeganeh
2022-11-01 16:54 ` Kuniyuki Iwashima [this message]
2022-11-02 4:30 ` patchwork-bot+netdevbpf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20221101165412.25234-1-kuniyu@amazon.com \
--to=kuniyu@amazon.com \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=eric.dumazet@gmail.com \
--cc=kuba@kernel.org \
--cc=ncardwell@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=soheil@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).