All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Willem de Bruijn <willemb@google.com>,
	Peter Oskolkov <posk@google.com>,
	Eric Dumazet <edumazet@google.com>,
	Florian Westphal <fw@strlen.de>,
	"David S. Miller" <davem@davemloft.net>,
	Mao Wenan <maowenan@huawei.com>,
	Ben Hutchings <ben.hutchings@codethink.co.uk>
Subject: [PATCH 4.4 28/34] ip: process in-order fragments efficiently
Date: Thu,  7 Feb 2019 12:42:10 +0100	[thread overview]
Message-ID: <20190207113026.653344926@linuxfoundation.org> (raw)
In-Reply-To: <20190207113025.552605181@linuxfoundation.org>

4.4-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Peter Oskolkov <posk@google.com>

commit a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c upstream.

This patch changes the runtime behavior of IP defrag queue:
incoming in-order fragments are added to the end of the current
list/"run" of in-order fragments at the tail.

On some workloads, UDP stream performance is substantially improved:

RX: ./udp_stream -F 10 -T 2 -l 60
TX: ./udp_stream -c -H <host> -F 10 -T 5 -l 60

with this patchset applied on a 10Gbps receiver:

  throughput=9524.18
  throughput_units=Mbit/s

upstream (net-next):

  throughput=4608.93
  throughput_units=Mbit/s

Reported-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/ipv4/inet_fragment.c |    2 
 net/ipv4/ip_fragment.c   |  110 +++++++++++++++++++++++++++++------------------
 2 files changed, 70 insertions(+), 42 deletions(-)

--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -153,7 +153,7 @@ void inet_frag_destroy(struct inet_frag_
 			fp = xp;
 		} while (fp);
 	} else {
-		sum_truesize = skb_rbtree_purge(&q->rb_fragments);
+		sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
 	}
 	sum = sum_truesize + f->qsize;
 
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -127,8 +127,8 @@ static u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
-			 struct net_device *dev);
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+			 struct sk_buff *prev_tail, struct net_device *dev);
 
 
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -219,7 +219,12 @@ static void ip_expire(unsigned long arg)
 		head = skb_rb_first(&qp->q.rb_fragments);
 		if (!head)
 			goto out;
-		rb_erase(&head->rbnode, &qp->q.rb_fragments);
+		if (FRAG_CB(head)->next_frag)
+			rb_replace_node(&head->rbnode,
+					&FRAG_CB(head)->next_frag->rbnode,
+					&qp->q.rb_fragments);
+		else
+			rb_erase(&head->rbnode, &qp->q.rb_fragments);
 		memset(&head->rbnode, 0, sizeof(head->rbnode));
 		barrier();
 	}
@@ -320,7 +325,7 @@ static int ip_frag_reinit(struct ipq *qp
 		return -ETIMEDOUT;
 	}
 
-	sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
 	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
@@ -329,6 +334,7 @@ static int ip_frag_reinit(struct ipq *qp
 	qp->q.fragments = NULL;
 	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
+	qp->q.last_run_head = NULL;
 	qp->iif = 0;
 	qp->ecn = 0;
 
@@ -340,7 +346,7 @@ static int ip_frag_queue(struct ipq *qp,
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct rb_node **rbn, *parent;
-	struct sk_buff *skb1;
+	struct sk_buff *skb1, *prev_tail;
 	struct net_device *dev;
 	unsigned int fragsize;
 	int flags, offset;
@@ -418,38 +424,41 @@ static int ip_frag_queue(struct ipq *qp,
 	 */
 
 	/* Find out where to put this fragment.  */
-	skb1 = qp->q.fragments_tail;
-	if (!skb1) {
-		/* This is the first fragment we've received. */
-		rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
-		qp->q.fragments_tail = skb;
-	} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
-		/* This is the common/special case: skb goes to the end. */
+	prev_tail = qp->q.fragments_tail;
+	if (!prev_tail)
+		ip4_frag_create_run(&qp->q, skb);  /* First fragment. */
+	else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
+		/* This is the common case: skb goes to the end. */
 		/* Detect and discard overlaps. */
-		if (offset < (skb1->ip_defrag_offset + skb1->len))
+		if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
 			goto discard_qp;
-		/* Insert after skb1. */
-		rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
-		qp->q.fragments_tail = skb;
+		if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
+			ip4_frag_append_to_last_run(&qp->q, skb);
+		else
+			ip4_frag_create_run(&qp->q, skb);
 	} else {
-		/* Binary search. Note that skb can become the first fragment, but
-		 * not the last (covered above). */
+		/* Binary search. Note that skb can become the first fragment,
+		 * but not the last (covered above).
+		 */
 		rbn = &qp->q.rb_fragments.rb_node;
 		do {
 			parent = *rbn;
 			skb1 = rb_to_skb(parent);
 			if (end <= skb1->ip_defrag_offset)
 				rbn = &parent->rb_left;
-			else if (offset >= skb1->ip_defrag_offset + skb1->len)
+			else if (offset >= skb1->ip_defrag_offset +
+						FRAG_CB(skb1)->frag_run_len)
 				rbn = &parent->rb_right;
 			else /* Found an overlap with skb1. */
 				goto discard_qp;
 		} while (*rbn);
 		/* Here we have parent properly set, and rbn pointing to
-		 * one of its NULL left/right children. Insert skb. */
+		 * one of its NULL left/right children. Insert skb.
+		 */
+		ip4_frag_init_run(skb);
 		rb_link_node(&skb->rbnode, parent, rbn);
+		rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 	}
-	rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 
 	if (dev)
 		qp->iif = dev->ifindex;
@@ -476,7 +485,7 @@ static int ip_frag_queue(struct ipq *qp,
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip_frag_reasm(qp, skb, dev);
+		err = ip_frag_reasm(qp, skb, prev_tail, dev);
 		skb->_skb_refdst = orefdst;
 		return err;
 	}
@@ -495,7 +504,7 @@ err:
 
 /* Build a new IP datagram from all its fragments. */
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-			 struct net_device *dev)
+			 struct sk_buff *prev_tail, struct net_device *dev)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct iphdr *iph;
@@ -519,10 +528,16 @@ static int ip_frag_reasm(struct ipq *qp,
 		fp = skb_clone(skb, GFP_ATOMIC);
 		if (!fp)
 			goto out_nomem;
-		rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+		if (RB_EMPTY_NODE(&skb->rbnode))
+			FRAG_CB(prev_tail)->next_frag = fp;
+		else
+			rb_replace_node(&skb->rbnode, &fp->rbnode,
+					&qp->q.rb_fragments);
 		if (qp->q.fragments_tail == skb)
 			qp->q.fragments_tail = fp;
 		skb_morph(skb, head);
+		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
 		rb_replace_node(&head->rbnode, &skb->rbnode,
 				&qp->q.rb_fragments);
 		consume_skb(head);
@@ -558,7 +573,7 @@ static int ip_frag_reasm(struct ipq *qp,
 		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
 			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
 		clone->len = clone->data_len = head->data_len - plen;
-		skb->truesize += clone->truesize;
+		head->truesize += clone->truesize;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		add_frag_mem_limit(qp->q.net, clone->truesize);
@@ -571,24 +586,36 @@ static int ip_frag_reasm(struct ipq *qp,
 	skb_push(head, head->data - skb_network_header(head));
 
 	/* Traverse the tree in order, to build frag_list. */
+	fp = FRAG_CB(head)->next_frag;
 	rbn = rb_next(&head->rbnode);
 	rb_erase(&head->rbnode, &qp->q.rb_fragments);
-	while (rbn) {
-		struct rb_node *rbnext = rb_next(rbn);
-		fp = rb_to_skb(rbn);
-		rb_erase(rbn, &qp->q.rb_fragments);
-		rbn = rbnext;
-		*nextp = fp;
-		nextp = &fp->next;
-		fp->prev = NULL;
-		memset(&fp->rbnode, 0, sizeof(fp->rbnode));
-		head->data_len += fp->len;
-		head->len += fp->len;
-		if (head->ip_summed != fp->ip_summed)
-			head->ip_summed = CHECKSUM_NONE;
-		else if (head->ip_summed == CHECKSUM_COMPLETE)
-			head->csum = csum_add(head->csum, fp->csum);
-		head->truesize += fp->truesize;
+	while (rbn || fp) {
+		/* fp points to the next sk_buff in the current run;
+		 * rbn points to the next run.
+		 */
+		/* Go through the current run. */
+		while (fp) {
+			*nextp = fp;
+			nextp = &fp->next;
+			fp->prev = NULL;
+			memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+			head->data_len += fp->len;
+			head->len += fp->len;
+			if (head->ip_summed != fp->ip_summed)
+				head->ip_summed = CHECKSUM_NONE;
+			else if (head->ip_summed == CHECKSUM_COMPLETE)
+				head->csum = csum_add(head->csum, fp->csum);
+			head->truesize += fp->truesize;
+			fp = FRAG_CB(fp)->next_frag;
+		}
+		/* Move to the next run. */
+		if (rbn) {
+			struct rb_node *rbnext = rb_next(rbn);
+
+			fp = rb_to_skb(rbn);
+			rb_erase(rbn, &qp->q.rb_fragments);
+			rbn = rbnext;
+		}
 	}
 	sub_frag_mem_limit(qp->q.net, head->truesize);
 
@@ -624,6 +651,7 @@ static int ip_frag_reasm(struct ipq *qp,
 	qp->q.fragments = NULL;
 	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
+	qp->q.last_run_head = NULL;
 	return 0;
 
 out_nomem:



  parent reply	other threads:[~2019-02-07 11:43 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-02-07 11:41 [PATCH 4.4 00/34] 4.4.174-stable review Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 01/34] inet: frags: change inet_frags_init_net() return value Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 02/34] inet: frags: add a pointer to struct netns_frags Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 03/34] inet: frags: refactor ipfrag_init() Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 04/34] inet: frags: refactor ipv6_frag_init() Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 05/34] inet: frags: refactor lowpan_net_frag_init() Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 06/34] rhashtable: add rhashtable_lookup_get_insert_key() Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 07/34] rhashtable: Add rhashtable_lookup() Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 08/34] rhashtable: add schedule points Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 09/34] inet: frags: use rhashtables for reassembly units Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 10/34] net: ieee802154: 6lowpan: fix frag reassembly Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 11/34] ipfrag: really prevent allocation on netns exit Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 12/34] inet: frags: remove some helpers Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 13/34] inet: frags: get rif of inet_frag_evicting() Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 14/34] inet: frags: remove inet_frag_maybe_warn_overflow() Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 15/34] inet: frags: break the 2GB limit for frags storage Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 16/34] inet: frags: do not clone skb in ip_expire() Greg Kroah-Hartman
2019-02-07 11:41 ` [PATCH 4.4 17/34] ipv6: frags: rewrite ip6_expire_frag_queue() Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 18/34] rhashtable: reorganize struct rhashtable layout Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 19/34] inet: frags: reorganize struct netns_frags Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 20/34] inet: frags: get rid of ipfrag_skb_cb/FRAG_CB Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 21/34] inet: frags: fix ip6frag_low_thresh boundary Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 22/34] ip: discard IPv4 datagrams with overlapping segments Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 23/34] net: modify skb_rbtree_purge to return the truesize of all purged skbs Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 24/34] ipv6: defrag: drop non-last frags smaller than min mtu Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 25/34] net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 26/34] ip: use rb trees for IP frag queue Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 27/34] ip: add helpers to process in-order fragments faster Greg Kroah-Hartman
2019-02-07 11:42 ` Greg Kroah-Hartman [this message]
2019-02-07 11:42 ` [PATCH 4.4 29/34] ip: frags: fix crash in ip_do_fragment() Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 30/34] ipv4: frags: precedence bug in ip_expire() Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 31/34] inet: frags: better deal with smp races Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 32/34] net: fix pskb_trim_rcsum_slow() with odd trim offset Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 33/34] net: ipv4: do not handle duplicate fragments as overlapping Greg Kroah-Hartman
2019-02-07 11:42 ` [PATCH 4.4 34/34] rcu: Force boolean subscript for expedited stall warnings Greg Kroah-Hartman
2019-02-07 14:20 ` [PATCH 4.4 00/34] 4.4.174-stable review Guenter Roeck
2019-02-07 14:41   ` Guenter Roeck
2019-02-07 15:46     ` Greg Kroah-Hartman
2019-02-07 18:57       ` Guenter Roeck
2019-02-07 15:47   ` Greg Kroah-Hartman
2019-02-07 19:16     ` Guenter Roeck
2019-02-07 18:18 ` kernelci.org bot
2019-02-08  6:13 ` Naresh Kamboju
2019-02-08  6:46   ` Greg Kroah-Hartman
2019-02-08 10:03 ` Jon Hunter
2019-02-08 10:03   ` Jon Hunter
2019-02-08 10:28   ` Greg Kroah-Hartman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190207113026.653344926@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=ben.hutchings@codethink.co.uk \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=fw@strlen.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=maowenan@huawei.com \
    --cc=posk@google.com \
    --cc=stable@vger.kernel.org \
    --cc=willemb@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.