All of lore.kernel.org
 help / color / mirror / Atom feed
From: Florian Fainelli <f.fainelli@gmail.com>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, gregkh@linuxfoundation.org,
	stable@vger.kernel.org, edumazet@google.com,
	sthemmin@microsoft.com, Peter Oskolkov <posk@google.com>,
	Florian Westphal <fw@strlen.de>
Subject: [PATCH stable 4.9 28/29] ip: process in-order fragments efficiently
Date: Tue,  9 Oct 2018 15:49:23 -0700	[thread overview]
Message-ID: <20181009224924.30151-29-f.fainelli@gmail.com> (raw)
In-Reply-To: <20181009224924.30151-1-f.fainelli@gmail.com>

From: Peter Oskolkov <posk@google.com>

This patch changes the runtime behavior of IP defrag queue:
incoming in-order fragments are added to the end of the current
list/"run" of in-order fragments at the tail.

On some workloads, UDP stream performance is substantially improved:

RX: ./udp_stream -F 10 -T 2 -l 60
TX: ./udp_stream -c -H <host> -F 10 -T 5 -l 60

with this patchset applied on a 10Gbps receiver:

  throughput=9524.18
  throughput_units=Mbit/s

upstream (net-next):

  throughput=4608.93
  throughput_units=Mbit/s

Reported-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c)
---
 net/ipv4/inet_fragment.c |   2 +-
 net/ipv4/ip_fragment.c   | 110 ++++++++++++++++++++++++---------------
 2 files changed, 70 insertions(+), 42 deletions(-)

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 535fa57af51e..8323d33c0ce2 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -145,7 +145,7 @@ void inet_frag_destroy(struct inet_frag_queue *q)
 			fp = xp;
 		} while (fp);
 	} else {
-		sum_truesize = skb_rbtree_purge(&q->rb_fragments);
+		sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
 	}
 	sum = sum_truesize + f->qsize;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e20a5afaf6e5..8f899c13a392 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -125,8 +125,8 @@ static u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
-			 struct net_device *dev);
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+			 struct sk_buff *prev_tail, struct net_device *dev);
 
 
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -217,7 +217,12 @@ static void ip_expire(unsigned long arg)
 		head = skb_rb_first(&qp->q.rb_fragments);
 		if (!head)
 			goto out;
-		rb_erase(&head->rbnode, &qp->q.rb_fragments);
+		if (FRAG_CB(head)->next_frag)
+			rb_replace_node(&head->rbnode,
+					&FRAG_CB(head)->next_frag->rbnode,
+					&qp->q.rb_fragments);
+		else
+			rb_erase(&head->rbnode, &qp->q.rb_fragments);
 		memset(&head->rbnode, 0, sizeof(head->rbnode));
 		barrier();
 	}
@@ -318,7 +323,7 @@ static int ip_frag_reinit(struct ipq *qp)
 		return -ETIMEDOUT;
 	}
 
-	sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
 	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
@@ -327,6 +332,7 @@ static int ip_frag_reinit(struct ipq *qp)
 	qp->q.fragments = NULL;
 	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
+	qp->q.last_run_head = NULL;
 	qp->iif = 0;
 	qp->ecn = 0;
 
@@ -338,7 +344,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct rb_node **rbn, *parent;
-	struct sk_buff *skb1;
+	struct sk_buff *skb1, *prev_tail;
 	struct net_device *dev;
 	unsigned int fragsize;
 	int flags, offset;
@@ -416,38 +422,41 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	 */
 
 	/* Find out where to put this fragment.  */
-	skb1 = qp->q.fragments_tail;
-	if (!skb1) {
-		/* This is the first fragment we've received. */
-		rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
-		qp->q.fragments_tail = skb;
-	} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
-		/* This is the common/special case: skb goes to the end. */
+	prev_tail = qp->q.fragments_tail;
+	if (!prev_tail)
+		ip4_frag_create_run(&qp->q, skb);  /* First fragment. */
+	else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
+		/* This is the common case: skb goes to the end. */
 		/* Detect and discard overlaps. */
-		if (offset < (skb1->ip_defrag_offset + skb1->len))
+		if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
 			goto discard_qp;
-		/* Insert after skb1. */
-		rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
-		qp->q.fragments_tail = skb;
+		if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
+			ip4_frag_append_to_last_run(&qp->q, skb);
+		else
+			ip4_frag_create_run(&qp->q, skb);
 	} else {
-		/* Binary search. Note that skb can become the first fragment, but
-		 * not the last (covered above). */
+		/* Binary search. Note that skb can become the first fragment,
+		 * but not the last (covered above).
+		 */
 		rbn = &qp->q.rb_fragments.rb_node;
 		do {
 			parent = *rbn;
 			skb1 = rb_to_skb(parent);
 			if (end <= skb1->ip_defrag_offset)
 				rbn = &parent->rb_left;
-			else if (offset >= skb1->ip_defrag_offset + skb1->len)
+			else if (offset >= skb1->ip_defrag_offset +
+						FRAG_CB(skb1)->frag_run_len)
 				rbn = &parent->rb_right;
 			else /* Found an overlap with skb1. */
 				goto discard_qp;
 		} while (*rbn);
 		/* Here we have parent properly set, and rbn pointing to
-		 * one of its NULL left/right children. Insert skb. */
+		 * one of its NULL left/right children. Insert skb.
+		 */
+		ip4_frag_init_run(skb);
 		rb_link_node(&skb->rbnode, parent, rbn);
+		rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 	}
-	rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 
 	if (dev)
 		qp->iif = dev->ifindex;
@@ -474,7 +483,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip_frag_reasm(qp, skb, dev);
+		err = ip_frag_reasm(qp, skb, prev_tail, dev);
 		skb->_skb_refdst = orefdst;
 		return err;
 	}
@@ -493,7 +502,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 
 /* Build a new IP datagram from all its fragments. */
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-			 struct net_device *dev)
+			 struct sk_buff *prev_tail, struct net_device *dev)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct iphdr *iph;
@@ -517,10 +526,16 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 		fp = skb_clone(skb, GFP_ATOMIC);
 		if (!fp)
 			goto out_nomem;
-		rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+		FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+		if (RB_EMPTY_NODE(&skb->rbnode))
+			FRAG_CB(prev_tail)->next_frag = fp;
+		else
+			rb_replace_node(&skb->rbnode, &fp->rbnode,
+					&qp->q.rb_fragments);
 		if (qp->q.fragments_tail == skb)
 			qp->q.fragments_tail = fp;
 		skb_morph(skb, head);
+		FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
 		rb_replace_node(&head->rbnode, &skb->rbnode,
 				&qp->q.rb_fragments);
 		consume_skb(head);
@@ -556,7 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
 			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
 		clone->len = clone->data_len = head->data_len - plen;
-		skb->truesize += clone->truesize;
+		head->truesize += clone->truesize;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		add_frag_mem_limit(qp->q.net, clone->truesize);
@@ -569,24 +584,36 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 	skb_push(head, head->data - skb_network_header(head));
 
 	/* Traverse the tree in order, to build frag_list. */
+	fp = FRAG_CB(head)->next_frag;
 	rbn = rb_next(&head->rbnode);
 	rb_erase(&head->rbnode, &qp->q.rb_fragments);
-	while (rbn) {
-		struct rb_node *rbnext = rb_next(rbn);
-		fp = rb_to_skb(rbn);
-		rb_erase(rbn, &qp->q.rb_fragments);
-		rbn = rbnext;
-		*nextp = fp;
-		nextp = &fp->next;
-		fp->prev = NULL;
-		memset(&fp->rbnode, 0, sizeof(fp->rbnode));
-		head->data_len += fp->len;
-		head->len += fp->len;
-		if (head->ip_summed != fp->ip_summed)
-			head->ip_summed = CHECKSUM_NONE;
-		else if (head->ip_summed == CHECKSUM_COMPLETE)
-			head->csum = csum_add(head->csum, fp->csum);
-		head->truesize += fp->truesize;
+	while (rbn || fp) {
+		/* fp points to the next sk_buff in the current run;
+		 * rbn points to the next run.
+		 */
+		/* Go through the current run. */
+		while (fp) {
+			*nextp = fp;
+			nextp = &fp->next;
+			fp->prev = NULL;
+			memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+			head->data_len += fp->len;
+			head->len += fp->len;
+			if (head->ip_summed != fp->ip_summed)
+				head->ip_summed = CHECKSUM_NONE;
+			else if (head->ip_summed == CHECKSUM_COMPLETE)
+				head->csum = csum_add(head->csum, fp->csum);
+			head->truesize += fp->truesize;
+			fp = FRAG_CB(fp)->next_frag;
+		}
+		/* Move to the next run. */
+		if (rbn) {
+			struct rb_node *rbnext = rb_next(rbn);
+
+			fp = rb_to_skb(rbn);
+			rb_erase(rbn, &qp->q.rb_fragments);
+			rbn = rbnext;
+		}
 	}
 	sub_frag_mem_limit(qp->q.net, head->truesize);
 
@@ -622,6 +649,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 	qp->q.fragments = NULL;
 	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
+	qp->q.last_run_head = NULL;
 	return 0;
 
 out_nomem:
-- 
2.17.1

  parent reply	other threads:[~2018-10-10  6:09 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-10-09 22:48 [PATCH stable 4.9 00/29] backport of IP fragmentation fixes Florian Fainelli
2018-10-09 22:48 ` [PATCH stable 4.9 01/29] inet: frags: change inet_frags_init_net() return value Florian Fainelli
2018-10-09 22:48 ` [PATCH stable 4.9 02/29] inet: frags: add a pointer to struct netns_frags Florian Fainelli
2018-10-09 22:48 ` [PATCH stable 4.9 03/29] inet: frags: refactor ipfrag_init() Florian Fainelli
2018-10-09 22:48 ` [PATCH stable 4.9 04/29] inet: frags: refactor ipv6_frag_init() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 05/29] inet: frags: refactor lowpan_net_frag_init() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 06/29] ipv6: export ip6 fragments sysctl to unprivileged users Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 07/29] rhashtable: add schedule points Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 08/29] inet: frags: use rhashtables for reassembly units Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 09/29] inet: frags: remove some helpers Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 10/29] inet: frags: get rif of inet_frag_evicting() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 11/29] inet: frags: remove inet_frag_maybe_warn_overflow() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 12/29] inet: frags: break the 2GB limit for frags storage Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 13/29] inet: frags: do not clone skb in ip_expire() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 14/29] ipv6: frags: rewrite ip6_expire_frag_queue() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 15/29] rhashtable: reorganize struct rhashtable layout Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 16/29] inet: frags: reorganize struct netns_frags Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 17/29] inet: frags: get rid of ipfrag_skb_cb/FRAG_CB Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 18/29] inet: frags: fix ip6frag_low_thresh boundary Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 19/29] ip: discard IPv4 datagrams with overlapping segments Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 20/29] net: speed up skb_rbtree_purge() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 21/29] net: modify skb_rbtree_purge to return the truesize of all purged skbs Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 22/29] ipv6: defrag: drop non-last frags smaller than min mtu Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 23/29] net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 24/29] net: add rb_to_skb() and other rb tree helpers Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 25/29] net: sk_buff rbnode reorg Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 26/29] ipv4: frags: precedence bug in ip_expire() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 27/29] ip: add helpers to process in-order fragments faster Florian Fainelli
2018-10-09 22:49 ` Florian Fainelli [this message]
2018-10-09 22:49 ` [PATCH stable 4.9 29/29] ip: frags: fix crash in ip_do_fragment() Florian Fainelli
2018-10-10  0:46 ` [PATCH stable 4.9 00/29] backport of IP fragmentation fixes Eric Dumazet
2018-10-10  4:15   ` Florian Fainelli
2018-10-10 23:18     ` Stephen Hemminger
2018-10-10 23:23       ` Florian Fainelli

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181009224924.30151-29-f.fainelli@gmail.com \
    --to=f.fainelli@gmail.com \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=fw@strlen.de \
    --cc=gregkh@linuxfoundation.org \
    --cc=netdev@vger.kernel.org \
    --cc=posk@google.com \
    --cc=stable@vger.kernel.org \
    --cc=sthemmin@microsoft.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.