From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
stable@vger.kernel.org, Willem de Bruijn <willemb@google.com>,
Peter Oskolkov <posk@google.com>,
Eric Dumazet <edumazet@google.com>,
Florian Westphal <fw@strlen.de>,
"David S. Miller" <davem@davemloft.net>,
Mao Wenan <maowenan@huawei.com>
Subject: [PATCH 4.4 62/65] ip: process in-order fragments efficiently
Date: Mon, 4 Feb 2019 11:36:55 +0100 [thread overview]
Message-ID: <20190204103620.474109506@linuxfoundation.org> (raw)
In-Reply-To: <20190204103610.583715954@linuxfoundation.org>
4.4-stable review patch. If anyone has any objections, please let me know.
------------------
From: Peter Oskolkov <posk@google.com>
commit a4fd284a1f8fd4b6c59aa59db2185b1e17c5c11c upstream.
This patch changes the runtime behavior of the IP defrag queue:
incoming in-order fragments are added to the end of the current
list/"run" of in-order fragments at the tail of the queue.
On some workloads, UDP stream performance is substantially improved:
RX: ./udp_stream -F 10 -T 2 -l 60
TX: ./udp_stream -c -H <host> -F 10 -T 5 -l 60
with this patchset applied on a 10Gbps receiver:
throughput=9524.18
throughput_units=Mbit/s
upstream (net-next):
throughput=4608.93
throughput_units=Mbit/s
Reported-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Peter Oskolkov <posk@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
net/ipv4/inet_fragment.c | 2
net/ipv4/ip_fragment.c | 110 +++++++++++++++++++++++++++++------------------
2 files changed, 70 insertions(+), 42 deletions(-)
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -315,7 +315,7 @@ void inet_frag_destroy(struct inet_frag_
fp = xp;
} while (fp);
} else {
- sum_truesize = skb_rbtree_purge(&q->rb_fragments);
+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -139,8 +139,8 @@ int ip_frag_mem(struct net *net)
return sum_frag_mem_limit(&net->ipv4.frags);
}
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
- struct net_device *dev);
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev);
struct ip4_create_arg {
struct iphdr *iph;
@@ -271,7 +271,12 @@ static void ip_expire(unsigned long arg)
head = skb_rb_first(&qp->q.rb_fragments);
if (!head)
goto out;
- rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ if (FRAG_CB(head)->next_frag)
+ rb_replace_node(&head->rbnode,
+ &FRAG_CB(head)->next_frag->rbnode,
+ &qp->q.rb_fragments);
+ else
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
memset(&head->rbnode, 0, sizeof(head->rbnode));
barrier();
}
@@ -373,7 +378,7 @@ static int ip_frag_reinit(struct ipq *qp
return -ETIMEDOUT;
}
- sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
@@ -382,6 +387,7 @@ static int ip_frag_reinit(struct ipq *qp
qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
+ qp->q.last_run_head = NULL;
qp->iif = 0;
qp->ecn = 0;
@@ -393,7 +399,7 @@ static int ip_frag_queue(struct ipq *qp,
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct rb_node **rbn, *parent;
- struct sk_buff *skb1;
+ struct sk_buff *skb1, *prev_tail;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
@@ -471,38 +477,41 @@ static int ip_frag_queue(struct ipq *qp,
*/
/* Find out where to put this fragment. */
- skb1 = qp->q.fragments_tail;
- if (!skb1) {
- /* This is the first fragment we've received. */
- rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
- qp->q.fragments_tail = skb;
- } else if ((FRAG_CB(skb1)->offset + skb1->len) < end) {
- /* This is the common/special case: skb goes to the end. */
+ prev_tail = qp->q.fragments_tail;
+ if (!prev_tail)
+ ip4_frag_create_run(&qp->q, skb); /* First fragment. */
+ else if (FRAG_CB(prev_tail)->offset + prev_tail->len < end) {
+ /* This is the common case: skb goes to the end. */
/* Detect and discard overlaps. */
- if (offset < (FRAG_CB(skb1)->offset + skb1->len))
+ if (offset < FRAG_CB(prev_tail)->offset + prev_tail->len)
goto discard_qp;
- /* Insert after skb1. */
- rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
- qp->q.fragments_tail = skb;
+ if (offset == FRAG_CB(prev_tail)->offset + prev_tail->len)
+ ip4_frag_append_to_last_run(&qp->q, skb);
+ else
+ ip4_frag_create_run(&qp->q, skb);
} else {
- /* Binary search. Note that skb can become the first fragment, but
- * not the last (covered above). */
+ /* Binary search. Note that skb can become the first fragment,
+ * but not the last (covered above).
+ */
rbn = &qp->q.rb_fragments.rb_node;
do {
parent = *rbn;
skb1 = rb_to_skb(parent);
if (end <= FRAG_CB(skb1)->offset)
rbn = &parent->rb_left;
- else if (offset >= FRAG_CB(skb1)->offset + skb1->len)
+ else if (offset >= FRAG_CB(skb1)->offset +
+ FRAG_CB(skb1)->frag_run_len)
rbn = &parent->rb_right;
else /* Found an overlap with skb1. */
goto discard_qp;
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
- * one of its NULL left/right children. Insert skb. */
+ * one of its NULL left/right children. Insert skb.
+ */
+ ip4_frag_init_run(skb);
rb_link_node(&skb->rbnode, parent, rbn);
+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
}
- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
if (dev) {
qp->iif = dev->ifindex;
@@ -531,7 +540,7 @@ static int ip_frag_queue(struct ipq *qp,
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, skb, dev);
+ err = ip_frag_reasm(qp, skb, prev_tail, dev);
skb->_skb_refdst = orefdst;
return err;
}
@@ -550,7 +559,7 @@ err:
/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
@@ -575,10 +584,16 @@ static int ip_frag_reasm(struct ipq *qp,
if (!fp)
goto out_nomem;
- rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+ if (RB_EMPTY_NODE(&skb->rbnode))
+ FRAG_CB(prev_tail)->next_frag = fp;
+ else
+ rb_replace_node(&skb->rbnode, &fp->rbnode,
+ &qp->q.rb_fragments);
if (qp->q.fragments_tail == skb)
qp->q.fragments_tail = fp;
skb_morph(skb, head);
+ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
rb_replace_node(&head->rbnode, &skb->rbnode,
&qp->q.rb_fragments);
consume_skb(head);
@@ -614,7 +629,7 @@ static int ip_frag_reasm(struct ipq *qp,
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
- skb->truesize += clone->truesize;
+ head->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
@@ -627,24 +642,36 @@ static int ip_frag_reasm(struct ipq *qp,
skb_push(head, head->data - skb_network_header(head));
/* Traverse the tree in order, to build frag_list. */
+ fp = FRAG_CB(head)->next_frag;
rbn = rb_next(&head->rbnode);
rb_erase(&head->rbnode, &qp->q.rb_fragments);
- while (rbn) {
- struct rb_node *rbnext = rb_next(rbn);
- fp = rb_to_skb(rbn);
- rb_erase(rbn, &qp->q.rb_fragments);
- rbn = rbnext;
- *nextp = fp;
- nextp = &fp->next;
- fp->prev = NULL;
- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
+ while (rbn || fp) {
+ /* fp points to the next sk_buff in the current run;
+ * rbn points to the next run.
+ */
+ /* Go through the current run. */
+ while (fp) {
+ *nextp = fp;
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+ head->data_len += fp->len;
+ head->len += fp->len;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+ head->truesize += fp->truesize;
+ fp = FRAG_CB(fp)->next_frag;
+ }
+ /* Move to the next run. */
+ if (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &qp->q.rb_fragments);
+ rbn = rbnext;
+ }
}
sub_frag_mem_limit(qp->q.net, head->truesize);
@@ -680,6 +707,7 @@ static int ip_frag_reasm(struct ipq *qp,
qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
+ qp->q.last_run_head = NULL;
return 0;
out_nomem:
next prev parent reply other threads:[~2019-02-04 10:42 UTC|newest]
Thread overview: 75+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-02-04 10:35 [PATCH 4.4 00/65] 4.4.173-stable review Greg Kroah-Hartman
2019-02-04 10:35 ` [PATCH 4.4 01/65] net: Fix usage of pskb_trim_rcsum Greg Kroah-Hartman
2019-02-04 10:35 ` [PATCH 4.4 02/65] openvswitch: Avoid OOB read when parsing flow nlattrs Greg Kroah-Hartman
2019-02-04 10:35 ` [PATCH 4.4 03/65] net: ipv4: Fix memory leak in network namespace dismantle Greg Kroah-Hartman
2019-02-04 10:35 ` [PATCH 4.4 04/65] net_sched: refetch skb protocol for each filter Greg Kroah-Hartman
2019-02-04 10:35 ` [Bridge] [PATCH 4.4 05/65] net: bridge: Fix ethernet header pointer before check skb forwardable Greg Kroah-Hartman
2019-02-04 10:35 ` Greg Kroah-Hartman
2019-02-04 10:35 ` [PATCH 4.4 06/65] mmc: Kconfig: Enable CONFIG_MMC_SDHCI_IO_ACCESSORS Greg Kroah-Hartman
2019-02-04 11:05 ` Georgi Djakov
2019-02-04 11:13 ` Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 07/65] USB: serial: simple: add Motorola Tetra TPG2200 device id Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 08/65] USB: serial: pl2303: add new PID to support PL2303TB Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 09/65] ASoC: atom: fix a missing check of snd_pcm_lib_malloc_pages Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 10/65] ARC: perf: map generic branches to correct hardware condition Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 11/65] s390/early: improve machine detection Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 12/65] s390/smp: fix CPU hotplug deadlock with CPU rescan Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 13/65] char/mwave: fix potential Spectre v1 vulnerability Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 14/65] staging: rtl8188eu: Add device code for D-Link DWA-121 rev B1 Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 15/65] tty: Handle problem if line discipline does not have receive_buf Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 16/65] tty/n_hdlc: fix __might_sleep warning Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 17/65] CIFS: Fix possible hang during async MTU reads and writes Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 18/65] Input: xpad - add support for SteelSeries Stratus Duo Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 19/65] KVM: x86: Fix single-step debugging Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 20/65] x86/kaslr: Fix incorrect i8254 outb() parameters Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 21/65] can: dev: __can_get_echo_skb(): fix bogous check for non-existing skb by removing it Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 22/65] can: bcm: check timer values before ktime conversion Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 23/65] vt: invoke notifier on screen size change Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 24/65] perf unwind: Unwind with libdw doesnt take symfs into account Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 25/65] perf unwind: Take pgoff into account when reporting elf to libdwfl Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 26/65] irqchip/gic-v3-its: Align PCI Multi-MSI allocation on their size Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 27/65] arm64: mm: remove page_mapping check in __sync_icache_dcache Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 28/65] f2fs: read page index before freeing Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 29/65] Revert "loop: Fix double mutex_unlock(&loop_ctl_mutex) in loop_control_ioctl()" Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 30/65] Revert "loop: Get rid of loop_index_mutex" Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 31/65] Revert "loop: Fold __loop_release into loop_release" Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 32/65] s390/smp: Fix calling smp_call_ipl_cpu() from ipl CPU Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 33/65] fs: add the fsnotify call to vfs_iter_write Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 34/65] ipv6: Consider sk_bound_dev_if when binding a socket to an address Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 35/65] l2tp: copy 4 more bytes to linear part if necessary Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 36/65] net/mlx4_core: Add masking for a few queries on HCA caps Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 37/65] netrom: switch to sock timer API Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 38/65] net/rose: fix NULL ax25_cb kernel panic Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 39/65] ucc_geth: Reset BQL queue when stopping device Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 40/65] l2tp: remove l2specific_len dependency in l2tp_core Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 41/65] l2tp: fix reading optional fields of L2TPv3 Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 42/65] CIFS: Do not count -ENODATA as failure for query directory Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 43/65] fs/dcache: Fix incorrect nr_dentry_unused accounting in shrink_dcache_sb() Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 44/65] ARM: cns3xxx: Fix writing to wrong PCI config registers after alignment Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 45/65] arm64: hyp-stub: Forbid kprobing of the hyp-stub Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 46/65] gfs2: Revert "Fix loop in gfs2_rbm_find" Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 47/65] platform/x86: asus-nb-wmi: Map 0x35 to KEY_SCREENLOCK Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 48/65] platform/x86: asus-nb-wmi: Drop mapping of 0x33 and 0x34 scan codes Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 49/65] mmc: sdhci-iproc: handle mmc_of_parse() errors during probe Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 50/65] kernel/exit.c: release ptraced tasks before zap_pid_ns_processes Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 51/65] mm, oom: fix use-after-free in oom_kill_process Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 52/65] cifs: Always resolve hostname before reconnecting Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 53/65] drivers: core: Remove glue dirs from sysfs earlier Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 54/65] mm: migrate: dont rely on __PageMovable() of newpage after unlocking it Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 55/65] fs: dont scan the inode cache before SB_BORN is set Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 56/65] ip: discard IPv4 datagrams with overlapping segments Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 57/65] net: modify skb_rbtree_purge to return the truesize of all purged skbs Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 58/65] inet: frags: get rif of inet_frag_evicting() Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 59/65] ip: use rb trees for IP frag queue Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 60/65] ipv6: defrag: drop non-last frags smaller than min mtu Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 61/65] ip: add helpers to process in-order fragments faster Greg Kroah-Hartman
2019-02-04 10:36 ` Greg Kroah-Hartman [this message]
2019-02-04 10:36 ` [PATCH 4.4 63/65] net: ipv4: do not handle duplicate fragments as overlapping Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 64/65] ip: frags: fix crash in ip_do_fragment() Greg Kroah-Hartman
2019-02-04 10:36 ` [PATCH 4.4 65/65] ipv4: frags: precedence bug in ip_expire() Greg Kroah-Hartman
2019-02-04 22:48 ` [PATCH 4.4 00/65] 4.4.173-stable review Guenter Roeck
2019-02-05 14:42 ` Greg Kroah-Hartman
2019-02-05 15:12 ` Guenter Roeck
2019-02-05 6:24 ` Naresh Kamboju
2019-02-05 10:17 ` Jon Hunter
2019-02-05 10:17 ` Jon Hunter
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190204103620.474109506@linuxfoundation.org \
--to=gregkh@linuxfoundation.org \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=fw@strlen.de \
--cc=linux-kernel@vger.kernel.org \
--cc=maowenan@huawei.com \
--cc=posk@google.com \
--cc=stable@vger.kernel.org \
--cc=willemb@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.