From: Florian Fainelli <f.fainelli@gmail.com>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, gregkh@linuxfoundation.org,
stable@vger.kernel.org, edumazet@google.com,
sthemmin@microsoft.com, Soheil Hassas Yeganeh <soheil@google.com>,
Wei Wang <weiwan@google.com>,
Willem de Bruijn <willemb@google.com>
Subject: [PATCH stable 4.9 25/29] net: sk_buff rbnode reorg
Date: Tue, 9 Oct 2018 15:49:20 -0700 [thread overview]
Message-ID: <20181009224924.30151-26-f.fainelli@gmail.com> (raw)
In-Reply-To: <20181009224924.30151-1-f.fainelli@gmail.com>
From: Eric Dumazet <edumazet@google.com>
commit bffa72cf7f9df842f0016ba03586039296b4caaf upstream
skb->rbnode shares space with skb->next, skb->prev and skb->tstamp
Current uses (TCP receive ofo queue and netem) need to save/restore
tstamp, while skb->dev is either NULL (TCP) or a constant for a given
queue (netem).
Since we plan using an RB tree for TCP retransmit queue to speedup SACK
processing with large BDP, this patch exchanges skb->dev and
skb->tstamp.
This saves some overhead in both TCP and netem.
v2: removes the swtstamp field from struct tcp_skb_cb
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Wei Wang <weiwan@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/skbuff.h | 18 ++-
include/net/inet_frag.h | 3 +-
net/ipv4/inet_fragment.c | 16 ++-
net/ipv4/ip_fragment.c | 182 +++++++++++++-----------
net/ipv6/netfilter/nf_conntrack_reasm.c | 1 +
net/ipv6/reassembly.c | 1 +
6 files changed, 128 insertions(+), 93 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7e7e12aeaf82..d966cc688750 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -639,19 +639,27 @@ struct sk_buff {
struct sk_buff *prev;
union {
- ktime_t tstamp;
- struct skb_mstamp skb_mstamp;
+ struct net_device *dev;
+ /* Some protocols might use this space to store information,
+ * while device pointer would be NULL.
+ * UDP receive path is one user.
+ */
+ unsigned long dev_scratch;
};
};
- struct rb_node rbnode; /* used in netem & tcp stack */
+ struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
+ struct list_head list;
};
union {
+ struct sock *sk;
int ip_defrag_offset;
};
- struct sock *sk;
- struct net_device *dev;
+ union {
+ ktime_t tstamp;
+ struct skb_mstamp skb_mstamp;
+ };
/*
* This is the control buffer. It is free to use for every
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index f47678d2ccc2..1ff0433d94a7 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -74,7 +74,8 @@ struct inet_frag_queue {
struct timer_list timer;
spinlock_t lock;
atomic_t refcnt;
- struct sk_buff *fragments;
+ struct sk_buff *fragments; /* Used in IPv6. */
+ struct rb_root rb_fragments; /* Used in IPv4. */
struct sk_buff *fragments_tail;
ktime_t stamp;
int len;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 47c240f50b99..535fa57af51e 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -136,12 +136,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
fp = q->fragments;
nf = q->net;
f = nf->f;
- while (fp) {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
+ if (fp) {
+ do {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
+ fp = xp;
+ } while (fp);
+ } else {
+ sum_truesize = skb_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8bfb34e9ea32..11d3dc649ef0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -134,7 +134,7 @@ static bool frag_expire_skip_icmp(u32 user)
static void ip_expire(unsigned long arg)
{
const struct iphdr *iph;
- struct sk_buff *head;
+ struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
int err;
@@ -150,14 +150,31 @@ static void ip_expire(unsigned long arg)
ipq_kill(qp);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
- head = qp->q.fragments;
-
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+ if (!qp->q.flags & INET_FRAG_FIRST_IN)
goto out;
+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
+ * pull the head out of the tree in order to be able to
+ * deal with head->dev.
+ */
+ if (qp->q.fragments) {
+ head = qp->q.fragments;
+ qp->q.fragments = head->next;
+ } else {
+ head = skb_rb_first(&qp->q.rb_fragments);
+ if (!head)
+ goto out;
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+ }
+ if (head == qp->q.fragments_tail)
+ qp->q.fragments_tail = NULL;
+
+ sub_frag_mem_limit(qp->q.net, head->truesize);
+
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out;
@@ -177,16 +194,16 @@ static void ip_expire(unsigned long arg)
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
- skb_get(head);
spin_unlock(&qp->q.lock);
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- kfree_skb(head);
goto out_rcu_unlock;
out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
+ if (head)
+ kfree_skb(head);
ipq_put(qp);
}
@@ -229,7 +246,7 @@ static int ip_frag_too_far(struct ipq *qp)
end = atomic_inc_return(&peer->rid);
qp->rid = end;
- rc = qp->q.fragments && (end - start) > max;
+ rc = qp->q.fragments_tail && (end - start) > max;
if (rc) {
struct net *net;
@@ -243,7 +260,6 @@ static int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
- struct sk_buff *fp;
unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -251,20 +267,14 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT;
}
- fp = qp->q.fragments;
- do {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
- } while (fp);
+ sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->iif = 0;
qp->ecn = 0;
@@ -276,7 +286,8 @@ static int ip_frag_reinit(struct ipq *qp)
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct sk_buff *prev, *next;
+ struct rb_node **rbn, *parent;
+ struct sk_buff *skb1;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
@@ -339,58 +350,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (err)
goto err;
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = qp->q.fragments_tail;
- if (!prev || prev->ip_defrag_offset < offset) {
- next = NULL;
- goto found;
- }
- prev = NULL;
- for (next = qp->q.fragments; next != NULL; next = next->next) {
- if (next->ip_defrag_offset >= offset)
- break; /* bingo! */
- prev = next;
- }
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
+ /* Makes sure compiler wont do silly aliasing games */
+ barrier();
-found:
/* RFC5722, Section 4, amended by Errata ID : 3089
* When reassembling an IPv6 datagram, if
* one or more its constituent fragments is determined to be an
* overlapping fragment, the entire datagram (and any constituent
* fragments) MUST be silently discarded.
*
- * We do the same here for IPv4.
+ * We do the same here for IPv4 (and increment an snmp counter).
*/
- /* Is there an overlap with the previous fragment? */
- if (prev &&
- (prev->ip_defrag_offset + prev->len) > offset)
- goto discard_qp;
-
- /* Is there an overlap with the next fragment? */
- if (next && next->ip_defrag_offset < end)
- goto discard_qp;
+ /* Find out where to put this fragment. */
+ skb1 = qp->q.fragments_tail;
+ if (!skb1) {
+ /* This is the first fragment we've received. */
+ rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+ qp->q.fragments_tail = skb;
+ } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+ /* This is the common/special case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+ if (offset < (skb1->ip_defrag_offset + skb1->len))
+ goto discard_qp;
+ /* Insert after skb1. */
+ rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+ qp->q.fragments_tail = skb;
+ } else {
+ /* Binary search. Note that skb can become the first fragment, but
+ * not the last (covered above). */
+ rbn = &qp->q.rb_fragments.rb_node;
+ do {
+ parent = *rbn;
+ skb1 = rb_to_skb(parent);
+ if (end <= skb1->ip_defrag_offset)
+ rbn = &parent->rb_left;
+ else if (offset >= skb1->ip_defrag_offset + skb1->len)
+ rbn = &parent->rb_right;
+ else /* Found an overlap with skb1. */
+ goto discard_qp;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb. */
+ rb_link_node(&skb->rbnode, parent, rbn);
+ }
+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
- /* Note : skb->ip_defrag_offset and skb->dev share the same location */
- dev = skb->dev;
if (dev)
qp->iif = dev->ifindex;
- /* Makes sure compiler wont do silly aliasing games */
- barrier();
skb->ip_defrag_offset = offset;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (!next)
- qp->q.fragments_tail = skb;
- if (prev)
- prev->next = skb;
- else
- qp->q.fragments = skb;
-
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
@@ -412,7 +423,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, prev, dev);
+ err = ip_frag_reasm(qp, skb, dev);
skb->_skb_refdst = orefdst;
return err;
}
@@ -429,15 +440,15 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
return err;
}
-
/* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
- struct sk_buff *fp, *head = qp->q.fragments;
+ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+ struct sk_buff **nextp; /* To build frag_list. */
+ struct rb_node *rbn;
int len;
int ihlen;
int err;
@@ -451,25 +462,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_fail;
}
/* Make the one we just received the head. */
- if (prev) {
- head = prev->next;
- fp = skb_clone(head, GFP_ATOMIC);
+ if (head != skb) {
+ fp = skb_clone(skb, GFP_ATOMIC);
if (!fp)
goto out_nomem;
-
- fp->next = head->next;
- if (!fp->next)
+ rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+ if (qp->q.fragments_tail == skb)
qp->q.fragments_tail = fp;
- prev->next = fp;
-
- skb_morph(head, qp->q.fragments);
- head->next = qp->q.fragments->next;
-
- consume_skb(qp->q.fragments);
- qp->q.fragments = head;
+ skb_morph(skb, head);
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &qp->q.rb_fragments);
+ consume_skb(head);
+ head = skb;
}
- WARN_ON(!head);
WARN_ON(head->ip_defrag_offset != 0);
/* Allocate a new buffer for the datagram. */
@@ -494,24 +500,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
goto out_nomem;
- clone->next = head->next;
- head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
+ skb->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
+ skb_shinfo(head)->frag_list = clone;
+ nextp = &clone->next;
+ } else {
+ nextp = &skb_shinfo(head)->frag_list;
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
+ /* Traverse the tree in order, to build frag_list. */
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ while (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &qp->q.rb_fragments);
+ rbn = rbnext;
+ *nextp = fp;
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
@@ -522,7 +539,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
}
sub_frag_mem_limit(qp->q.net, head->truesize);
+ *nextp = NULL;
head->next = NULL;
+ head->prev = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -550,6 +569,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
return 0;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b81541701346..907c2d5753dd 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -470,6 +470,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
head->csum);
fq->q.fragments = NULL;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return true;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 78656bbe50e7..74ffbcb306a6 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -466,6 +466,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock();
fq->q.fragments = NULL;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return 1;
--
2.17.1
next prev parent reply other threads:[~2018-10-10 6:09 UTC|newest]
Thread overview: 34+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-10-09 22:48 [PATCH stable 4.9 00/29] backport of IP fragmentation fixes Florian Fainelli
2018-10-09 22:48 ` [PATCH stable 4.9 01/29] inet: frags: change inet_frags_init_net() return value Florian Fainelli
2018-10-09 22:48 ` [PATCH stable 4.9 02/29] inet: frags: add a pointer to struct netns_frags Florian Fainelli
2018-10-09 22:48 ` [PATCH stable 4.9 03/29] inet: frags: refactor ipfrag_init() Florian Fainelli
2018-10-09 22:48 ` [PATCH stable 4.9 04/29] inet: frags: refactor ipv6_frag_init() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 05/29] inet: frags: refactor lowpan_net_frag_init() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 06/29] ipv6: export ip6 fragments sysctl to unprivileged users Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 07/29] rhashtable: add schedule points Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 08/29] inet: frags: use rhashtables for reassembly units Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 09/29] inet: frags: remove some helpers Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 10/29] inet: frags: get rif of inet_frag_evicting() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 11/29] inet: frags: remove inet_frag_maybe_warn_overflow() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 12/29] inet: frags: break the 2GB limit for frags storage Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 13/29] inet: frags: do not clone skb in ip_expire() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 14/29] ipv6: frags: rewrite ip6_expire_frag_queue() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 15/29] rhashtable: reorganize struct rhashtable layout Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 16/29] inet: frags: reorganize struct netns_frags Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 17/29] inet: frags: get rid of ipfrag_skb_cb/FRAG_CB Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 18/29] inet: frags: fix ip6frag_low_thresh boundary Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 19/29] ip: discard IPv4 datagrams with overlapping segments Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 20/29] net: speed up skb_rbtree_purge() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 21/29] net: modify skb_rbtree_purge to return the truesize of all purged skbs Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 22/29] ipv6: defrag: drop non-last frags smaller than min mtu Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 23/29] net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 24/29] net: add rb_to_skb() and other rb tree helpers Florian Fainelli
2018-10-09 22:49 ` Florian Fainelli [this message]
2018-10-09 22:49 ` [PATCH stable 4.9 26/29] ipv4: frags: precedence bug in ip_expire() Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 27/29] ip: add helpers to process in-order fragments faster Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 28/29] ip: process in-order fragments efficiently Florian Fainelli
2018-10-09 22:49 ` [PATCH stable 4.9 29/29] ip: frags: fix crash in ip_do_fragment() Florian Fainelli
2018-10-10 0:46 ` [PATCH stable 4.9 00/29] backport of IP fragmentation fixes Eric Dumazet
2018-10-10 4:15 ` Florian Fainelli
2018-10-10 23:18 ` Stephen Hemminger
2018-10-10 23:23 ` Florian Fainelli
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20181009224924.30151-26-f.fainelli@gmail.com \
--to=f.fainelli@gmail.com \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=gregkh@linuxfoundation.org \
--cc=netdev@vger.kernel.org \
--cc=soheil@google.com \
--cc=stable@vger.kernel.org \
--cc=sthemmin@microsoft.com \
--cc=weiwan@google.com \
--cc=willemb@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.