[RFC net-next PATCH 3/3] net: remove fragmentation LRU list system

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Jesper Dangaard Brouer <brouer@redhat.com>
To: Eric Dumazet <eric.dumazet@gmail.com>,
	"David S. Miller" <davem@davemloft.net>,
	Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>, netdev@vger.kernel.org
Subject: [RFC net-next PATCH 3/3] net: remove fragmentation LRU list system
Date: Thu, 18 Apr 2013 23:39:01 +0200	[thread overview]
Message-ID: <20130418213805.14296.21621.stgit@dragon> (raw)
In-Reply-To: <20130418213637.14296.43143.stgit@dragon>

WORK-IN-PROGRESS

I have a bug in inet_frag_cleanup_netns(), which clean/deletes fragments
when a network namespace is taken down.

TODO:
- netns mem limits and hash cleaning have conflicts
- Howto can we re-add the IPSTATS_MIB_REASMFAILS stats?
- Remove nf->low_thresh
- Doc new max_hash_depth

NOT-signed-off
---

 include/net/inet_frag.h                 |   36 ++++++------
 include/net/ipv6.h                      |    2 -
 net/ipv4/inet_fragment.c                |   96 +++++++++++++++++--------------
 net/ipv4/ip_fragment.c                  |   19 +-----
 net/ipv6/netfilter/nf_conntrack_reasm.c |    5 --
 net/ipv6/reassembly.c                   |    8 ---
 6 files changed, 75 insertions(+), 91 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index b2cd72a..276900b 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -4,9 +4,7 @@
 #include <linux/percpu_counter.h>
 
 struct netns_frags {
-	int			nqueues;
-	struct list_head	lru_list;
-	spinlock_t		lru_lock;
+	struct percpu_counter   nqueues;
 
 	/* The percpu_counter "mem" need to be cacheline aligned.
 	 *  mem.count must not share cacheline with other writers
@@ -133,28 +131,30 @@ static inline int sum_frag_mem_limit(struct netns_frags *nf)
 	return res;
 }
 
-static inline void inet_frag_lru_move(struct inet_frag_queue *q)
+static inline void dec_frag_nqueues(struct inet_frag_queue *q)
 {
-	spin_lock(&q->net->lru_lock);
-	list_move_tail(&q->lru_list, &q->net->lru_list);
-	spin_unlock(&q->net->lru_lock);
+	percpu_counter_dec(&q->net->nqueues);
 }
 
-static inline void inet_frag_lru_del(struct inet_frag_queue *q)
+static inline void inc_frag_nqueues(struct inet_frag_queue *q)
 {
-	spin_lock(&q->net->lru_lock);
-	list_del(&q->lru_list);
-	q->net->nqueues--;
-	spin_unlock(&q->net->lru_lock);
+	percpu_counter_inc(&q->net->nqueues);
 }
 
-static inline void inet_frag_lru_add(struct netns_frags *nf,
-				     struct inet_frag_queue *q)
+static inline void init_frag_nqueues(struct netns_frags *nf)
 {
-	spin_lock(&nf->lru_lock);
-	list_add_tail(&q->lru_list, &nf->lru_list);
-	q->net->nqueues++;
-	spin_unlock(&nf->lru_lock);
+	percpu_counter_init(&nf->nqueues, 0);
+}
+
+static inline int sum_frag_nqueues(struct netns_frags *nf)
+{
+	int res;
+
+	local_bh_disable();
+	res = percpu_counter_sum_positive(&nf->nqueues);
+	local_bh_enable();
+
+	return res;
 }
 
 /* RFC 3168 support :
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0810aa5..c35873c 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -286,7 +286,7 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
 #if IS_ENABLED(CONFIG_IPV6)
 static inline int ip6_frag_nqueues(struct net *net)
 {
-	return net->ipv6.frags.nqueues;
+	return sum_frag_nqueues(&net->ipv6.frags);
 }
 
 static inline int ip6_frag_mem(struct net *net)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index fd9fe5c..8cb46fa 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -105,10 +105,8 @@ EXPORT_SYMBOL(inet_frags_init);
 
 void inet_frags_init_net(struct netns_frags *nf)
 {
-	nf->nqueues = 0;
+	init_frag_nqueues(nf);
 	init_frag_mem_limit(nf);
-	INIT_LIST_HEAD(&nf->lru_list);
-	spin_lock_init(&nf->lru_lock);
 }
 EXPORT_SYMBOL(inet_frags_init_net);
 
@@ -118,18 +116,6 @@ void inet_frags_fini(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
-{
-	nf->low_thresh = 0;
-
-	local_bh_disable();
-	inet_frag_evictor(nf, f, true);
-	local_bh_enable();
-
-	percpu_counter_destroy(&nf->mem);
-}
-EXPORT_SYMBOL(inet_frags_exit_net);
-
 static inline void fq_unlink_hash(struct inet_frag_queue *fq,
 				  struct inet_frags *f)
 {
@@ -160,7 +146,7 @@ void __inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f,
 	if (!(fq->last_in & INET_FRAG_COMPLETE)) {
 		if (unlink_hash)
 			fq_unlink_hash(fq, f);
-		inet_frag_lru_del(fq);
+		dec_frag_nqueues(fq);
 		atomic_dec(&fq->refcnt);
 		fq->last_in |= INET_FRAG_COMPLETE;
 	}
@@ -212,46 +198,60 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
+int inet_frag_cleanup_netns(struct netns_frags *nf, struct inet_frags *f)
 {
+	struct hlist_head kill_list;
 	struct inet_frag_queue *q;
-	int work, evicted = 0;
+	struct hlist_node *n;
+	int i, evicted = 0;
 
-	if (!force) {
-		if (frag_mem_limit(nf) <= nf->high_thresh)
-			return 0;
-	}
+	/* Move inet_frag_queue's from hash table to the kill_list */
+	read_lock(&f->lock);
+	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
+		struct inet_frag_bucket *hb;
 
-	work = frag_mem_limit(nf) - nf->low_thresh;
-	while (work > 0) {
-		spin_lock(&nf->lru_lock);
+		hb = &f->hash[i];
+		spin_lock(&hb->chain_lock);
+		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
 
-		if (list_empty(&nf->lru_list)) {
-			spin_unlock(&nf->lru_lock);
-			break;
+			if (q->net == nf) { /* Only for specific netns */
+				atomic_inc(&q->refcnt);
+				hlist_del(&q->list);
+				hlist_add_head(&q->list, &kill_list);
+			}
 		}
+		spin_unlock(&hb->chain_lock);
+	}
+	read_unlock(&f->lock);
 
-		q = list_first_entry(&nf->lru_list,
-				struct inet_frag_queue, lru_list);
-		atomic_inc(&q->refcnt);
-		/* Remove q from list to avoid several CPUs grabbing it */
-		list_del_init(&q->lru_list);
-
-		spin_unlock(&nf->lru_lock);
-
+	/* Kill off the inet_frag_queue's */
+	hlist_for_each_entry_safe(q, n, &kill_list, list) {
 		spin_lock(&q->lock);
 		if (!(q->last_in & INET_FRAG_COMPLETE))
 			inet_frag_kill(q, f);
 		spin_unlock(&q->lock);
 
 		if (atomic_dec_and_test(&q->refcnt))
-			inet_frag_destroy(q, f, &work);
+			inet_frag_destroy(q, f, NULL);
+
 		evicted++;
 	}
 
 	return evicted;
 }
-EXPORT_SYMBOL(inet_frag_evictor);
+
+void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+{
+	nf->low_thresh = 0;
+
+	local_bh_disable();
+	//BROKEN: inet_frag_cleanup_netns(nf, f);
+	local_bh_enable();
+
+	percpu_counter_destroy(&nf->nqueues);
+	percpu_counter_destroy(&nf->mem);
+}
+EXPORT_SYMBOL(inet_frags_exit_net);
 
 static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		struct inet_frag_queue *qp_in, struct inet_frags *f,
@@ -295,7 +295,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 
 	atomic_inc(&qp->refcnt);
 	hlist_add_head(&qp->list, &hb->chain);
-	inet_frag_lru_add(nf, qp);
+	inc_frag_nqueues(qp);
 	spin_unlock(&hb->chain_lock);
 	read_unlock(&f->lock);
 	return qp;
@@ -356,9 +356,19 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		q_evict = q; /* candidate for eviction */
 	}
 	/* Not found situation */
-	if (depth > f->max_hash_depth) {
-		atomic_inc(&q_evict->refcnt);
-		hlist_del_init(&q_evict->list);
+	if ((depth > f->max_hash_depth)
+	    || (frag_mem_limit(nf) > nf->high_thresh)) {
+
+		/* This is in principal wrong, notice
+		 * frag_mem_limit(nf) is per netns, and we might
+		 * delete q_evict belonging to another netns, which is
+		 * not going to lower the frag_mem_limit(nf) for this
+		 * netns. What to do?
+		 */
+		if (q_evict) {
+			atomic_inc(&q_evict->refcnt);
+			hlist_del_init(&q_evict->list);
+		}
 	} else
 		q_evict = NULL;
 	spin_unlock(&hb->chain_lock);
@@ -374,7 +384,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 			inet_frag_destroy(q_evict, f, NULL);
 
 		LIMIT_NETDEBUG(KERN_WARNING "%s(): Fragment hash bucket"
-			       " list length grew over limit (len %d),"
+			       " grew over lenght or mem limit (len %d),"
 			       " Dropping another fragment.\n",
 			       __func__, depth);
 	}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 4e8489b..193e360 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -18,6 +18,7 @@
  *		John McDonald	:	0 length frag bug.
  *		Alexey Kuznetsov:	SMP races, threading, cleanup.
  *		Patrick McHardy :	LRU queue of frag heads for evictor.
+ *		Jesper D Brouer :	Removed LRU queue for evictor.
  */
 
 #define pr_fmt(fmt) "IPv4: " fmt
@@ -88,7 +89,7 @@ static struct inet_frags ip4_frags;
 
 int ip_frag_nqueues(struct net *net)
 {
-	return net->ipv4.frags.nqueues;
+	return sum_frag_nqueues(&net->ipv4.frags);
 }
 
 int ip_frag_mem(struct net *net)
@@ -176,18 +177,6 @@ static void ipq_kill(struct ipq *ipq)
 	inet_frag_kill(&ipq->q, &ip4_frags);
 }
 
-/* Memory limiting on fragments.  Evictor trashes the oldest
- * fragment queue until we are back under the threshold.
- */
-static void ip_evictor(struct net *net)
-{
-	int evicted;
-
-	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
-	if (evicted)
-		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
-}
-
 /*
  * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
  */
@@ -495,7 +484,6 @@ found:
 	    qp->q.meat == qp->q.len)
 		return ip_frag_reasm(qp, prev, dev);
 
-	inet_frag_lru_move(&qp->q);
 	return -EINPROGRESS;
 
 err:
@@ -645,9 +633,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
 
-	/* Start by cleaning up the memory. */
-	ip_evictor(net);
-
 	/* Lookup (or create) queue header */
 	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
 		int ret;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3713764..c865d3c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -338,7 +338,6 @@ found:
 		fq->q.last_in |= INET_FRAG_FIRST_IN;
 	}
 
-	inet_frag_lru_move(&fq->q);
 	return 0;
 
 discard_fq:
@@ -583,10 +582,6 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
 	hdr = ipv6_hdr(clone);
 	fhdr = (struct frag_hdr *)skb_transport_header(clone);
 
-	local_bh_disable();
-	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
-	local_bh_enable();
-
 	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
 		     ip6_frag_ecn(hdr));
 	if (fq == NULL) {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 45de5eb..ab1c843 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -25,6 +25,7 @@
  *	David Stevens and
  *	YOSHIFUJI,H. @USAGI	Always remove fragment header to
  *				calculate ICV correctly.
+ *	Jesper Dangaard Brouer	Removed LRU queue for evictor.
  */
 
 #define pr_fmt(fmt) "IPv6: " fmt
@@ -343,7 +344,6 @@ found:
 	    fq->q.meat == fq->q.len)
 		return ip6_frag_reasm(fq, prev, dev);
 
-	inet_frag_lru_move(&fq->q);
 	return -1;
 
 discard_fq:
@@ -512,7 +512,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	struct frag_queue *fq;
 	const struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct net *net = dev_net(skb_dst(skb)->dev);
-	int evicted;
 
 	IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS);
 
@@ -537,11 +536,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		return 1;
 	}
 
-	evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false);
-	if (evicted)
-		IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
-				 IPSTATS_MIB_REASMFAILS, evicted);
-
 	fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
 		     ip6_frag_ecn(hdr));
 	if (fq != NULL) {

     prev parent reply	other threads:[~2013-04-18 21:36 UTC|newest]

Thread overview: 28+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-04-18 21:37 [net-next PATCH 0/3] net: frag code fixes and RFC for LRU removal Jesper Dangaard Brouer
2013-04-18 21:37 ` [net-next PATCH 1/3] net: fix race bug in fragmentation create code Jesper Dangaard Brouer
2013-04-19  1:00   ` Hannes Frederic Sowa
2013-04-19  8:09     ` Jesper Dangaard Brouer
2013-04-18 21:38 ` [net-next PATCH 2/3] net: fix enforcing of fragment queue hash list depth Jesper Dangaard Brouer
2013-04-19  0:52   ` Hannes Frederic Sowa
2013-04-19 10:11   ` Eric Dumazet
2013-04-19 10:41     ` David Laight
2013-04-19 11:14       ` Eric Dumazet
2013-04-19 12:19     ` Jesper Dangaard Brouer
2013-04-19 12:45       ` Hannes Frederic Sowa
2013-04-19 14:29         ` Jesper Dangaard Brouer
2013-04-19 15:06           ` Hannes Frederic Sowa
2013-04-19 19:44           ` Hannes Frederic Sowa
2013-04-22  9:10             ` Jesper Dangaard Brouer
2013-04-22 14:54               ` Hannes Frederic Sowa
2013-04-22 16:30                 ` Jesper Dangaard Brouer
2013-04-22 17:49                 ` Jesper Dangaard Brouer
2013-04-23  0:20                   ` Hannes Frederic Sowa
2013-04-23 14:19                     ` Jesper Dangaard Brouer
2013-04-23 20:54                       ` Hannes Frederic Sowa
2013-04-19 14:42       ` Eric Dumazet
2013-04-19 14:45       ` Eric Dumazet
2013-04-19 14:45       ` Eric Dumazet
2013-04-19 14:49       ` Eric Dumazet
2013-04-24 13:35         ` Jesper Dangaard Brouer
2013-04-24 15:05           ` Eric Dumazet
2013-04-18 21:39 ` Jesper Dangaard Brouer [this message]

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:b2cd72a dfblob:276900b dfblob:0810aa5 dfblob:c35873c
dfblob:fd9fe5c dfblob:8cb46fa dfblob:4e8489b dfblob:193e360
dfblob:3713764 dfblob:c865d3c dfblob:45de5eb dfblob:ab1c843 )
 OR (
bs:"[RFC net-next PATCH 3/3] net: remove fragmentation LRU list system" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130418213805.14296.21621.stgit@dragon \
    --to=brouer@redhat.com \
    --cc=davem@davemloft.net \
    --cc=eric.dumazet@gmail.com \
    --cc=hannes@stressinduktion.org \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.