Re: debugging kernel during packet drops

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Re: debugging kernel during packet drops
       [not found]         ` <1269447674.3213.64.camel@edumazet-laptop>
@ 2010-03-25  9:32           ` Eric Dumazet
  2010-03-25 10:35             ` Patrick McHardy
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2010-03-25  9:32 UTC (permalink / raw)
  To: Jorrit Kronjee; +Cc: netfilter-devel, netdev, Patrick McHardy

Le mercredi 24 mars 2010 à 17:22 +0100, Eric Dumazet a écrit :
> Le mercredi 24 mars 2010 à 16:20 +0100, Jorrit Kronjee a écrit :

> > 
> > I hope this helps! Is there anything special I need to do to use RPS?
> > 
> 
> Sure this helps a lot !
> 
> You might try RPS by doing :
> 
> echo f >/sys/class/net/eth3/queues/rx-0/rps_cpus
> 
> (But you'll also need a new xt_hashlimit module to make it more
> scalable, I can work on this this week if necessary)
> 

Here is patch I cooked for xt_hashlimit (on top of net-next-2.6) to make
it use RCU and scale better in your case (allowing several concurrent
cpus once RPS is activated), but also on more general cases.

[PATCH] xt_hashlimit: RCU conversion

xt_hashlimit uses a central lock per hash table and suffers from
contention on some workloads.

After RCU conversion, central lock is only used when a writer wants to
add or delete an entry. For 'readers', updating an existing entry, they
use an individual lock per entry.


Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/netfilter/xt_hashlimit.c |  290 +++++++++++++++++----------------
 1 file changed, 151 insertions(+), 139 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 9e9c489..a4f1b69 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -41,6 +41,54 @@ MODULE_DESCRIPTION("Xtables: per hash-bucket rate-limit match");
 MODULE_ALIAS("ipt_hashlimit");
 MODULE_ALIAS("ip6t_hashlimit");
 
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+   this algorithm.  The `average rate' in jiffies becomes your initial
+   amount of credit `credit' and the most credit you can ever have
+   `credit_cap'.  The `peak rate' becomes the cost of passing the
+   test, `cost'.
+
+   `prev' tracks the last packet hit: you gain one credit per jiffy.
+   If you get credit balance more than this, the extra credit is
+   discarded.  Every time the match passes, you lose `cost' credits;
+   if you don't have that many, the test fails.
+
+   See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+   To get the maximum range, we multiply by this factor (ie. you get N
+   credits per jiffy).  We want to allow a rate as low as 1 per day
+   (slowest userspace tool allows), which means
+   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
+*/
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+/* Precision saver. */
+static inline u_int32_t
+user2credits(u_int32_t user)
+{
+	/* If multiplying would overflow... */
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		/* Divide first. */
+		return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+	return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+}
+
 struct hashlimit_net {
 	struct hlist_head	htables;
 	struct proc_dir_entry	*ipt_hashlimit;
@@ -80,12 +128,14 @@ struct dsthash_ent {
 	struct dsthash_dst dst;
 
 	/* modified structure members in the end */
+	spinlock_t lock;
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
 		u_int32_t credit;
-		u_int32_t credit_cap, cost;
+		u_int32_t cost;
 	} rateinfo;
+	struct rcu_head rcu;
 };
 
 struct xt_hashlimit_htable {
@@ -95,10 +145,12 @@ struct xt_hashlimit_htable {
 	bool rnd_initialized;
 
 	struct hashlimit_cfg1 cfg;	/* config */
+	u_int32_t credit_cap;
 
 	/* used internally */
-	spinlock_t lock;		/* lock for list_head */
 	u_int32_t rnd;			/* random seed for hash */
+
+	spinlock_t hlock;		/* lock for writes to hash[]/count/rnd */
 	unsigned int count;		/* number entries in table */
 	struct timer_list timer;	/* timer for gc */
 
@@ -142,55 +194,27 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
 	u_int32_t hash = hash_dst(ht, dst);
 
 	if (!hlist_empty(&ht->hash[hash])) {
-		hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
-			if (dst_cmp(ent, dst))
+		hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+			if (dst_cmp(ent, dst)) {
+				spin_lock(&ent->lock);
 				return ent;
+			}
 	}
 	return NULL;
 }
 
-/* allocate dsthash_ent, initialize dst, put in htable and lock it */
-static struct dsthash_ent *
-dsthash_alloc_init(struct xt_hashlimit_htable *ht,
-		   const struct dsthash_dst *dst)
-{
-	struct dsthash_ent *ent;
-
-	/* initialize hash with random val at the time we allocate
-	 * the first hashtable entry */
-	if (!ht->rnd_initialized) {
-		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
-		ht->rnd_initialized = true;
-	}
-
-	if (ht->cfg.max && ht->count >= ht->cfg.max) {
-		/* FIXME: do something. question is what.. */
-		if (net_ratelimit())
-			printk(KERN_WARNING
-				"xt_hashlimit: max count of %u reached\n",
-				ht->cfg.max);
-		return NULL;
-	}
 
-	ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
-	if (!ent) {
-		if (net_ratelimit())
-			printk(KERN_ERR
-				"xt_hashlimit: can't allocate dsthash_ent\n");
-		return NULL;
-	}
-	memcpy(&ent->dst, dst, sizeof(ent->dst));
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+	struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
 
-	hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
-	ht->count++;
-	return ent;
+	kmem_cache_free(hashlimit_cachep, ent);
 }
 
-static inline void
-dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
+static void dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 {
-	hlist_del(&ent->node);
-	kmem_cache_free(hashlimit_cachep, ent);
+	hlist_del_rcu(&ent->node);
+	call_rcu_bh(&ent->rcu, dsthash_free_rcu);
 	ht->count--;
 }
 static void htable_gc(unsigned long htlong);
@@ -243,11 +267,12 @@ static int htable_create_v0(struct net *net, struct xt_hashlimit_info *minfo, u_
 	for (i = 0; i < hinfo->cfg.size; i++)
 		INIT_HLIST_HEAD(&hinfo->hash[i]);
 
+	hinfo->credit_cap = user2credits(hinfo->cfg.avg * hinfo->cfg.burst);
 	hinfo->use = 1;
 	hinfo->count = 0;
 	hinfo->family = family;
 	hinfo->rnd_initialized = false;
-	spin_lock_init(&hinfo->lock);
+	spin_lock_init(&hinfo->hlock);
 	hinfo->pde = proc_create_data(minfo->name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
@@ -305,11 +330,12 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	for (i = 0; i < hinfo->cfg.size; i++)
 		INIT_HLIST_HEAD(&hinfo->hash[i]);
 
+	hinfo->credit_cap = user2credits(hinfo->cfg.avg * hinfo->cfg.burst);
 	hinfo->use = 1;
 	hinfo->count = 0;
 	hinfo->family = family;
 	hinfo->rnd_initialized = false;
-	spin_lock_init(&hinfo->lock);
+	spin_lock_init(&hinfo->hlock);
 
 	hinfo->pde = proc_create_data(minfo->name, 0,
 		(family == NFPROTO_IPV4) ?
@@ -349,7 +375,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
 	unsigned int i;
 
 	/* lock hash table and iterate over it */
-	spin_lock_bh(&ht->lock);
+	spin_lock_bh(&ht->hlock);
 	for (i = 0; i < ht->cfg.size; i++) {
 		struct dsthash_ent *dh;
 		struct hlist_node *pos, *n;
@@ -358,7 +384,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
 				dsthash_free(ht, dh);
 		}
 	}
-	spin_unlock_bh(&ht->lock);
+	spin_unlock_bh(&ht->hlock);
 }
 
 /* hash table garbage collector, run by timer */
@@ -417,59 +443,12 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 	mutex_unlock(&hashlimit_mutex);
 }
 
-/* The algorithm used is the Simple Token Bucket Filter (TBF)
- * see net/sched/sch_tbf.c in the linux source tree
- */
-
-/* Rusty: This is my (non-mathematically-inclined) understanding of
-   this algorithm.  The `average rate' in jiffies becomes your initial
-   amount of credit `credit' and the most credit you can ever have
-   `credit_cap'.  The `peak rate' becomes the cost of passing the
-   test, `cost'.
-
-   `prev' tracks the last packet hit: you gain one credit per jiffy.
-   If you get credit balance more than this, the extra credit is
-   discarded.  Every time the match passes, you lose `cost' credits;
-   if you don't have that many, the test fails.
-
-   See Alexey's formal explanation in net/sched/sch_tbf.c.
-
-   To get the maximum range, we multiply by this factor (ie. you get N
-   credits per jiffy).  We want to allow a rate as low as 1 per day
-   (slowest userspace tool allows), which means
-   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
-*/
-#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
-
-/* Repeated shift and or gives us all 1s, final shift and add 1 gives
- * us the power of 2 below the theoretical max, so GCC simply does a
- * shift. */
-#define _POW2_BELOW2(x) ((x)|((x)>>1))
-#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
-#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
-#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
-#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
-#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
-
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
-
-/* Precision saver. */
-static inline u_int32_t
-user2credits(u_int32_t user)
-{
-	/* If multiplying would overflow... */
-	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
-		/* Divide first. */
-		return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
-
-	return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
-}
-
-static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
+static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
+				   struct xt_hashlimit_htable *hinfo)
 {
 	dh->rateinfo.credit += (now - dh->rateinfo.prev) * CREDITS_PER_JIFFY;
-	if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
-		dh->rateinfo.credit = dh->rateinfo.credit_cap;
+	if (dh->rateinfo.credit > hinfo->credit_cap)
+		dh->rateinfo.credit = hinfo->credit_cap;
 	dh->rateinfo.prev = now;
 }
 
@@ -563,9 +542,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 					   &_ports);
 		break;
 	default:
-		_ports[0] = _ports[1] = 0;
-		ports = _ports;
-		break;
+		return 0;
 	}
 	if (!ports)
 		return -1;
@@ -576,6 +553,48 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 	return 0;
 }
 
+/* allocate dsthash_ent, initialize dst, put in htable and lock it */
+static struct dsthash_ent *
+dsthash_alloc_init(struct xt_hashlimit_htable *ht,
+		   const struct dsthash_dst *dst)
+{
+	struct dsthash_ent *ent = NULL;
+
+	spin_lock(&ht->hlock);
+	/* initialize hash with random val at the time we allocate
+	 * the first hashtable entry */
+	if (unlikely(!ht->rnd_initialized)) {
+		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
+		ht->rnd_initialized = true;
+	}
+
+	if (ht->cfg.max && ht->count >= ht->cfg.max) {
+		/* FIXME: do something. question is what.. */
+		if (net_ratelimit())
+			printk(KERN_WARNING
+				"xt_hashlimit: max count of %u reached\n",
+				ht->cfg.max);
+	} else 
+		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+	if (!ent) {
+		if (net_ratelimit())
+			pr_err("xt_hashlimit: can't allocate dsthash_ent\n");
+	} else {
+		memcpy(&ent->dst, dst, sizeof(ent->dst));
+		spin_lock_init(&ent->lock);
+		ent->expires = jiffies + msecs_to_jiffies(ht->cfg.expire);
+		ent->rateinfo.prev = jiffies;
+		ent->rateinfo.credit = ht->credit_cap;
+		ent->rateinfo.cost = user2credits(ht->cfg.avg);
+		spin_lock(&ent->lock);
+
+		hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+		ht->count++;
+	}
+	spin_unlock(&ht->hlock);
+	return ent;
+}
+
 static bool
 hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 {
@@ -588,38 +607,30 @@ hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
+	rcu_read_lock_bh();
 	dh = dsthash_find(hinfo, &dst);
 	if (!dh) {
 		dh = dsthash_alloc_init(hinfo, &dst);
 		if (!dh) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
-
-		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
-		dh->rateinfo.prev = jiffies;
-		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
-						   hinfo->cfg.burst);
-		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
-						       hinfo->cfg.burst);
-		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
 	} else {
 		/* update expiration timeout */
 		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-		rateinfo_recalc(dh, now);
+		rateinfo_recalc(dh, now, hinfo);
 	}
 
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* We're underlimit. */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return true;
 	}
 
-	spin_unlock_bh(&hinfo->lock);
-
-	/* default case: we're overlimit, thus don't match */
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	return false;
 
 hotdrop:
@@ -639,36 +650,31 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
+	rcu_read_lock_bh();
 	dh = dsthash_find(hinfo, &dst);
 	if (dh == NULL) {
 		dh = dsthash_alloc_init(hinfo, &dst);
 		if (dh == NULL) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
 
-		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
-		dh->rateinfo.prev = jiffies;
-		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
-		                      hinfo->cfg.burst);
-		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
-		                          hinfo->cfg.burst);
-		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
 	} else {
 		/* update expiration timeout */
 		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
-		rateinfo_recalc(dh, now);
+		rateinfo_recalc(dh, now, hinfo);
 	}
 
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* below the limit */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
 	}
 
-	spin_unlock_bh(&hinfo->lock);
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
 	return info->cfg.mode & XT_HASHLIMIT_INVERT;
 
@@ -843,12 +849,12 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = {
 
 /* PROC stuff */
 static void *dl_seq_start(struct seq_file *s, loff_t *pos)
-	__acquires(htable->lock)
+	__acquires(htable->hlock)
 {
 	struct xt_hashlimit_htable *htable = s->private;
 	unsigned int *bucket;
 
-	spin_lock_bh(&htable->lock);
+	spin_lock_bh(&htable->hlock);
 	if (*pos >= htable->cfg.size)
 		return NULL;
 
@@ -874,46 +880,52 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
 }
 
 static void dl_seq_stop(struct seq_file *s, void *v)
-	__releases(htable->lock)
+	__releases(htable->hlock)
 {
 	struct xt_hashlimit_htable *htable = s->private;
 	unsigned int *bucket = (unsigned int *)v;
 
 	kfree(bucket);
-	spin_unlock_bh(&htable->lock);
+	spin_unlock_bh(&htable->hlock);
 }
 
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
-				   struct seq_file *s)
+				   struct seq_file *s, struct xt_hashlimit_htable *htable)
 {
+	int res = 0;
+
+	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
-	rateinfo_recalc(ent, jiffies);
+	rateinfo_recalc(ent, jiffies, htable);
 
 	switch (family) {
 	case NFPROTO_IPV4:
-		return seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip.src,
 				 ntohs(ent->dst.src_port),
 				 &ent->dst.ip.dst,
 				 ntohs(ent->dst.dst_port),
-				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
+				 ent->rateinfo.credit, htable->credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	case NFPROTO_IPV6:
-		return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip6.src,
 				 ntohs(ent->dst.src_port),
 				 &ent->dst.ip6.dst,
 				 ntohs(ent->dst.dst_port),
-				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
+				 ent->rateinfo.credit, htable->credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #endif
 	default:
 		BUG();
-		return 0;
 	}
+	spin_unlock(&ent->lock);
+	return res;
 }
 
 static int dl_seq_show(struct seq_file *s, void *v)
@@ -925,7 +937,7 @@ static int dl_seq_show(struct seq_file *s, void *v)
 
 	if (!hlist_empty(&htable->hash[*bucket])) {
 		hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node)
-			if (dl_seq_real_show(ent, htable->family, s))
+			if (dl_seq_real_show(ent, htable->family, s, htable))
 				return -1;
 	}
 	return 0;
@@ -1037,9 +1049,9 @@ err1:
 
 static void __exit hashlimit_mt_exit(void)
 {
-	kmem_cache_destroy(hashlimit_cachep);
 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
 	unregister_pernet_subsys(&hashlimit_net_ops);
+	kmem_cache_destroy(hashlimit_cachep);
 }
 
 module_init(hashlimit_mt_init);


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: debugging kernel during packet drops
  2010-03-25  9:32           ` debugging kernel during packet drops Eric Dumazet
@ 2010-03-25 10:35             ` Patrick McHardy
  2010-03-25 11:02               ` Eric Dumazet
                                 ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Patrick McHardy @ 2010-03-25 10:35 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jorrit Kronjee, netfilter-devel, netdev

Eric Dumazet wrote:
> Here is patch I cooked for xt_hashlimit (on top of net-next-2.6) to make
> it use RCU and scale better in your case (allowing several concurrent
> cpus once RPS is activated), but also on more general cases.
> 
> [PATCH] xt_hashlimit: RCU conversion
> 
> xt_hashlimit uses a central lock per hash table and suffers from
> contention on some workloads.
> 
> After RCU conversion, central lock is only used when a writer wants to
> add or delete an entry. For 'readers', updating an existing entry, they
> use an individual lock per entry.

This clashes with some recent cleanups in nf-next-2.6.git. I'm
also expecting a patch from Jan to remove the old v0 revision
very soon (probably today). Please rediff once I've pushed that out.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: debugging kernel during packet drops
  2010-03-25 10:35             ` Patrick McHardy
@ 2010-03-25 11:02               ` Eric Dumazet
  2010-03-31 12:23                 ` [PATCH nf-next-2.6] xt_hashlimit: RCU conversion Eric Dumazet
  2010-03-25 12:42               ` debugging kernel during packet drops Jan Engelhardt
  2010-03-30 12:06               ` Jan Engelhardt
  2 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2010-03-25 11:02 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Jorrit Kronjee, netfilter-devel, netdev

Le jeudi 25 mars 2010 à 11:35 +0100, Patrick McHardy a écrit :
> Eric Dumazet wrote:
> > Here is patch I cooked for xt_hashlimit (on top of net-next-2.6) to make
> > it use RCU and scale better in your case (allowing several concurrent
> > cpus once RPS is activated), but also on more general cases.
> > 
> > [PATCH] xt_hashlimit: RCU conversion
> > 
> > xt_hashlimit uses a central lock per hash table and suffers from
> > contention on some workloads.
> > 
> > After RCU conversion, central lock is only used when a writer wants to
> > add or delete an entry. For 'readers', updating an existing entry, they
> > use an individual lock per entry.
> 
> This clashes with some recent cleanups in nf-next-2.6.git. I'm
> also expecting a patch from Jan to remove the old v0 revision
> very soon (probably today). Please rediff once I've pushed that out.

Sure, this patch was mainly for Jorrit tests (and he uses net-next-2.6),

I will submit several patches for mainline :)

Thanks


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: debugging kernel during packet drops
  2010-03-25 10:35             ` Patrick McHardy
  2010-03-25 11:02               ` Eric Dumazet
@ 2010-03-25 12:42               ` Jan Engelhardt
  2010-03-30 12:06               ` Jan Engelhardt
  2 siblings, 0 replies; 10+ messages in thread
From: Jan Engelhardt @ 2010-03-25 12:42 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric Dumazet, Jorrit Kronjee, netfilter-devel, netdev


On Thursday 2010-03-25 11:35, Patrick McHardy wrote:
>Eric Dumazet wrote:
>> Here is patch I cooked for xt_hashlimit (on top of net-next-2.6) to make
>> it use RCU and scale better in your case (allowing several concurrent
>> cpus once RPS is activated), but also on more general cases.
>> 
>> [PATCH] xt_hashlimit: RCU conversion
>
>This clashes with some recent cleanups in nf-next-2.6.git. I'm
>also expecting a patch from Jan to remove the old v0 revision
>very soon (probably today).

I was waiting for you to process
http://marc.info/?l=netfilter-devel&m=126936212932683&w=2
which is a prerequisite for the hashlimit v0 removal.
(I expected that you might miss some mails due to trash.net
having been down yesterday. No biggie.)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: debugging kernel during packet drops
  2010-03-25 10:35             ` Patrick McHardy
  2010-03-25 11:02               ` Eric Dumazet
  2010-03-25 12:42               ` debugging kernel during packet drops Jan Engelhardt
@ 2010-03-30 12:06               ` Jan Engelhardt
  2010-03-30 14:12                 ` Patrick McHardy
  2 siblings, 1 reply; 10+ messages in thread
From: Jan Engelhardt @ 2010-03-30 12:06 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Eric Dumazet, Jorrit Kronjee, netfilter-devel, netdev

On Thursday 2010-03-25 11:35, Patrick McHardy wrote:

>Eric Dumazet wrote:
>> Here is patch I cooked for xt_hashlimit (on top of net-next-2.6) to make
>> it use RCU and scale better in your case (allowing several concurrent
>> cpus once RPS is activated), but also on more general cases.
>> 
>> [PATCH] xt_hashlimit: RCU conversion
>> 
>> xt_hashlimit uses a central lock per hash table and suffers from
>> contention on some workloads.
>> 
>> After RCU conversion, central lock is only used when a writer wants to
>> add or delete an entry. For 'readers', updating an existing entry, they
>> use an individual lock per entry.
>
>This clashes with some recent cleanups in nf-next-2.6.git. I'm
>also expecting a patch from Jan to remove the old v0 revision
>very soon (probably today). Please rediff once I've pushed that out.

One 12-series request has been sitting there for a while. Was there 
something not in order with it?


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: debugging kernel during packet drops
  2010-03-30 12:06               ` Jan Engelhardt
@ 2010-03-30 14:12                 ` Patrick McHardy
  0 siblings, 0 replies; 10+ messages in thread
From: Patrick McHardy @ 2010-03-30 14:12 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: Eric Dumazet, Jorrit Kronjee, netfilter-devel, netdev

Jan Engelhardt wrote:
> On Thursday 2010-03-25 11:35, Patrick McHardy wrote:
> 
>> This clashes with some recent cleanups in nf-next-2.6.git. I'm
>> also expecting a patch from Jan to remove the old v0 revision
>> very soon (probably today). Please rediff once I've pushed that out.
> 
> One 12-series request has been sitting there for a while. Was there 
> something not in order with it?

No, I'm currently a bit backlogged due to getting sick. I'm running
your series for testing though, I'll probably push it out tommorrow.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
  2010-03-25 11:02               ` Eric Dumazet
@ 2010-03-31 12:23                 ` Eric Dumazet
  2010-04-01 11:03                   ` Patrick McHardy
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2010-03-31 12:23 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Jorrit Kronjee, netfilter-devel, netdev

xt_hashlimit uses a central lock per hash table and suffers from
contention on some workloads. (Multiqueue NIC or if RPS is enabled)

After RCU conversion, central lock is only used when a writer wants to
add or delete an entry.

For 'readers', updating an existing entry, they use an individual lock
per entry.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/netfilter/xt_hashlimit.c |  115 ++++++++++++++++++---------------
 1 file changed, 66 insertions(+), 49 deletions(-)
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 5470bb0..f245de6 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -81,12 +81,14 @@ struct dsthash_ent {
 	struct dsthash_dst dst;
 
 	/* modified structure members in the end */
+	spinlock_t lock;
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
 		u_int32_t credit;
 		u_int32_t credit_cap, cost;
 	} rateinfo;
+	struct rcu_head rcu;
 };
 
 struct xt_hashlimit_htable {
@@ -143,54 +145,30 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
 	u_int32_t hash = hash_dst(ht, dst);
 
 	if (!hlist_empty(&ht->hash[hash])) {
-		hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
-			if (dst_cmp(ent, dst))
+		hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+			if (dst_cmp(ent, dst)) {
+				spin_lock(&ent->lock);
 				return ent;
+			}
 	}
 	return NULL;
 }
 
-/* allocate dsthash_ent, initialize dst, put in htable and lock it */
-static struct dsthash_ent *
-dsthash_alloc_init(struct xt_hashlimit_htable *ht,
-		   const struct dsthash_dst *dst)
+static void dsthash_free_rcu(struct rcu_head *head)
 {
-	struct dsthash_ent *ent;
+	struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
 
-	/* initialize hash with random val at the time we allocate
-	 * the first hashtable entry */
-	if (!ht->rnd_initialized) {
-		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
-		ht->rnd_initialized = true;
-	}
-
-	if (ht->cfg.max && ht->count >= ht->cfg.max) {
-		/* FIXME: do something. question is what.. */
-		if (net_ratelimit())
-			pr_err("max count of %u reached\n", ht->cfg.max);
-		return NULL;
-	}
-
-	ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
-	if (!ent) {
-		if (net_ratelimit())
-			pr_err("cannot allocate dsthash_ent\n");
-		return NULL;
-	}
-	memcpy(&ent->dst, dst, sizeof(ent->dst));
-
-	hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
-	ht->count++;
-	return ent;
+	kmem_cache_free(hashlimit_cachep, ent);
 }
 
-static inline void
-dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
+static void dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 {
-	hlist_del(&ent->node);
-	kmem_cache_free(hashlimit_cachep, ent);
+	hlist_del_rcu(&ent->node);
+	call_rcu_bh(&ent->rcu, dsthash_free_rcu);
 	ht->count--;
 }
+
+
 static void htable_gc(unsigned long htlong);
 
 static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
@@ -500,6 +478,49 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo,
 	return 0;
 }
 
+/* allocate dsthash_ent, initialize dst, put in htable and lock it */
+static struct dsthash_ent *
+dsthash_alloc_init(struct xt_hashlimit_htable *ht,
+		   const struct dsthash_dst *dst)
+{
+	struct dsthash_ent *ent;
+
+	spin_lock(&ht->lock);
+	/* initialize hash with random val at the time we allocate
+	 * the first hashtable entry */
+	if (unlikely(!ht->rnd_initialized)) {
+		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
+		ht->rnd_initialized = true;
+	}
+
+	if (ht->cfg.max && ht->count >= ht->cfg.max) {
+		/* FIXME: do something. question is what.. */
+		if (net_ratelimit())
+			pr_err("max count of %u reached\n", ht->cfg.max);
+		ent = NULL;
+	} else
+		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+	if (!ent) {
+		if (net_ratelimit())
+			pr_err("cannot allocate dsthash_ent\n");
+	} else {
+		memcpy(&ent->dst, dst, sizeof(ent->dst));
+		spin_lock_init(&ent->lock);
+
+		ent->expires = jiffies + msecs_to_jiffies(ht->cfg.expire);
+		ent->rateinfo.prev = jiffies;
+		ent->rateinfo.credit = user2credits(ht->cfg.avg *
+		                      ht->cfg.burst);
+		ent->rateinfo.credit_cap = user2credits(ht->cfg.avg *
+		                          ht->cfg.burst);
+		ent->rateinfo.cost = user2credits(ht->cfg.avg);
+		spin_lock(&ent->lock);
+		hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+		ht->count++;
+	}
+	spin_unlock(&ht->lock);
+	return ent;
+}
 static bool
 hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
@@ -512,22 +533,14 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
+	rcu_read_lock_bh();
 	dh = dsthash_find(hinfo, &dst);
 	if (dh == NULL) {
 		dh = dsthash_alloc_init(hinfo, &dst);
 		if (dh == NULL) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
-
-		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
-		dh->rateinfo.prev = jiffies;
-		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
-		                      hinfo->cfg.burst);
-		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg *
-		                          hinfo->cfg.burst);
-		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
 	} else {
 		/* update expiration timeout */
 		dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
@@ -537,11 +550,13 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* below the limit */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
 	}
 
-	spin_unlock_bh(&hinfo->lock);
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
 	return info->cfg.mode & XT_HASHLIMIT_INVERT;
 
@@ -817,9 +832,11 @@ err1:
 
 static void __exit hashlimit_mt_exit(void)
 {
-	kmem_cache_destroy(hashlimit_cachep);
 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
 	unregister_pernet_subsys(&hashlimit_net_ops);
+
+	rcu_barrier_bh();
+	kmem_cache_destroy(hashlimit_cachep);
 }
 
 module_init(hashlimit_mt_init);



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
  2010-03-31 12:23                 ` [PATCH nf-next-2.6] xt_hashlimit: RCU conversion Eric Dumazet
@ 2010-04-01 11:03                   ` Patrick McHardy
  2010-04-01 12:10                     ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Patrick McHardy @ 2010-04-01 11:03 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jorrit Kronjee, netfilter-devel, netdev

Eric Dumazet wrote:
> xt_hashlimit uses a central lock per hash table and suffers from
> contention on some workloads. (Multiqueue NIC or if RPS is enabled)
> 
> After RCU conversion, central lock is only used when a writer wants to
> add or delete an entry.
> 
> For 'readers', updating an existing entry, they use an individual lock
> per entry.

Looks good to me, thanks Eric.

> -/* allocate dsthash_ent, initialize dst, put in htable and lock it */
> -static struct dsthash_ent *
> -dsthash_alloc_init(struct xt_hashlimit_htable *ht,
> -		   const struct dsthash_dst *dst)

Is there a reason for moving this function downwards in the file?
That unnecessarily increases the diff and makes the patch harder to
review. For review purposes I moved it back up, resulting in 42
lines less diff.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
  2010-04-01 11:03                   ` Patrick McHardy
@ 2010-04-01 12:10                     ` Eric Dumazet
  2010-04-01 12:36                       ` Patrick McHardy
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2010-04-01 12:10 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Jorrit Kronjee, netfilter-devel, netdev

Le jeudi 01 avril 2010 à 13:03 +0200, Patrick McHardy a écrit :
> Eric Dumazet wrote:
> > xt_hashlimit uses a central lock per hash table and suffers from
> > contention on some workloads. (Multiqueue NIC or if RPS is enabled)
> > 
> > After RCU conversion, central lock is only used when a writer wants to
> > add or delete an entry.
> > 
> > For 'readers', updating an existing entry, they use an individual lock
> > per entry.
> 
> Looks good to me, thanks Eric.
> 
> > -/* allocate dsthash_ent, initialize dst, put in htable and lock it */
> > -static struct dsthash_ent *
> > -dsthash_alloc_init(struct xt_hashlimit_htable *ht,
> > -		   const struct dsthash_dst *dst)
> 
> Is there a reason for moving this function downwards in the file?
> That unnecessarily increases the diff and makes the patch harder to
> review. For review purposes I moved it back up, resulting in 42
> lines less diff.
> --

Well, this is because I had to move inside this function various
initializations and these inits use user2credits() which was defined
after dsthash_alloc_init().

But we can avoid this since we hold the entry spinlock, and before hash
insertion.

Only the lookup fields and the spinlock MUST be committed to memory
before the insert. Other fields can be initialized later by caller.

Here is V2 of patch, I added locking as well in dl_seq_real_show()
because we call rateinfo_recalc(). htable main lock doesnt anymore
protects each entry rateinfo.

Thanks

[PATCH nf-next-2.6] xt_hashlimit: RCU conversion

xt_hashlimit uses a central lock per hash table and suffers from
contention on some workloads. (Multiqueue NIC or if RPS is enabled)

After RCU conversion, central lock is only used when a writer wants to
add or delete an entry.

For 'readers', updating an existing entry, they use an individual lock
per entry.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 5470bb0..453178d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -81,12 +81,14 @@ struct dsthash_ent {
 	struct dsthash_dst dst;
 
 	/* modified structure members in the end */
+	spinlock_t lock;
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
 		u_int32_t credit;
 		u_int32_t credit_cap, cost;
 	} rateinfo;
+	struct rcu_head rcu;
 };
 
 struct xt_hashlimit_htable {
@@ -143,9 +145,11 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
 	u_int32_t hash = hash_dst(ht, dst);
 
 	if (!hlist_empty(&ht->hash[hash])) {
-		hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
-			if (dst_cmp(ent, dst))
+		hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+			if (dst_cmp(ent, dst)) {
+				spin_lock(&ent->lock);
 				return ent;
+			}
 	}
 	return NULL;
 }
@@ -157,9 +161,10 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
 {
 	struct dsthash_ent *ent;
 
+	spin_lock(&ht->lock);
 	/* initialize hash with random val at the time we allocate
 	 * the first hashtable entry */
-	if (!ht->rnd_initialized) {
+	if (unlikely(!ht->rnd_initialized)) {
 		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
 		ht->rnd_initialized = true;
 	}
@@ -168,27 +173,36 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
 		/* FIXME: do something. question is what.. */
 		if (net_ratelimit())
 			pr_err("max count of %u reached\n", ht->cfg.max);
-		return NULL;
-	}
-
-	ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+		ent = NULL;
+	} else
+		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
 	if (!ent) {
 		if (net_ratelimit())
 			pr_err("cannot allocate dsthash_ent\n");
-		return NULL;
-	}
-	memcpy(&ent->dst, dst, sizeof(ent->dst));
+	} else {
+		memcpy(&ent->dst, dst, sizeof(ent->dst));
+		spin_lock_init(&ent->lock);
 
-	hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
-	ht->count++;
+		spin_lock(&ent->lock);
+		hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+		ht->count++;
+	}
+	spin_unlock(&ht->lock);
 	return ent;
 }
 
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+	struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
+
+	kmem_cache_free(hashlimit_cachep, ent);
+}
+
 static inline void
 dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 {
-	hlist_del(&ent->node);
-	kmem_cache_free(hashlimit_cachep, ent);
+	hlist_del_rcu(&ent->node);
+	call_rcu_bh(&ent->rcu, dsthash_free_rcu);
 	ht->count--;
 }
 static void htable_gc(unsigned long htlong);
@@ -512,15 +526,14 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
+	rcu_read_lock_bh();
 	dh = dsthash_find(hinfo, &dst);
 	if (dh == NULL) {
 		dh = dsthash_alloc_init(hinfo, &dst);
 		if (dh == NULL) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
-
 		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
 		dh->rateinfo.prev = jiffies;
 		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
@@ -537,11 +550,13 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* below the limit */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
 	}
 
-	spin_unlock_bh(&hinfo->lock);
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
 	return info->cfg.mode & XT_HASHLIMIT_INVERT;
 
@@ -666,12 +681,15 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				   struct seq_file *s)
 {
+	int res;
+
+	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
 	rateinfo_recalc(ent, jiffies);
 
 	switch (family) {
 	case NFPROTO_IPV4:
-		return seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip.src,
 				 ntohs(ent->dst.src_port),
@@ -679,9 +697,10 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ntohs(ent->dst.dst_port),
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	case NFPROTO_IPV6:
-		return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip6.src,
 				 ntohs(ent->dst.src_port),
@@ -689,11 +708,14 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ntohs(ent->dst.dst_port),
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #endif
 	default:
 		BUG();
-		return 0;
+		res = 0;
 	}
+	spin_unlock(&ent->lock);
+	return res;
 }
 
 static int dl_seq_show(struct seq_file *s, void *v)
@@ -817,9 +839,11 @@ err1:
 
 static void __exit hashlimit_mt_exit(void)
 {
-	kmem_cache_destroy(hashlimit_cachep);
 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
 	unregister_pernet_subsys(&hashlimit_net_ops);
+
+	rcu_barrier_bh();
+	kmem_cache_destroy(hashlimit_cachep);
 }
 
 module_init(hashlimit_mt_init);



^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
  2010-04-01 12:10                     ` Eric Dumazet
@ 2010-04-01 12:36                       ` Patrick McHardy
  0 siblings, 0 replies; 10+ messages in thread
From: Patrick McHardy @ 2010-04-01 12:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jorrit Kronjee, netfilter-devel, netdev

Eric Dumazet wrote:
> Le jeudi 01 avril 2010 à 13:03 +0200, Patrick McHardy a écrit :
>>> -/* allocate dsthash_ent, initialize dst, put in htable and lock it */
>>> -static struct dsthash_ent *
>>> -dsthash_alloc_init(struct xt_hashlimit_htable *ht,
>>> -		   const struct dsthash_dst *dst)
>> Is there a reason for moving this function downwards in the file?
>> That unnecessarily increases the diff and makes the patch harder to
>> review. For review purposes I moved it back up, resulting in 42
>> lines less diff.
>> --
> 
> Well, this is because I had to move inside this function various
> initializations and these inits use user2credits() which was defined
> after dsthash_alloc_init().

I thought I also tried to compile it after my change, but apparently
I made some mistake :)

> But we can avoid this since we hold the entry spinlock, and before hash
> insertion.
> 
> Only the lookup fields and the spinlock MUST be committed to memory
> before the insert. Other fields can be initialized later by caller.
> 
> Here is V2 of patch, I added locking as well in dl_seq_real_show()
> because we call rateinfo_recalc(). htable main lock doesnt anymore
> protects each entry rateinfo.
> 
> Thanks
> 
> [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
> 
> xt_hashlimit uses a central lock per hash table and suffers from
> contention on some workloads. (Multiqueue NIC or if RPS is enabled)
> 
> After RCU conversion, central lock is only used when a writer wants to
> add or delete an entry.
> 
> For 'readers', updating an existing entry, they use an individual lock
> per entry.

Applied, thanks a lot Eric.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2010-04-01 12:36 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <4BA74950.6000305@infopact.nl>
     [not found] ` <4BA7A5D8.5080101@trash.net>
     [not found]   ` <4BA8DAC5.6050002@infopact.nl>
     [not found]     ` <1269364893.2983.296.camel@edumazet-laptop>
     [not found]       ` <4BAA2DC5.7000409@infopact.nl>
     [not found]         ` <1269447674.3213.64.camel@edumazet-laptop>
2010-03-25  9:32           ` debugging kernel during packet drops Eric Dumazet
2010-03-25 10:35             ` Patrick McHardy
2010-03-25 11:02               ` Eric Dumazet
2010-03-31 12:23                 ` [PATCH nf-next-2.6] xt_hashlimit: RCU conversion Eric Dumazet
2010-04-01 11:03                   ` Patrick McHardy
2010-04-01 12:10                     ` Eric Dumazet
2010-04-01 12:36                       ` Patrick McHardy
2010-03-25 12:42               ` debugging kernel during packet drops Jan Engelhardt
2010-03-30 12:06               ` Jan Engelhardt
2010-03-30 14:12                 ` Patrick McHardy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).