* [PATCH 1/9] rhashtable: Do hashing inside of rhashtable_lookup_compare()
[not found] <cover.1418647641.git.tgraf@suug.ch>
@ 2014-12-15 12:51 ` Thomas Graf
2014-12-15 12:51 ` [PATCH 5/9] nft_hash: Remove rhashtable_remove_pprev() Thomas Graf
2014-12-15 12:51 ` [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking Thomas Graf
2 siblings, 0 replies; 36+ messages in thread
From: Thomas Graf @ 2014-12-15 12:51 UTC (permalink / raw)
To: davem
Cc: netdev, kernel, herbert, paulmck, edumazet, john.r.fastabend,
josh, netfilter-devel
Hash the key inside of rhashtable_lookup_compare() like
rhashtable_lookup() does. This allows the hashing functions to be
simplified and kept private.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: netfilter-devel@vger.kernel.org
---
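A minimal sketch of the caller-side change, condensed from the
af_netlink.c hunk below (local variable names abbreviated); the hash is
now computed internally from the key instead of being passed in by the
caller:

    /* before: caller computed the bucket hash itself */
    hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid));
    sk = rhashtable_lookup_compare(&table->hash, hash,
                                   &netlink_compare, &arg);

    /* after: pass the key, rhashtable_lookup_compare() hashes it */
    sk = rhashtable_lookup_compare(&table->hash, &portid,
                                   &netlink_compare, &arg);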
include/linux/rhashtable.h | 5 +--
lib/rhashtable.c | 91 +++++++++++++++-------------------------------
net/netfilter/nft_hash.c | 46 ++++++++++++++---------
net/netlink/af_netlink.c | 5 +--
4 files changed, 61 insertions(+), 86 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index b93fd89..1b51221 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -96,9 +96,6 @@ static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params);
-u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len);
-u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr);
-
void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node);
bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node);
void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
@@ -111,7 +108,7 @@ int rhashtable_expand(struct rhashtable *ht);
int rhashtable_shrink(struct rhashtable *ht);
void *rhashtable_lookup(const struct rhashtable *ht, const void *key);
-void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash,
+void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
bool (*compare)(void *, void *), void *arg);
void rhashtable_destroy(const struct rhashtable *ht);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6c3c723..1ee0eb6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -42,69 +42,39 @@ static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
return (void *) he - ht->p.head_offset;
}
-static u32 __hashfn(const struct rhashtable *ht, const void *key,
- u32 len, u32 hsize)
+static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash)
{
- u32 h;
-
- h = ht->p.hashfn(key, len, ht->p.hash_rnd);
-
- return h & (hsize - 1);
-}
-
-/**
- * rhashtable_hashfn - compute hash for key of given length
- * @ht: hash table to compute for
- * @key: pointer to key
- * @len: length of key
- *
- * Computes the hash value using the hash function provided in the 'hashfn'
- * of struct rhashtable_params. The returned value is guaranteed to be
- * smaller than the number of buckets in the hash table.
- */
-u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len)
-{
- struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
-
- return __hashfn(ht, key, len, tbl->size);
+ return hash & (tbl->size - 1);
}
-EXPORT_SYMBOL_GPL(rhashtable_hashfn);
-static u32 obj_hashfn(const struct rhashtable *ht, const void *ptr, u32 hsize)
+static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
{
- if (unlikely(!ht->p.key_len)) {
- u32 h;
-
- h = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
+ u32 hash;
- return h & (hsize - 1);
- }
+ if (unlikely(!ht->p.key_len))
+ hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
+ else
+ hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len,
+ ht->p.hash_rnd);
- return __hashfn(ht, ptr + ht->p.key_offset, ht->p.key_len, hsize);
+ return hash;
}
-/**
- * rhashtable_obj_hashfn - compute hash for hashed object
- * @ht: hash table to compute for
- * @ptr: pointer to hashed object
- *
- * Computes the hash value using the hash function `hashfn` respectively
- * 'obj_hashfn' depending on whether the hash table is set up to work with
- * a fixed length key. The returned value is guaranteed to be smaller than
- * the number of buckets in the hash table.
- */
-u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr)
+static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len)
{
struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+ u32 hash;
+
+ hash = ht->p.hashfn(key, len, ht->p.hash_rnd);
- return obj_hashfn(ht, ptr, tbl->size);
+ return rht_bucket_index(tbl, hash);
}
-EXPORT_SYMBOL_GPL(rhashtable_obj_hashfn);
static u32 head_hashfn(const struct rhashtable *ht,
- const struct rhash_head *he, u32 hsize)
+ const struct bucket_table *tbl,
+ const struct rhash_head *he)
{
- return obj_hashfn(ht, rht_obj(ht, he), hsize);
+ return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
}
static struct bucket_table *bucket_table_alloc(size_t nbuckets)
@@ -170,9 +140,9 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
* reaches a node that doesn't hash to the same bucket as the
* previous node p. Call the previous node p;
*/
- h = head_hashfn(ht, p, new_tbl->size);
+ h = head_hashfn(ht, new_tbl, p);
rht_for_each(he, p->next, ht) {
- if (head_hashfn(ht, he, new_tbl->size) != h)
+ if (head_hashfn(ht, new_tbl, he) != h)
break;
p = he;
}
@@ -184,7 +154,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
next = NULL;
if (he) {
rht_for_each(he, he->next, ht) {
- if (head_hashfn(ht, he, new_tbl->size) == h) {
+ if (head_hashfn(ht, new_tbl, he) == h) {
next = he;
break;
}
@@ -237,9 +207,9 @@ int rhashtable_expand(struct rhashtable *ht)
* single imprecise chain.
*/
for (i = 0; i < new_tbl->size; i++) {
- h = i & (old_tbl->size - 1);
+ h = rht_bucket_index(old_tbl, i);
rht_for_each(he, old_tbl->buckets[h], ht) {
- if (head_hashfn(ht, he, new_tbl->size) == i) {
+ if (head_hashfn(ht, new_tbl, he) == i) {
RCU_INIT_POINTER(new_tbl->buckets[i], he);
break;
}
@@ -353,7 +323,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
ASSERT_RHT_MUTEX(ht);
- hash = head_hashfn(ht, obj, tbl->size);
+ hash = head_hashfn(ht, tbl, obj);
RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
rcu_assign_pointer(tbl->buckets[hash], obj);
ht->nelems++;
@@ -413,7 +383,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
ASSERT_RHT_MUTEX(ht);
- h = head_hashfn(ht, obj, tbl->size);
+ h = head_hashfn(ht, tbl, obj);
pprev = &tbl->buckets[h];
rht_for_each(he, tbl->buckets[h], ht) {
@@ -452,7 +422,7 @@ void *rhashtable_lookup(const struct rhashtable *ht, const void *key)
BUG_ON(!ht->p.key_len);
- h = __hashfn(ht, key, ht->p.key_len, tbl->size);
+ h = key_hashfn(ht, key, ht->p.key_len);
rht_for_each_rcu(he, tbl->buckets[h], ht) {
if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
ht->p.key_len))
@@ -467,7 +437,7 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup);
/**
* rhashtable_lookup_compare - search hash table with compare function
* @ht: hash table
- * @hash: hash value of desired entry
+ * @key: the pointer to the key
* @compare: compare function, must return true on match
* @arg: argument passed on to compare function
*
@@ -479,15 +449,14 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup);
*
* Returns the first entry on which the compare function returned true.
*/
-void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash,
+void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
bool (*compare)(void *, void *), void *arg)
{
const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
struct rhash_head *he;
+ u32 hash;
- if (unlikely(hash >= tbl->size))
- return NULL;
-
+ hash = key_hashfn(ht, key, ht->p.key_len);
rht_for_each_rcu(he, tbl->buckets[hash], ht) {
if (!compare(rht_obj(ht, he), arg))
continue;
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 1e316ce..614ee09 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -94,28 +94,40 @@ static void nft_hash_remove(const struct nft_set *set,
kfree(he);
}
+struct nft_compare_arg {
+ const struct nft_set *set;
+ struct nft_set_elem *elem;
+};
+
+static bool nft_hash_compare(void *ptr, void *arg)
+{
+ struct nft_hash_elem *he = ptr;
+ struct nft_compare_arg *x = arg;
+
+ if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) {
+ x->elem->cookie = &he->node;
+ x->elem->flags = 0;
+ if (x->set->flags & NFT_SET_MAP)
+ nft_data_copy(&x->elem->data, he->data);
+
+ return true;
+ }
+
+ return false;
+}
+
static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
{
const struct rhashtable *priv = nft_set_priv(set);
- const struct bucket_table *tbl = rht_dereference_rcu(priv->tbl, priv);
- struct rhash_head __rcu * const *pprev;
- struct nft_hash_elem *he;
- u32 h;
-
- h = rhashtable_hashfn(priv, &elem->key, set->klen);
- pprev = &tbl->buckets[h];
- rht_for_each_entry_rcu(he, tbl->buckets[h], node) {
- if (nft_data_cmp(&he->key, &elem->key, set->klen)) {
- pprev = &he->node.next;
- continue;
- }
+ struct nft_compare_arg arg = {
+ .set = set,
+ .elem = elem,
+ };
- elem->cookie = (void *)pprev;
- elem->flags = 0;
- if (set->flags & NFT_SET_MAP)
- nft_data_copy(&elem->data, he->data);
+ if (rhashtable_lookup_compare(priv, &elem->key,
+ &nft_hash_compare, &arg))
return 0;
- }
+
return -ENOENT;
}
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ef5f77b..dc8354b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1022,11 +1022,8 @@ static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
.net = net,
.portid = portid,
};
- u32 hash;
- hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid));
-
- return rhashtable_lookup_compare(&table->hash, hash,
+ return rhashtable_lookup_compare(&table->hash, &portid,
&netlink_compare, &arg);
}
--
1.9.3
* [PATCH 5/9] nft_hash: Remove rhashtable_remove_pprev()
[not found] <cover.1418647641.git.tgraf@suug.ch>
2014-12-15 12:51 ` [PATCH 1/9] rhashtable: Do hashing inside of rhashtable_lookup_compare() Thomas Graf
@ 2014-12-15 12:51 ` Thomas Graf
2014-12-15 12:51 ` [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking Thomas Graf
2 siblings, 0 replies; 36+ messages in thread
From: Thomas Graf @ 2014-12-15 12:51 UTC (permalink / raw)
To: davem
Cc: netdev, kernel, herbert, paulmck, edumazet, john.r.fastabend,
josh, netfilter-devel
The removal function of nft_hash currently stores a reference to the
previous element during lookup which is used to optimize removal later
on. This was possible because a lock is held across the calls to
rhashtable_lookup() and rhashtable_remove().
With the introduction of deferred table resizing in parallel to lookups
and insertions, the nftables lock will no longer synchronize all
table mutations and the stored pprev may become invalid.
Removing this optimization makes removal slightly more expensive on
average but allows taking the resize cost out of the insert and
remove path.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: netfilter-devel@vger.kernel.org
---
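A minimal sketch of the resulting removal flow in nft_hash, condensed
from the hunks below; the lookup now stores the element itself as the
cookie and removal goes through the regular rhashtable_remove():

    /* nft_hash_compare(): remember the element, not its pprev slot */
    x->elem->cookie = he;

    /* nft_hash_remove(): remove by element, free after a grace period */
    rhashtable_remove(priv, elem->cookie);
    synchronize_rcu();
    kfree(elem->cookie);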
include/linux/rhashtable.h | 2 --
lib/rhashtable.c | 34 +++++++---------------------------
net/netfilter/nft_hash.c | 11 +++--------
3 files changed, 10 insertions(+), 37 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index b54e24a..f624d4b 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -105,8 +105,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params);
void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node);
bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node);
-void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
- struct rhash_head __rcu **pprev);
bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size);
bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 0bd29c1..e6b85c4 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -345,32 +345,6 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
EXPORT_SYMBOL_GPL(rhashtable_insert);
/**
- * rhashtable_remove_pprev - remove object from hash table given previous element
- * @ht: hash table
- * @obj: pointer to hash head inside object
- * @pprev: pointer to previous element
- *
- * Identical to rhashtable_remove() but caller is alreayd aware of the element
- * in front of the element to be deleted. This is in particular useful for
- * deletion when combined with walking or lookup.
- */
-void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj,
- struct rhash_head __rcu **pprev)
-{
- struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
-
- ASSERT_RHT_MUTEX(ht);
-
- RCU_INIT_POINTER(*pprev, obj->next);
- ht->nelems--;
-
- if (ht->p.shrink_decision &&
- ht->p.shrink_decision(ht, tbl->size))
- rhashtable_shrink(ht);
-}
-EXPORT_SYMBOL_GPL(rhashtable_remove_pprev);
-
-/**
* rhashtable_remove - remove object from hash table
* @ht: hash table
* @obj: pointer to hash head inside object
@@ -403,7 +377,13 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
continue;
}
- rhashtable_remove_pprev(ht, he, pprev);
+ RCU_INIT_POINTER(*pprev, he->next);
+ ht->nelems--;
+
+ if (ht->p.shrink_decision &&
+ ht->p.shrink_decision(ht, tbl->size))
+ rhashtable_shrink(ht);
+
return true;
}
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index d93f1f4..7f903cf 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -83,15 +83,10 @@ static void nft_hash_remove(const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct rhashtable *priv = nft_set_priv(set);
- struct rhash_head *he, __rcu **pprev;
-
- pprev = elem->cookie;
- he = rht_dereference((*pprev), priv);
-
- rhashtable_remove_pprev(priv, he, pprev);
+ rhashtable_remove(priv, elem->cookie);
synchronize_rcu();
- kfree(he);
+ kfree(elem->cookie);
}
struct nft_compare_arg {
@@ -105,7 +100,7 @@ static bool nft_hash_compare(void *ptr, void *arg)
struct nft_compare_arg *x = arg;
if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) {
- x->elem->cookie = &he->node;
+ x->elem->cookie = he;
x->elem->flags = 0;
if (x->set->flags & NFT_SET_MAP)
nft_data_copy(&x->elem->data, he->data);
--
1.9.3
* [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
[not found] <cover.1418647641.git.tgraf@suug.ch>
2014-12-15 12:51 ` [PATCH 1/9] rhashtable: Do hashing inside of rhashtable_lookup_compare() Thomas Graf
2014-12-15 12:51 ` [PATCH 5/9] nft_hash: Remove rhashtable_remove_pprev() Thomas Graf
@ 2014-12-15 12:51 ` Thomas Graf
2015-01-16 15:34 ` Patrick McHardy
2 siblings, 1 reply; 36+ messages in thread
From: Thomas Graf @ 2014-12-15 12:51 UTC (permalink / raw)
To: davem
Cc: netdev, kernel, herbert, paulmck, edumazet, john.r.fastabend,
josh, netfilter-devel
Introduces an array of spinlocks to protect bucket mutations. The number
of spinlocks per CPU is configurable; the lock for a bucket is selected
based on the bucket's hash. This allows for parallel insertions and
removals of entries
which do not share a lock.
The patch also defers expansion and shrinking to a worker queue which
allows insertion and removal from atomic context. Insertions and
deletions may occur in parallel to it and are only held up briefly
while the particular bucket is linked or unzipped.
Mutation of the bucket table pointer is protected by a new mutex; read
access is RCU protected.
In the event of an expansion or shrinking, the newly allocated bucket
table is exposed as a so-called future table as soon as the resize
process starts. Lookups, deletions, and insertions will briefly use both
tables. The future table becomes the main table after an RCU grace
period and after the initial linking of the old table to the new one has
been performed. Optimization of the chains to make use of the new number
of buckets only follows once the new table is in use.
The side effect of this is that during that RCU grace period, a bucket
traversal using any rht_for_each() variant on the main table will not
see insertions performed during the grace period, as those land in the
future table at that point. Lookups will see them, as they search both
tables if needed.
Having multiple insertions and removals occur in parallel requires nelems
to become an atomic counter.
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: netfilter-devel@vger.kernel.org
---
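A minimal sketch of the per bucket locking on the insert path, condensed
from the lib/rhashtable.c changes below (RCU read-side markers elided);
the lock is picked by hash from a per-table spinlock array and resizing
is deferred to the worker:

    static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
    {
        return &tbl->locks[hash & tbl->locks_mask];
    }

    /* rhashtable_insert(): lock only the target bucket */
    tbl  = rht_dereference_rcu(ht->future_tbl, ht);
    hash = head_hashfn(ht, tbl, obj);
    lock = bucket_lock(tbl, hash);

    spin_lock_bh(lock);
    RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
    rcu_assign_pointer(tbl->buckets[hash], obj);
    spin_unlock_bh(lock);

    atomic_inc(&ht->nelems);

    /* only kick the worker if no resize is already in flight */
    if (ht->tbl != ht->future_tbl &&
        ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
        schedule_delayed_work(&ht->run_work, 0);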
include/linux/rhashtable.h | 37 ++--
lib/rhashtable.c | 458 ++++++++++++++++++++++++++++++++++-----------
net/netfilter/nft_hash.c | 27 ++-
net/netlink/af_netlink.c | 15 +-
4 files changed, 384 insertions(+), 153 deletions(-)
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index f624d4b..a1688f0 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -19,6 +19,7 @@
#define _LINUX_RHASHTABLE_H
#include <linux/rculist.h>
+#include <linux/workqueue.h>
struct rhash_head {
struct rhash_head __rcu *next;
@@ -26,8 +27,17 @@ struct rhash_head {
#define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL)
+/**
+ * struct bucket_table - Table of hash buckets
+ * @size: Number of hash buckets
+ * @locks_mask: Mask to apply before accessing locks[]
+ * @locks: Array of spinlocks protecting individual buckets
+ * @buckets: size * hash buckets
+ */
struct bucket_table {
size_t size;
+ unsigned int locks_mask;
+ spinlock_t *locks;
struct rhash_head __rcu *buckets[];
};
@@ -45,11 +55,11 @@ struct rhashtable;
* @hash_rnd: Seed to use while hashing
* @max_shift: Maximum number of shifts while expanding
* @min_shift: Minimum number of shifts while shrinking
+ * @locks_mul: Number of bucket locks to allocate per cpu (default: 128)
* @hashfn: Function to hash key
* @obj_hashfn: Function to hash object
* @grow_decision: If defined, may return true if table should expand
* @shrink_decision: If defined, may return true if table should shrink
- * @mutex_is_held: Must return true if protecting mutex is held
*/
struct rhashtable_params {
size_t nelem_hint;
@@ -59,37 +69,42 @@ struct rhashtable_params {
u32 hash_rnd;
size_t max_shift;
size_t min_shift;
+ size_t locks_mul;
rht_hashfn_t hashfn;
rht_obj_hashfn_t obj_hashfn;
bool (*grow_decision)(const struct rhashtable *ht,
size_t new_size);
bool (*shrink_decision)(const struct rhashtable *ht,
size_t new_size);
-#ifdef CONFIG_PROVE_LOCKING
- int (*mutex_is_held)(void *parent);
- void *parent;
-#endif
};
/**
* struct rhashtable - Hash table handle
* @tbl: Bucket table
+ * @future_tbl: Table under construction during expansion/shrinking
* @nelems: Number of elements in table
* @shift: Current size (1 << shift)
* @p: Configuration parameters
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @being_destroyed: True if table is set up for destruction
*/
struct rhashtable {
struct bucket_table __rcu *tbl;
- size_t nelems;
+ struct bucket_table __rcu *future_tbl;
+ atomic_t nelems;
size_t shift;
struct rhashtable_params p;
+ struct delayed_work run_work;
+ struct mutex mutex;
+ bool being_destroyed;
};
#ifdef CONFIG_PROVE_LOCKING
-int lockdep_rht_mutex_is_held(const struct rhashtable *ht);
+int lockdep_rht_mutex_is_held(struct rhashtable *ht);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
#else
-static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
+static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
return 1;
}
@@ -112,11 +127,11 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size);
int rhashtable_expand(struct rhashtable *ht);
int rhashtable_shrink(struct rhashtable *ht);
-void *rhashtable_lookup(const struct rhashtable *ht, const void *key);
-void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
+void *rhashtable_lookup(struct rhashtable *ht, const void *key);
+void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
bool (*compare)(void *, void *), void *arg);
-void rhashtable_destroy(const struct rhashtable *ht);
+void rhashtable_destroy(struct rhashtable *ht);
#define rht_dereference(p, ht) \
rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht))
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index e6b85c4..312e343 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -26,19 +26,42 @@
#define HASH_DEFAULT_SIZE 64UL
#define HASH_MIN_SIZE 4UL
+#define BUCKET_LOCKS_PER_CPU 128UL
+
+enum {
+ RHT_LOCK_NORMAL,
+ RHT_LOCK_NESTED,
+ RHT_LOCK_NESTED2,
+};
+
+/* The bucket lock is selected based on the hash and protects mutations
+ * on a group of hash buckets.
+ *
+ * IMPORTANT: When holding the bucket lock of both the old and new table
+ * during expansions and shrinking, the old bucket lock must always be
+ * acquired first.
+ */
+static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
+{
+ return &tbl->locks[hash & tbl->locks_mask];
+}
#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
+#define ASSERT_BUCKET_LOCK(TBL, HASH) \
+ BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH))
#ifdef CONFIG_PROVE_LOCKING
-int lockdep_rht_mutex_is_held(const struct rhashtable *ht)
+int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
- return ht->p.mutex_is_held(ht->p.parent);
+ return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
- return 1;
+ spinlock_t *lock = bucket_lock(tbl, hash);
+
+ return (debug_locks) ? lockdep_is_held(lock) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#endif
@@ -66,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
return hash;
}
-static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len)
+static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
{
struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
u32 hash;
@@ -95,7 +118,49 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
return pprev;
}
-static struct bucket_table *bucket_table_alloc(size_t nbuckets)
+static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
+{
+ unsigned int i, size;
+#if defined(CONFIG_PROVE_LOCKING)
+ unsigned int nr_pcpus = 2;
+#else
+ unsigned int nr_pcpus = num_possible_cpus();
+#endif
+
+ nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
+ size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
+
+ /* Never allocate more than one lock per bucket */
+ size = min_t(unsigned int, size, tbl->size);
+
+ if (sizeof(spinlock_t) != 0) {
+#ifdef CONFIG_NUMA
+ if (size * sizeof(spinlock_t) > PAGE_SIZE)
+ tbl->locks = vmalloc(size * sizeof(spinlock_t));
+ else
+#endif
+ tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
+ GFP_KERNEL);
+ if (!tbl->locks)
+ return -ENOMEM;
+ for (i = 0; i < size; i++)
+ spin_lock_init(&tbl->locks[i]);
+ }
+ tbl->locks_mask = size - 1;
+
+ return 0;
+}
+
+static void bucket_table_free(const struct bucket_table *tbl)
+{
+ if (tbl)
+ kvfree(tbl->locks);
+
+ kvfree(tbl);
+}
+
+static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
+ size_t nbuckets)
{
struct bucket_table *tbl;
size_t size;
@@ -110,12 +175,12 @@ static struct bucket_table *bucket_table_alloc(size_t nbuckets)
tbl->size = nbuckets;
- return tbl;
-}
+ if (alloc_bucket_locks(ht, tbl) < 0) {
+ bucket_table_free(tbl);
+ return NULL;
+ }
-static void bucket_table_free(const struct bucket_table *tbl)
-{
- kvfree(tbl);
+ return tbl;
}
/**
@@ -126,7 +191,7 @@ static void bucket_table_free(const struct bucket_table *tbl)
bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size)
{
/* Expand table when exceeding 75% load */
- return ht->nelems > (new_size / 4 * 3);
+ return atomic_read(&ht->nelems) > (new_size / 4 * 3);
}
EXPORT_SYMBOL_GPL(rht_grow_above_75);
@@ -138,41 +203,59 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75);
bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
{
/* Shrink table beneath 30% load */
- return ht->nelems < (new_size * 3 / 10);
+ return atomic_read(&ht->nelems) < (new_size * 3 / 10);
}
EXPORT_SYMBOL_GPL(rht_shrink_below_30);
static void hashtable_chain_unzip(const struct rhashtable *ht,
const struct bucket_table *new_tbl,
- struct bucket_table *old_tbl, size_t n)
+ struct bucket_table *old_tbl,
+ size_t old_hash)
{
struct rhash_head *he, *p, *next;
- unsigned int h;
+ spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL;
+ unsigned int new_hash, new_hash2;
+
+ ASSERT_BUCKET_LOCK(old_tbl, old_hash);
/* Old bucket empty, no work needed. */
- p = rht_dereference(old_tbl->buckets[n], ht);
+ p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
+ old_hash);
if (!p)
return;
+ new_hash = new_hash2 = head_hashfn(ht, new_tbl, p);
+ new_bucket_lock = bucket_lock(new_tbl, new_hash);
+
/* Advance the old bucket pointer one or more times until it
* reaches a node that doesn't hash to the same bucket as the
* previous node p. Call the previous node p;
*/
- h = head_hashfn(ht, new_tbl, p);
- rht_for_each_continue(he, p->next, old_tbl, n) {
- if (head_hashfn(ht, new_tbl, he) != h)
+ rht_for_each_continue(he, p->next, old_tbl, old_hash) {
+ new_hash2 = head_hashfn(ht, new_tbl, he);
+ if (new_hash != new_hash2)
break;
p = he;
}
- RCU_INIT_POINTER(old_tbl->buckets[n], p->next);
+ rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
+
+ spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+
+ /* If we have encountered an entry that maps to a different bucket in
+ * the new table, lock down that bucket as well as we might cut off
+ * the end of the chain.
+ */
+ new_bucket_lock2 = bucket_lock(new_tbl, new_hash);
+ if (new_bucket_lock != new_bucket_lock2)
+ spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2);
/* Find the subsequent node which does hash to the same
* bucket as node P, or NULL if no such node exists.
*/
next = NULL;
if (he) {
- rht_for_each_continue(he, he->next, old_tbl, n) {
- if (head_hashfn(ht, new_tbl, he) == h) {
+ rht_for_each_continue(he, he->next, old_tbl, old_hash) {
+ if (head_hashfn(ht, new_tbl, he) == new_hash) {
next = he;
break;
}
@@ -182,7 +265,23 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
/* Set p's next pointer to that subsequent node pointer,
* bypassing the nodes which do not hash to p's bucket
*/
- RCU_INIT_POINTER(p->next, next);
+ rcu_assign_pointer(p->next, next);
+
+ if (new_bucket_lock != new_bucket_lock2)
+ spin_unlock_bh(new_bucket_lock2);
+ spin_unlock_bh(new_bucket_lock);
+}
+
+static void link_old_to_new(struct bucket_table *new_tbl,
+ unsigned int new_hash, struct rhash_head *entry)
+{
+ spinlock_t *new_bucket_lock;
+
+ new_bucket_lock = bucket_lock(new_tbl, new_hash);
+
+ spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED);
+ rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
+ spin_unlock_bh(new_bucket_lock);
}
/**
@@ -195,43 +294,59 @@ static void hashtable_chain_unzip(const struct rhashtable *ht,
* This function may only be called in a context where it is safe to call
* synchronize_rcu(), e.g. not within a rcu_read_lock() section.
*
- * The caller must ensure that no concurrent table mutations take place.
- * It is however valid to have concurrent lookups if they are RCU protected.
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
*/
int rhashtable_expand(struct rhashtable *ht)
{
struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
struct rhash_head *he;
- unsigned int i, h;
- bool complete;
+ spinlock_t *old_bucket_lock;
+ unsigned int new_hash, old_hash;
+ bool complete = false;
ASSERT_RHT_MUTEX(ht);
if (ht->p.max_shift && ht->shift >= ht->p.max_shift)
return 0;
- new_tbl = bucket_table_alloc(old_tbl->size * 2);
+ new_tbl = bucket_table_alloc(ht, old_tbl->size * 2);
if (new_tbl == NULL)
return -ENOMEM;
ht->shift++;
- /* For each new bucket, search the corresponding old bucket
- * for the first entry that hashes to the new bucket, and
- * link the new bucket to that entry. Since all the entries
- * which will end up in the new bucket appear in the same
- * old bucket, this constructs an entirely valid new hash
- * table, but with multiple buckets "zipped" together into a
- * single imprecise chain.
+ /* Make insertions go into the new, empty table right away. Deletions
+ * and lookups will be attempted in both tables until we synchronize.
+ * The synchronize_rcu() guarantees for the new table to be picked up
+ * so no new additions go into the old table while we relink.
+ */
+ rcu_assign_pointer(ht->future_tbl, new_tbl);
+ synchronize_rcu();
+
+ /* For each new bucket, search the corresponding old bucket for the
+ * first entry that hashes to the new bucket, and link the end of
+ * newly formed bucket chain (containing entries added to future
+ * table) to that entry. Since all the entries which will end up in
+ * the new bucket appear in the same old bucket, this constructs an
+ * entirely valid new hash table, but with multiple buckets
+ * "zipped" together into a single imprecise chain.
*/
- for (i = 0; i < new_tbl->size; i++) {
- h = rht_bucket_index(old_tbl, i);
- rht_for_each(he, old_tbl, h) {
- if (head_hashfn(ht, new_tbl, he) == i) {
- RCU_INIT_POINTER(new_tbl->buckets[i], he);
+ for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
+ old_hash = rht_bucket_index(old_tbl, new_hash);
+ old_bucket_lock = bucket_lock(old_tbl, old_hash);
+
+ spin_lock_bh(old_bucket_lock);
+ rht_for_each(he, old_tbl, old_hash) {
+ if (head_hashfn(ht, new_tbl, he) == new_hash) {
+ link_old_to_new(new_tbl, new_hash, he);
break;
}
}
+ spin_unlock_bh(old_bucket_lock);
}
/* Publish the new table pointer. Lookups may now traverse
@@ -241,7 +356,7 @@ int rhashtable_expand(struct rhashtable *ht)
rcu_assign_pointer(ht->tbl, new_tbl);
/* Unzip interleaved hash chains */
- do {
+ while (!complete && !ht->being_destroyed) {
/* Wait for readers. All new readers will see the new
* table, and thus no references to the old table will
* remain.
@@ -253,12 +368,17 @@ int rhashtable_expand(struct rhashtable *ht)
* table): ...
*/
complete = true;
- for (i = 0; i < old_tbl->size; i++) {
- hashtable_chain_unzip(ht, new_tbl, old_tbl, i);
- if (old_tbl->buckets[i] != NULL)
+ for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+ old_bucket_lock = bucket_lock(old_tbl, old_hash);
+ spin_lock_bh(old_bucket_lock);
+
+ hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash);
+ if (old_tbl->buckets[old_hash] != NULL)
complete = false;
+
+ spin_unlock_bh(old_bucket_lock);
}
- } while (!complete);
+ }
bucket_table_free(old_tbl);
return 0;
@@ -272,38 +392,65 @@ EXPORT_SYMBOL_GPL(rhashtable_expand);
* This function may only be called in a context where it is safe to call
* synchronize_rcu(), e.g. not within a rcu_read_lock() section.
*
+ * The caller must ensure that no concurrent resizing occurs by holding
+ * ht->mutex.
+ *
* The caller must ensure that no concurrent table mutations take place.
* It is however valid to have concurrent lookups if they are RCU protected.
+ *
+ * It is valid to have concurrent insertions and deletions protected by per
+ * bucket locks or concurrent RCU protected lookups and traversals.
*/
int rhashtable_shrink(struct rhashtable *ht)
{
- struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht);
- unsigned int i;
+ struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
+ spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2;
+ unsigned int new_hash;
ASSERT_RHT_MUTEX(ht);
if (ht->shift <= ht->p.min_shift)
return 0;
- ntbl = bucket_table_alloc(tbl->size / 2);
- if (ntbl == NULL)
+ new_tbl = bucket_table_alloc(ht, tbl->size / 2);
+ if (new_tbl == NULL)
return -ENOMEM;
- ht->shift--;
+ rcu_assign_pointer(ht->future_tbl, new_tbl);
+ synchronize_rcu();
- /* Link each bucket in the new table to the first bucket
- * in the old table that contains entries which will hash
- * to the new bucket.
+ /* Link the first entry in the old bucket to the end of the
+ * bucket in the new table. As entries are concurrently being
+ * added to the new table, lock down the new bucket. As we
+ * always divide the size in half when shrinking, each bucket
+ * in the new table maps to exactly two buckets in the old
+ * table.
+ *
+ * As removals can occur concurrently on the old table, we need
+ * to lock down both matching buckets in the old table.
*/
- for (i = 0; i < ntbl->size; i++) {
- ntbl->buckets[i] = tbl->buckets[i];
- RCU_INIT_POINTER(*bucket_tail(ntbl, i),
- tbl->buckets[i + ntbl->size]);
-
+ for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
+ old_bucket_lock1 = bucket_lock(tbl, new_hash);
+ old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size);
+ new_bucket_lock = bucket_lock(new_tbl, new_hash);
+
+ spin_lock_bh(old_bucket_lock1);
+ spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED);
+ spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2);
+
+ rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
+ tbl->buckets[new_hash]);
+ rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
+ tbl->buckets[new_hash + new_tbl->size]);
+
+ spin_unlock_bh(new_bucket_lock);
+ spin_unlock_bh(old_bucket_lock2);
+ spin_unlock_bh(old_bucket_lock1);
}
/* Publish the new, valid hash table */
- rcu_assign_pointer(ht->tbl, ntbl);
+ rcu_assign_pointer(ht->tbl, new_tbl);
+ ht->shift--;
/* Wait for readers. No new readers will have references to the
* old hash table.
@@ -316,31 +463,63 @@ int rhashtable_shrink(struct rhashtable *ht)
}
EXPORT_SYMBOL_GPL(rhashtable_shrink);
+static void rht_deferred_worker(struct work_struct *work)
+{
+ struct rhashtable *ht;
+ struct bucket_table *tbl;
+
+ ht = container_of(work, struct rhashtable, run_work.work);
+ mutex_lock(&ht->mutex);
+ tbl = rht_dereference(ht->tbl, ht);
+
+ if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
+ rhashtable_expand(ht);
+ else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size))
+ rhashtable_shrink(ht);
+
+ mutex_unlock(&ht->mutex);
+}
+
/**
* rhashtable_insert - insert object into hash hash table
* @ht: hash table
* @obj: pointer to hash head inside object
*
- * Will automatically grow the table via rhashtable_expand() if the the
- * grow_decision function specified at rhashtable_init() returns true.
+ * Will take a per bucket spinlock to protect against mutual mutations
+ * on the same bucket. Multiple insertions may occur in parallel unless
+ * they map to the same bucket lock.
*
- * The caller must ensure that no concurrent table mutations occur. It is
- * however valid to have concurrent lookups if they are RCU protected.
+ * It is safe to call this function from atomic context.
+ *
+ * Will trigger an automatic deferred table resizing if the size grows
+ * beyond the watermark indicated by grow_decision() which can be passed
+ * to rhashtable_init().
*/
void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
{
- struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
- u32 hash;
+ struct bucket_table *tbl;
+ spinlock_t *lock;
+ unsigned hash;
- ASSERT_RHT_MUTEX(ht);
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(ht->future_tbl, ht);
hash = head_hashfn(ht, tbl, obj);
+ lock = bucket_lock(tbl, hash);
+
+ spin_lock_bh(lock);
RCU_INIT_POINTER(obj->next, tbl->buckets[hash]);
rcu_assign_pointer(tbl->buckets[hash], obj);
- ht->nelems++;
+ spin_unlock_bh(lock);
- if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
- rhashtable_expand(ht);
+ atomic_inc(&ht->nelems);
+
+ /* Only grow the table if no resizing is currently in progress. */
+ if (ht->tbl != ht->future_tbl &&
+ ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size))
+ schedule_delayed_work(&ht->run_work, 0);
+
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rhashtable_insert);
@@ -361,32 +540,56 @@ EXPORT_SYMBOL_GPL(rhashtable_insert);
*/
bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
{
- struct bucket_table *tbl = rht_dereference(ht->tbl, ht);
+ struct bucket_table *tbl;
struct rhash_head __rcu **pprev;
struct rhash_head *he;
- u32 h;
+ spinlock_t *lock;
+ unsigned int hash;
- ASSERT_RHT_MUTEX(ht);
+ rcu_read_lock();
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+ hash = head_hashfn(ht, tbl, obj);
- h = head_hashfn(ht, tbl, obj);
+ lock = bucket_lock(tbl, hash);
+ spin_lock_bh(lock);
- pprev = &tbl->buckets[h];
- rht_for_each(he, tbl, h) {
+restart:
+ pprev = &tbl->buckets[hash];
+ rht_for_each(he, tbl, hash) {
if (he != obj) {
pprev = &he->next;
continue;
}
- RCU_INIT_POINTER(*pprev, he->next);
- ht->nelems--;
+ rcu_assign_pointer(*pprev, obj->next);
+ atomic_dec(&ht->nelems);
- if (ht->p.shrink_decision &&
+ spin_unlock_bh(lock);
+
+ if (ht->tbl != ht->future_tbl &&
+ ht->p.shrink_decision &&
ht->p.shrink_decision(ht, tbl->size))
- rhashtable_shrink(ht);
+ schedule_delayed_work(&ht->run_work, 0);
+
+ rcu_read_unlock();
return true;
}
+ if (tbl != rht_dereference_rcu(ht->tbl, ht)) {
+ spin_unlock_bh(lock);
+
+ tbl = rht_dereference_rcu(ht->tbl, ht);
+ hash = head_hashfn(ht, tbl, obj);
+
+ lock = bucket_lock(tbl, hash);
+ spin_lock_bh(lock);
+ goto restart;
+ }
+
+ spin_unlock_bh(lock);
+ rcu_read_unlock();
+
return false;
}
EXPORT_SYMBOL_GPL(rhashtable_remove);
@@ -402,25 +605,35 @@ EXPORT_SYMBOL_GPL(rhashtable_remove);
* This lookup function may only be used for fixed key hash table (key_len
* paramter set). It will BUG() if used inappropriately.
*
- * Lookups may occur in parallel with hash mutations as long as the lookup is
- * guarded by rcu_read_lock(). The caller must take care of this.
+ * Lookups may occur in parallel with hashtable mutations and resizing.
*/
-void *rhashtable_lookup(const struct rhashtable *ht, const void *key)
+void *rhashtable_lookup(struct rhashtable *ht, const void *key)
{
- const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+ const struct bucket_table *tbl, *old_tbl;
struct rhash_head *he;
- u32 h;
+ u32 hash;
BUG_ON(!ht->p.key_len);
- h = key_hashfn(ht, key, ht->p.key_len);
- rht_for_each_rcu(he, tbl, h) {
+ rcu_read_lock();
+ old_tbl = rht_dereference_rcu(ht->tbl, ht);
+ tbl = rht_dereference_rcu(ht->future_tbl, ht);
+ hash = key_hashfn(ht, key, ht->p.key_len);
+restart:
+ rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key,
ht->p.key_len))
continue;
+ rcu_read_unlock();
return rht_obj(ht, he);
}
+ if (unlikely(tbl != old_tbl)) {
+ tbl = old_tbl;
+ goto restart;
+ }
+
+ rcu_read_unlock();
return NULL;
}
EXPORT_SYMBOL_GPL(rhashtable_lookup);
@@ -435,25 +648,36 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup);
* Traverses the bucket chain behind the provided hash value and calls the
* specified compare function for each entry.
*
- * Lookups may occur in parallel with hash mutations as long as the lookup is
- * guarded by rcu_read_lock(). The caller must take care of this.
+ * Lookups may occur in parallel with hashtable mutations and resizing.
*
* Returns the first entry on which the compare function returned true.
*/
-void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key,
+void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
bool (*compare)(void *, void *), void *arg)
{
- const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+ const struct bucket_table *tbl, *old_tbl;
struct rhash_head *he;
u32 hash;
+ rcu_read_lock();
+
+ old_tbl = rht_dereference_rcu(ht->tbl, ht);
+ tbl = rht_dereference_rcu(ht->future_tbl, ht);
hash = key_hashfn(ht, key, ht->p.key_len);
- rht_for_each_rcu(he, tbl, hash) {
+restart:
+ rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
if (!compare(rht_obj(ht, he), arg))
continue;
+ rcu_read_unlock();
return rht_obj(ht, he);
}
+ if (unlikely(tbl != old_tbl)) {
+ tbl = old_tbl;
+ goto restart;
+ }
+ rcu_read_unlock();
+
return NULL;
}
EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
@@ -485,9 +709,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
* .key_offset = offsetof(struct test_obj, key),
* .key_len = sizeof(int),
* .hashfn = jhash,
- * #ifdef CONFIG_PROVE_LOCKING
- * .mutex_is_held = &my_mutex_is_held,
- * #endif
* };
*
* Configuration Example 2: Variable length keys
@@ -507,9 +728,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params)
* .head_offset = offsetof(struct test_obj, node),
* .hashfn = jhash,
* .obj_hashfn = my_hash_fn,
- * #ifdef CONFIG_PROVE_LOCKING
- * .mutex_is_held = &my_mutex_is_held,
- * #endif
* };
*/
int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
@@ -529,18 +747,29 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
if (params->nelem_hint)
size = rounded_hashtable_size(params);
- tbl = bucket_table_alloc(size);
+ memset(ht, 0, sizeof(*ht));
+ mutex_init(&ht->mutex);
+ memcpy(&ht->p, params, sizeof(*params));
+
+ if (params->locks_mul)
+ ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
+ else
+ ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
+
+ tbl = bucket_table_alloc(ht, size);
if (tbl == NULL)
return -ENOMEM;
- memset(ht, 0, sizeof(*ht));
ht->shift = ilog2(tbl->size);
- memcpy(&ht->p, params, sizeof(*params));
RCU_INIT_POINTER(ht->tbl, tbl);
+ RCU_INIT_POINTER(ht->future_tbl, tbl);
if (!ht->p.hash_rnd)
get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
+ if (ht->p.grow_decision || ht->p.shrink_decision)
+ INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker);
+
return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_init);
@@ -553,9 +782,16 @@ EXPORT_SYMBOL_GPL(rhashtable_init);
* has to make sure that no resizing may happen by unpublishing the hashtable
* and waiting for the quiescent cycle before releasing the bucket array.
*/
-void rhashtable_destroy(const struct rhashtable *ht)
+void rhashtable_destroy(struct rhashtable *ht)
{
- bucket_table_free(ht->tbl);
+ ht->being_destroyed = true;
+
+ mutex_lock(&ht->mutex);
+
+ cancel_delayed_work(&ht->run_work);
+ bucket_table_free(rht_dereference(ht->tbl, ht));
+
+ mutex_unlock(&ht->mutex);
}
EXPORT_SYMBOL_GPL(rhashtable_destroy);
@@ -570,13 +806,6 @@ EXPORT_SYMBOL_GPL(rhashtable_destroy);
#define TEST_PTR ((void *) 0xdeadbeef)
#define TEST_NEXPANDS 4
-#ifdef CONFIG_PROVE_LOCKING
-static int test_mutex_is_held(void *parent)
-{
- return 1;
-}
-#endif
-
struct test_obj {
void *ptr;
int value;
@@ -646,10 +875,10 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet)
i, tbl->buckets[i], cnt);
}
- pr_info(" Traversal complete: counted=%u, nelems=%zu, entries=%d\n",
- total, ht->nelems, TEST_ENTRIES);
+ pr_info(" Traversal complete: counted=%u, nelems=%u, entries=%d\n",
+ total, atomic_read(&ht->nelems), TEST_ENTRIES);
- if (total != ht->nelems || total != TEST_ENTRIES)
+ if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES)
pr_warn("Test failed: Total count mismatch ^^^");
}
@@ -688,7 +917,9 @@ static int __init test_rhashtable(struct rhashtable *ht)
for (i = 0; i < TEST_NEXPANDS; i++) {
pr_info(" Table expansion iteration %u...\n", i);
+ mutex_lock(&ht->mutex);
rhashtable_expand(ht);
+ mutex_unlock(&ht->mutex);
rcu_read_lock();
pr_info(" Verifying lookups...\n");
@@ -698,7 +929,9 @@ static int __init test_rhashtable(struct rhashtable *ht)
for (i = 0; i < TEST_NEXPANDS; i++) {
pr_info(" Table shrinkage iteration %u...\n", i);
+ mutex_lock(&ht->mutex);
rhashtable_shrink(ht);
+ mutex_unlock(&ht->mutex);
rcu_read_lock();
pr_info(" Verifying lookups...\n");
@@ -741,9 +974,6 @@ static int __init test_rht_init(void)
.key_offset = offsetof(struct test_obj, value),
.key_len = sizeof(int),
.hashfn = jhash,
-#ifdef CONFIG_PROVE_LOCKING
- .mutex_is_held = &test_mutex_is_held,
-#endif
.grow_decision = rht_grow_above_75,
.shrink_decision = rht_shrink_below_30,
};
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 7f903cf..75887d7 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -33,7 +33,7 @@ static bool nft_hash_lookup(const struct nft_set *set,
const struct nft_data *key,
struct nft_data *data)
{
- const struct rhashtable *priv = nft_set_priv(set);
+ struct rhashtable *priv = nft_set_priv(set);
const struct nft_hash_elem *he;
he = rhashtable_lookup(priv, key);
@@ -113,7 +113,7 @@ static bool nft_hash_compare(void *ptr, void *arg)
static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
{
- const struct rhashtable *priv = nft_set_priv(set);
+ struct rhashtable *priv = nft_set_priv(set);
struct nft_compare_arg arg = {
.set = set,
.elem = elem,
@@ -129,7 +129,7 @@ static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
struct nft_set_iter *iter)
{
- const struct rhashtable *priv = nft_set_priv(set);
+ struct rhashtable *priv = nft_set_priv(set);
const struct bucket_table *tbl;
const struct nft_hash_elem *he;
struct nft_set_elem elem;
@@ -162,13 +162,6 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
return sizeof(struct rhashtable);
}
-#ifdef CONFIG_PROVE_LOCKING
-static int lockdep_nfnl_lock_is_held(void *parent)
-{
- return lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES);
-}
-#endif
-
static int nft_hash_init(const struct nft_set *set,
const struct nft_set_desc *desc,
const struct nlattr * const tb[])
@@ -182,9 +175,6 @@ static int nft_hash_init(const struct nft_set *set,
.hashfn = jhash,
.grow_decision = rht_grow_above_75,
.shrink_decision = rht_shrink_below_30,
-#ifdef CONFIG_PROVE_LOCKING
- .mutex_is_held = lockdep_nfnl_lock_is_held,
-#endif
};
return rhashtable_init(priv, &params);
@@ -192,16 +182,23 @@ static int nft_hash_init(const struct nft_set *set,
static void nft_hash_destroy(const struct nft_set *set)
{
- const struct rhashtable *priv = nft_set_priv(set);
- const struct bucket_table *tbl = priv->tbl;
+ struct rhashtable *priv = nft_set_priv(set);
+ const struct bucket_table *tbl;
struct nft_hash_elem *he;
struct rhash_head *pos, *next;
unsigned int i;
+ /* Stop an eventual async resizing */
+ priv->being_destroyed = true;
+ mutex_lock(&priv->mutex);
+
+ tbl = rht_dereference(priv->tbl, priv);
for (i = 0; i < tbl->size; i++) {
rht_for_each_entry_safe(he, pos, next, tbl, i, node)
nft_hash_elem_destroy(set, he);
}
+ mutex_unlock(&priv->mutex);
+
rhashtable_destroy(priv);
}
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index bfae4be..b38b148 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -114,15 +114,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
DEFINE_MUTEX(nl_sk_hash_lock);
EXPORT_SYMBOL_GPL(nl_sk_hash_lock);
-#ifdef CONFIG_PROVE_LOCKING
-static int lockdep_nl_sk_hash_is_held(void *parent)
-{
- if (debug_locks)
- return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock);
- return 1;
-}
-#endif
-
static ATOMIC_NOTIFIER_HEAD(netlink_chain);
static DEFINE_SPINLOCK(netlink_tap_lock);
@@ -1083,7 +1074,8 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
goto err;
err = -ENOMEM;
- if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX))
+ if (BITS_PER_LONG > 32 &&
+ unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
goto err;
nlk_sk(sk)->portid = portid;
@@ -3134,9 +3126,6 @@ static int __init netlink_proto_init(void)
.max_shift = 16, /* 64K */
.grow_decision = rht_grow_above_75,
.shrink_decision = rht_shrink_below_30,
-#ifdef CONFIG_PROVE_LOCKING
- .mutex_is_held = lockdep_nl_sk_hash_is_held,
-#endif
};
if (err != 0)
--
1.9.3
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2014-12-15 12:51 ` [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking Thomas Graf
@ 2015-01-16 15:34 ` Patrick McHardy
2015-01-16 15:58 ` Thomas Graf
0 siblings, 1 reply; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 15:34 UTC (permalink / raw)
To: Thomas Graf
Cc: davem, netdev, kernel, herbert, paulmck, edumazet,
john.r.fastabend, josh, netfilter-devel
On 15.12, Thomas Graf wrote:
> The patch also defers expansion and shrinking to a worker queue which
> allows insertion and removal from atomic context. Insertions and
> deletions may occur in parallel to it and are only held up briefly
> while the particular bucket is linked or unzipped.
>
> Mutations of the bucket table pointer is protected by a new mutex, read
> access is RCU protected.
>
> In the event of an expansion or shrinking, the new bucket table allocated
> is exposed as a so called future table as soon as the resize process
> starts. Lookups, deletions, and insertions will briefly use both tables.
> The future table becomes the main table after an RCU grace period and
> initial linking of the old to the new table was performed. Optimization
> of the chains to make use of the new number of buckets follows only the
> new table is in use.
AFAICT nft_hash_walk() will miss new entries during this period.
Am I missing anything here?
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 15:34 ` Patrick McHardy
@ 2015-01-16 15:58 ` Thomas Graf
2015-01-16 16:03 ` Patrick McHardy
0 siblings, 1 reply; 36+ messages in thread
From: Thomas Graf @ 2015-01-16 15:58 UTC (permalink / raw)
To: Patrick McHardy
Cc: davem, netdev, kernel, herbert, paulmck, edumazet,
john.r.fastabend, josh, netfilter-devel
On 01/16/15 at 03:34pm, Patrick McHardy wrote:
> On 15.12, Thomas Graf wrote:
> > In the event of an expansion or shrinking, the new bucket table allocated
> > is exposed as a so called future table as soon as the resize process
> > starts. Lookups, deletions, and insertions will briefly use both tables.
> > The future table becomes the main table after an RCU grace period and
> > initial linking of the old to the new table was performed. Optimization
> > of the chains to make use of the new number of buckets follows only the
> > new table is in use.
>
> AFAICT nft_hash_walk() will miss new entries during this period.
> Am I missing anything here?
A walker may not see insertions that occur after the walker was started
if resizing is enabled. Is that a problem for nftables?
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 15:58 ` Thomas Graf
@ 2015-01-16 16:03 ` Patrick McHardy
2015-01-16 16:15 ` Thomas Graf
0 siblings, 1 reply; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 16:03 UTC (permalink / raw)
To: Thomas Graf
Cc: davem, netdev, kernel, herbert, paulmck, edumazet,
john.r.fastabend, josh, netfilter-devel
On 16.01, Thomas Graf wrote:
> On 01/16/15 at 03:34pm, Patrick McHardy wrote:
> > On 15.12, Thomas Graf wrote:
> > > In the event of an expansion or shrinking, the new bucket table allocated
> > > is exposed as a so called future table as soon as the resize process
> > > starts. Lookups, deletions, and insertions will briefly use both tables.
> > > The future table becomes the main table after an RCU grace period and
> > > initial linking of the old to the new table was performed. Optimization
> > > of the chains to make use of the new number of buckets follows only the
> > > new table is in use.
> >
> > AFAICT nft_hash_walk() will miss new entries during this period.
> > Am I missing anything here?
>
> A walker may not see insertions that occur after the walker was started
> if resizing is enabled. Is that a problem for nftables?
No, that would be Ok. The case I'm wondering about is:
- insertion
- resize starts
- another insertion
- walker, resize not finished yet
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 16:03 ` Patrick McHardy
@ 2015-01-16 16:15 ` Thomas Graf
2015-01-16 16:32 ` Patrick McHardy
0 siblings, 1 reply; 36+ messages in thread
From: Thomas Graf @ 2015-01-16 16:15 UTC (permalink / raw)
To: Patrick McHardy
Cc: davem, netdev, kernel, herbert, paulmck, edumazet,
john.r.fastabend, josh, netfilter-devel
On 01/16/15 at 04:03pm, Patrick McHardy wrote:
> On 16.01, Thomas Graf wrote:
> > A walker may not see insertions that occur after the walker was started
> > if resizing is enabled. Is that a problem for nftables?
>
> No, that would be Ok. The case I'm wondering about is:
>
> - insertion
> - resize starts
> - another insertion
> - walker, resize not finished yet
Correct, a walker may not see "another insertion". The window for this
behavior to occur is not the full resize operation, just the linking
period, but it may occur. The length of the window is typically
equal to a grace period.
We can provide a synchronization function to block the dumper or the
insertion/removal until the linking is complete. The latter would
give the old runtime behaviour back (variable runtime of insert);
blocking the dumper might be preferred. What do you think?
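A minimal sketch of that option on the nft_hash side, assuming the dump
path may sleep and simply serializes against resizes via the new
ht->mutex (hypothetical placement, not part of this series):

    mutex_lock(&priv->mutex);
    tbl = rht_dereference(priv->tbl, priv);
    for (i = 0; i < tbl->size; i++) {
        /* walk bucket i as nft_hash_walk() does today */
    }
    mutex_unlock(&priv->mutex);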
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 16:15 ` Thomas Graf
@ 2015-01-16 16:32 ` Patrick McHardy
2015-01-16 16:38 ` Thomas Graf
2015-01-16 16:43 ` David Laight
0 siblings, 2 replies; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 16:32 UTC (permalink / raw)
To: Thomas Graf
Cc: davem, netdev, kernel, herbert, paulmck, edumazet,
john.r.fastabend, josh, netfilter-devel
On 16.01, Thomas Graf wrote:
> On 01/16/15 at 04:03pm, Patrick McHardy wrote:
> > On 16.01, Thomas Graf wrote:
> > > A walker may not see insertions that occur after the walker was started
> > > if resizing is enabled. Is that a problem for nftables?
> >
> > No, that would be Ok. The case I'm wondering about is:
> >
> > - insertion
> > - resize starts
> > - another insertion
> > - walker, resize not finished yet
>
> Correct, walker may not see "another insertion". The window for this
> behavior to occur is not the full resize operation, just the linking
> period, but it may occur. The length of the window is typically
> equal to a grace period.
>
> We can provide a synchronization function to block the dumper or the
> insertion/removal until the linking is complete. The latter would
> give the old runtime behaviour back (variable runtime of insert),
> the blocked dumper might be preferred. What do you think?
If we have to block, the dumper is of course preferred. Taking the
mutex should do fine I guess?
I suppose walking both tables without any races would be rather
complicated.
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 16:32 ` Patrick McHardy
@ 2015-01-16 16:38 ` Thomas Graf
2015-01-16 16:51 ` Patrick McHardy
2015-01-16 16:43 ` David Laight
1 sibling, 1 reply; 36+ messages in thread
From: Thomas Graf @ 2015-01-16 16:38 UTC (permalink / raw)
To: Patrick McHardy
Cc: davem, netdev, kernel, herbert, paulmck, edumazet,
john.r.fastabend, josh, netfilter-devel
On 01/16/15 at 04:32pm, Patrick McHardy wrote:
> If we have to block, the dumper if of course preferred. Taking the
> mutex should do fine I guess?
That will work; it will also ensure that the walker doesn't see the
pre-unzip state, which would cause entries to be duplicated in the
walker.
> I suppose walking both tables without any races would be rather
> complicated.
Very interested if you can come up with something ;-)
* RE: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 16:32 ` Patrick McHardy
2015-01-16 16:38 ` Thomas Graf
@ 2015-01-16 16:43 ` David Laight
2015-01-16 16:53 ` Thomas Graf
1 sibling, 1 reply; 36+ messages in thread
From: David Laight @ 2015-01-16 16:43 UTC (permalink / raw)
To: 'Patrick McHardy', Thomas Graf
Cc: davem@davemloft.net, netdev@vger.kernel.org,
kernel@vger.kernel.org, herbert@gondor.apana.org.au,
paulmck@linux.vnet.ibm.com, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org,
netfilter-devel@vger.kernel.org
From: Patrick McHardy
> On 16.01, Thomas Graf wrote:
> > On 01/16/15 at 04:03pm, Patrick McHardy wrote:
> > > On 16.01, Thomas Graf wrote:
> > > > A walker may not see insertions that occur after the walker was started
> > > > if resizing is enabled. Is that a problem for nftables?
> > >
> > > No, that would be Ok. The case I'm wondering about is:
> > >
> > > - insertion
> > > - resize starts
> > > - another insertion
> > > - walker, resize not finished yet
> >
> > Correct, walker may not see "another insertion". The window for this
> > behavior to occur is not the full resize operation, just the linking
> > period, but it may occur. The length of the window is typically
> > equal to a grace period.
> >
> > We can provide a synchronization function to block the dumper or the
> > insertion/removal until the linking is complete. The latter would
> > give the old runtime behaviour back (variable runtime of insert),
> > the blocked dumper might be preferred. What do you think?
>
> If we have to block, the dumper if of course preferred. Taking the
> mutex should do fine I guess?
>
> I suppose walking both tables without any races would be rather
> complicated.
The walker is unlikely to see items that get inserted early in the hash
table even without a resize.
I'd be more worried about the walker missing big blocks of entries or
getting duplicate entries during a resize.
This might be a problem if the walker is a user-process table dump,
in which case you can't assume it will finish in any finite time.
David
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 16:38 ` Thomas Graf
@ 2015-01-16 16:51 ` Patrick McHardy
0 siblings, 0 replies; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 16:51 UTC (permalink / raw)
To: Thomas Graf
Cc: davem, netdev, kernel, herbert, paulmck, edumazet,
john.r.fastabend, josh, netfilter-devel
On 16.01, Thomas Graf wrote:
> On 01/16/15 at 04:32pm, Patrick McHardy wrote:
> > If we have to block, the dumper if of course preferred. Taking the
> > mutex should do fine I guess?
>
> That will work, it will also ensure that the walker doesn't see the
> pre unzip state which would cause entries to be duplicated in the
> walker.
>
> > I suppose walking both tables without any races would be rather
> > complicated.
>
> Very interested if you can come up with something ;-)
For now I'll stick to the mutex. Thanks!
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 16:43 ` David Laight
@ 2015-01-16 16:53 ` Thomas Graf
2015-01-16 18:36 ` Patrick McHardy
0 siblings, 1 reply; 36+ messages in thread
From: Thomas Graf @ 2015-01-16 16:53 UTC (permalink / raw)
To: David Laight
Cc: 'Patrick McHardy', davem@davemloft.net,
netdev@vger.kernel.org, herbert@gondor.apana.org.au,
paulmck@linux.vnet.ibm.com, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org,
netfilter-devel@vger.kernel.org
On 01/16/15 at 04:43pm, David Laight wrote:
> The walker is unlikely to see items that get inserted early in the hash
> table even without a resize.
I don't follow, you have to explain this statement.
Walkers which don't want to see duplicates or miss entries should
just take the mutex.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 16:53 ` Thomas Graf
@ 2015-01-16 18:36 ` Patrick McHardy
2015-01-16 19:18 ` Thomas Graf
0 siblings, 1 reply; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 18:36 UTC (permalink / raw)
To: Thomas Graf
Cc: David Laight, davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 16.01, Thomas Graf wrote:
> On 01/16/15 at 04:43pm, David Laight wrote:
> > The walker is unlikely to see items that get inserted early in the hash
> > table even without a resize.
>
> I don't follow, you have to explain this statement.
>
> Walkers which don't want to see duplicates or miss entries should
> just take the mutex.
Well, we do have a problem with interrupted dumps. As you know, once
the netlink message buffer is full, we return to userspace and
continue dumping during the next read. Expanding obviously changes
the order since we rehash from bucket N to N and 2N, so this will
indeed cause duplicates (which don't matter) and missed entries.
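To make the reordering concrete, a tiny illustration (sketch only; it
just assumes the usual power-of-two bucket masking):

/* Before the expansion an entry sits in bucket (hash & (old_size - 1));
 * after the table doubles it sits in bucket (hash & (2 * old_size - 1)),
 * so a dump that resumes by bucket index no longer visits entries in
 * the old order.
 */
static inline u32 bucket_before_expand(u32 hash, u32 old_size)
{
	return hash & (old_size - 1);
}

static inline u32 bucket_after_expand(u32 hash, u32 old_size)
{
	return hash & (2 * old_size - 1);
}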
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 18:36 ` Patrick McHardy
@ 2015-01-16 19:18 ` Thomas Graf
2015-01-16 19:35 ` Patrick McHardy
0 siblings, 1 reply; 36+ messages in thread
From: Thomas Graf @ 2015-01-16 19:18 UTC (permalink / raw)
To: Patrick McHardy
Cc: David Laight, davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 01/16/15 at 06:36pm, Patrick McHardy wrote:
> On 16.01, Thomas Graf wrote:
> > On 01/16/15 at 04:43pm, David Laight wrote:
> > > The walker is unlikely to see items that get inserted early in the hash
> > > table even without a resize.
> >
> > I don't follow, you have to explain this statement.
> >
> > Walkers which don't want to see duplicates or miss entries should
> > just take the mutex.
>
> Well, we do have a problem with interrupted dumps. As you know once
> the netlink message buffer is full, we return to userspace and
> continue dumping during the next read. Expanding obviously changes
> the order since we rehash from bucket N to N and 2N, so this will
> indeed cause duplicate (doesn't matter) and missed entries.
Right, but that's a Netlink dump issue and not specific to rhashtable.
Putting the sequence number check in place should be sufficient
for sets, right?
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 19:18 ` Thomas Graf
@ 2015-01-16 19:35 ` Patrick McHardy
2015-01-16 20:46 ` Pablo Neira Ayuso
` (2 more replies)
0 siblings, 3 replies; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 19:35 UTC (permalink / raw)
To: Thomas Graf
Cc: David Laight, davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 16.01, Thomas Graf wrote:
> On 01/16/15 at 06:36pm, Patrick McHardy wrote:
> > On 16.01, Thomas Graf wrote:
> > > On 01/16/15 at 04:43pm, David Laight wrote:
> > > > The walker is unlikely to see items that get inserted early in the hash
> > > > table even without a resize.
> > >
> > > I don't follow, you have to explain this statement.
> > >
> > > Walkers which don't want to see duplicates or miss entries should
> > > just take the mutex.
> >
> > Well, we do have a problem with interrupted dumps. As you know once
> > the netlink message buffer is full, we return to userspace and
> > continue dumping during the next read. Expanding obviously changes
> > the order since we rehash from bucket N to N and 2N, so this will
> > indeed cause duplicate (doesn't matter) and missed entries.
>
> Right,but that's a Netlink dump issue and not specific to rhashtable.
Well, rhashtable (or resizing in general) will make it a lot worse.
Usually we at worst miss entries which were added during the dump,
which is made up for by the notifications.
With resizing we might miss anything; it's completely nondeterministic.
> Putting the sequence number check in place should be sufficient
> for sets, right?
I don't see how. The problem is that the ordering of the hash changes
and it will skip different entries than those that have already been
dumped.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 19:35 ` Patrick McHardy
@ 2015-01-16 20:46 ` Pablo Neira Ayuso
2015-01-16 20:53 ` Patrick McHardy
2015-01-19 9:01 ` Paul E. McKenney
2015-01-16 20:49 ` Herbert Xu
2015-01-16 21:36 ` Thomas Graf
2 siblings, 2 replies; 36+ messages in thread
From: Pablo Neira Ayuso @ 2015-01-16 20:46 UTC (permalink / raw)
To: Patrick McHardy
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, herbert@gondor.apana.org.au,
paulmck@linux.vnet.ibm.com, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org,
netfilter-devel@vger.kernel.org
On Fri, Jan 16, 2015 at 07:35:57PM +0000, Patrick McHardy wrote:
> On 16.01, Thomas Graf wrote:
> > On 01/16/15 at 06:36pm, Patrick McHardy wrote:
> > > On 16.01, Thomas Graf wrote:
> > > > On 01/16/15 at 04:43pm, David Laight wrote:
> > > > > The walker is unlikely to see items that get inserted early in the hash
> > > > > table even without a resize.
> > > >
> > > > I don't follow, you have to explain this statement.
> > > >
> > > > Walkers which don't want to see duplicates or miss entries should
> > > > just take the mutex.
> > >
> > > Well, we do have a problem with interrupted dumps. As you know once
> > > the netlink message buffer is full, we return to userspace and
> > > continue dumping during the next read. Expanding obviously changes
> > > the order since we rehash from bucket N to N and 2N, so this will
> > > indeed cause duplicate (doesn't matter) and missed entries.
> >
> > Right,but that's a Netlink dump issue and not specific to rhashtable.
>
> Well, rhashtable (or generally resizing) will make it a lot worse.
> Usually we at worst miss entries which were added during the dump,
> which is made up by the notifications.
>
> With resizing we might miss anything, its completely undeterministic.
>
> > Putting the sequence number check in place should be sufficient
> > for sets, right?
>
> I don't see how. The problem is that the ordering of the hash changes
> and it will skip different entries than those that have already been
> dumped.
I think the generation counter should catch this sort of problem.
The resizing is triggered by adding or deleting an element, which
bumps the counter once the transaction is handled.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 19:35 ` Patrick McHardy
2015-01-16 20:46 ` Pablo Neira Ayuso
@ 2015-01-16 20:49 ` Herbert Xu
2015-01-16 21:31 ` Patrick McHardy
2015-01-16 21:36 ` Thomas Graf
2 siblings, 1 reply; 36+ messages in thread
From: Herbert Xu @ 2015-01-16 20:49 UTC (permalink / raw)
To: Patrick McHardy
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On Fri, Jan 16, 2015 at 07:35:57PM +0000, Patrick McHardy wrote:
>
> Well, rhashtable (or generally resizing) will make it a lot worse.
> Usually we at worst miss entries which were added during the dump,
> which is made up by the notifications.
>
> With resizing we might miss anything, its completely undeterministic.
Correct. If you want a stable dump you will need a data structure
outside the hash table. For example, with xfrm_state we do it with
a linked list.
Cheers,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 20:46 ` Pablo Neira Ayuso
@ 2015-01-16 20:53 ` Patrick McHardy
2015-01-19 9:01 ` Paul E. McKenney
1 sibling, 0 replies; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 20:53 UTC (permalink / raw)
To: Pablo Neira Ayuso
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, herbert@gondor.apana.org.au,
paulmck@linux.vnet.ibm.com, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org,
netfilter-devel@vger.kernel.org
On 16.01, Pablo Neira Ayuso wrote:
> On Fri, Jan 16, 2015 at 07:35:57PM +0000, Patrick McHardy wrote:
> > On 16.01, Thomas Graf wrote:
> > > On 01/16/15 at 06:36pm, Patrick McHardy wrote:
> > > >
> > > > Well, we do have a problem with interrupted dumps. As you know once
> > > > the netlink message buffer is full, we return to userspace and
> > > > continue dumping during the next read. Expanding obviously changes
> > > > the order since we rehash from bucket N to N and 2N, so this will
> > > > indeed cause duplicate (doesn't matter) and missed entries.
> > >
> > > Right,but that's a Netlink dump issue and not specific to rhashtable.
> >
> > Well, rhashtable (or generally resizing) will make it a lot worse.
> > Usually we at worst miss entries which were added during the dump,
> > which is made up by the notifications.
> >
> > With resizing we might miss anything, its completely undeterministic.
> >
> > > Putting the sequence number check in place should be sufficient
> > > for sets, right?
> >
> > I don't see how. The problem is that the ordering of the hash changes
> > and it will skip different entries than those that have already been
> > dumped.
>
> I think the generation counter should catch up this sort of problems.
> The resizing is triggered by a new/deletion element, which bumps it
> once the transaction is handled.
I don't think so. It tracks only two generations, and we can have an
arbitrary number of changes while performing a dump.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 20:49 ` Herbert Xu
@ 2015-01-16 21:31 ` Patrick McHardy
2015-01-17 0:33 ` Herbert Xu
0 siblings, 1 reply; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 21:31 UTC (permalink / raw)
To: Herbert Xu
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 17.01, Herbert Xu wrote:
> On Fri, Jan 16, 2015 at 07:35:57PM +0000, Patrick McHardy wrote:
> >
> > Well, rhashtable (or generally resizing) will make it a lot worse.
> > Usually we at worst miss entries which were added during the dump,
> > which is made up by the notifications.
> >
> > With resizing we might miss anything, its completely undeterministic.
>
> Correct. If you want to have a stable dump you will need to have
> data structure outside the hash table. For example, with xfrm_state
> we do it with a linked list.
That's certainly one possibility; however, since we might have a huge
number of elements, it's very undesirable to use even a little memory
for this.
I'm tending towards deferring resize operations while dumps are in
progress. Since we only allow dumps by root, it seems the worst
thing that can happen is that we run with a non-optimal hash,
which is comparable to having a badly structured ruleset.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 19:35 ` Patrick McHardy
2015-01-16 20:46 ` Pablo Neira Ayuso
2015-01-16 20:49 ` Herbert Xu
@ 2015-01-16 21:36 ` Thomas Graf
2015-01-16 22:07 ` Patrick McHardy
2015-01-19 9:45 ` David Laight
2 siblings, 2 replies; 36+ messages in thread
From: Thomas Graf @ 2015-01-16 21:36 UTC (permalink / raw)
To: Patrick McHardy
Cc: David Laight, davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 01/16/15 at 07:35pm, Patrick McHardy wrote:
> On 16.01, Thomas Graf wrote:
> > Right,but that's a Netlink dump issue and not specific to rhashtable.
>
> Well, rhashtable (or generally resizing) will make it a lot worse.
> Usually we at worst miss entries which were added during the dump,
> which is made up by the notifications.
>
> With resizing we might miss anything, its completely undeterministic.
>
> > Putting the sequence number check in place should be sufficient
> > for sets, right?
>
> I don't see how. The problem is that the ordering of the hash changes
> and it will skip different entries than those that have already been
> dumped.
Since a resize can only be triggered through insert/remove, you
simply bump a sequence number on every insert/remove and have
userspace restart the dump when NLM_F_DUMP_INTR is set.
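A minimal sketch of that pattern (the set_seq generation counter and
the zero message type are assumptions; nl_dump_check_consistent() is
the existing helper that sets NLM_F_DUMP_INTR when cb->seq changed
between messages):

static int example_dump_elem(struct sk_buff *skb, struct netlink_callback *cb,
			     u32 set_seq /* hypothetical generation counter */)
{
	struct nlmsghdr *nlh;

	nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
			0 /* msg type, assumed */, 0, NLM_F_MULTI);
	if (!nlh)
		return -EMSGSIZE;

	cb->seq = set_seq;			/* bumped on every insert/remove */
	nl_dump_check_consistent(cb, nlh);	/* may set NLM_F_DUMP_INTR */

	/* ... element attributes would go here ... */

	nlmsg_end(skb, nlh);
	return 0;
}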
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 21:36 ` Thomas Graf
@ 2015-01-16 22:07 ` Patrick McHardy
2015-01-16 23:34 ` Thomas Graf
2015-01-19 9:45 ` David Laight
1 sibling, 1 reply; 36+ messages in thread
From: Patrick McHardy @ 2015-01-16 22:07 UTC (permalink / raw)
To: Thomas Graf
Cc: David Laight, davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 16.01, Thomas Graf wrote:
> On 01/16/15 at 07:35pm, Patrick McHardy wrote:
> > On 16.01, Thomas Graf wrote:
> > > Right,but that's a Netlink dump issue and not specific to rhashtable.
> >
> > Well, rhashtable (or generally resizing) will make it a lot worse.
> > Usually we at worst miss entries which were added during the dump,
> > which is made up by the notifications.
> >
> > With resizing we might miss anything, its completely undeterministic.
> >
> > > Putting the sequence number check in place should be sufficient
> > > for sets, right?
> >
> > I don't see how. The problem is that the ordering of the hash changes
> > and it will skip different entries than those that have already been
> > dumped.
>
> Since an resize can only be triggered through insert/remove you
> simply bump a sequence number when you insert/remove and have
> userspace restart the dump when NLM_F_DUMP_INTR is set.
I'm afraid that's not good enough. The resize operation is deferred,
so even if userspace does not perform an operation after starting
the dump, the hash might change.
We can obviously work around that by incrementing a generation
counter in rhashtable. The main problem I see is that with something
very actively changing the ruleset, we might never complete a dump.
Dumps are usually rare; I think it's preferable to defer rehashing.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 22:07 ` Patrick McHardy
@ 2015-01-16 23:34 ` Thomas Graf
2015-01-17 8:02 ` Patrick McHardy
0 siblings, 1 reply; 36+ messages in thread
From: Thomas Graf @ 2015-01-16 23:34 UTC (permalink / raw)
To: Patrick McHardy
Cc: David Laight, davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 01/16/15 at 10:07pm, Patrick McHardy wrote:
> I'm afraid that's not good enough. The resize operation is deferred,
> so even if userspace does not perform an operation after starting
> the dump, the hash might change.
>
> We can obviously work around that by incrementing a generation
> counter in rhashtable. The main problem I see is that with something
> very actively changing the ruleset, we might never complete a dump.
Right. I suggest adding a function pointer to rhashtable_params
which, when set, is called on resize so that users can bump their
own sequence counter.
> Dumps are usually rare, I think its preferrable to defer rehashing.
Resize operations should be *really* rare as well unless you start
with really small hash table sizes and constantly add/remove at the
watermark.
Re-dumping on insert/remove is a different story, of course. Do you
care about missed inserts/removals for dumps? If not, we can do the
sequence number consistency checking for resizing only.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 21:31 ` Patrick McHardy
@ 2015-01-17 0:33 ` Herbert Xu
2015-01-17 8:06 ` Patrick McHardy
0 siblings, 1 reply; 36+ messages in thread
From: Herbert Xu @ 2015-01-17 0:33 UTC (permalink / raw)
To: Patrick McHardy
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On Fri, Jan 16, 2015 at 09:31:56PM +0000, Patrick McHardy wrote:
>
> I'm tending towards deferring resize operations while dumps are in
> progress. Since we only allow dumps by root, it seems the worst
> thing that can happen is that we run using a non optimal hash,
> which is comparable to having a badly structured ruleset.
BTW, the current rhashtable has a deficiency in that the insert
operation never fails. However, with delayed resizing, we must
allow insertion to fail, or there can be so many insertions that
they overwhelm the hash table or, even worse, overflow the hash
table size counter.
So in this scenario, a dump may cause insertion failures by delaying
the completion of the expansion.
Cheers,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 23:34 ` Thomas Graf
@ 2015-01-17 8:02 ` Patrick McHardy
2015-01-19 12:58 ` Thomas Graf
0 siblings, 1 reply; 36+ messages in thread
From: Patrick McHardy @ 2015-01-17 8:02 UTC (permalink / raw)
To: Thomas Graf
Cc: David Laight, davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 16.01, Thomas Graf wrote:
> On 01/16/15 at 10:07pm, Patrick McHardy wrote:
> > I'm afraid that's not good enough. The resize operation is deferred,
> > so even if userspace does not perform an operation after starting
> > the dump, the hash might change.
> >
> > We can obviously work around that by incrementing a generation
> > counter in rhashtable. The main problem I see is that with something
> > very actively changing the ruleset, we might never complete a dump.
>
> Right, I suggest adding a function pointer to rhashtable_params,
> which when set, is called on resize so users can bump their own
> sequence counter on resize.
>
> > Dumps are usually rare, I think its preferrable to defer rehashing.
>
> Resize operations should be *really* rare as well unless you start
> with really small hash table sizes and constantly add/remove at the
> watermark.
Which are far enough from each other that this should only happen
in really unlucky cases.
> Re-dumping on insert/remove is a different story of course. Do you
> care about missed insert/removals for dumps? If not we can do the
> sequence number consistency checking for resizing only.
No, that has always been nondeterministic with netlink. We want to
dump everything that was present when the dump was started and is
still present when it finishes. Anything else can be handled using
notifications.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-17 0:33 ` Herbert Xu
@ 2015-01-17 8:06 ` Patrick McHardy
2015-01-17 9:32 ` Herbert Xu
0 siblings, 1 reply; 36+ messages in thread
From: Patrick McHardy @ 2015-01-17 8:06 UTC (permalink / raw)
To: Herbert Xu
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 17.01, Herbert Xu wrote:
> On Fri, Jan 16, 2015 at 09:31:56PM +0000, Patrick McHardy wrote:
> >
> > I'm tending towards deferring resize operations while dumps are in
> > progress. Since we only allow dumps by root, it seems the worst
> > thing that can happen is that we run using a non optimal hash,
> > which is comparable to having a badly structured ruleset.
>
> BTW, the current rhashtable has a deficiency in that the insert
> operation never fails. However, with delayed resizing, we must
> allow insertion to fail or there can be too many insertions that
> may overwhelm the hash table or even worse overflow the hash table
> size counter.
>
> So in this scenario, a dump may cause insertion failures by delaying
> the completion of the expansion.
Resizing might also fail because of memory allocation problems, but
I'd argue that it's better to continue with a non-optimally sized
table and retry later than to fail completely, at least unless the
API user has explicitly requested this behaviour.
As for the element counter, yeah, it should prevent overflow. In that
case I agree that failing insertion is the easiest solution.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-17 8:06 ` Patrick McHardy
@ 2015-01-17 9:32 ` Herbert Xu
2015-01-17 9:51 ` Patrick McHardy
0 siblings, 1 reply; 36+ messages in thread
From: Herbert Xu @ 2015-01-17 9:32 UTC (permalink / raw)
To: Patrick McHardy
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On Sat, Jan 17, 2015 at 08:06:21AM +0000, Patrick McHardy wrote:
>
> Resizing might also fail because of memory allocation problems, but
> I'd argue that its better to continue with a non-optimal sized table
> and retry later than to completely fail, at least unless the API
> user has explicitly requested this behaviour.
>
> As for the element counter, yeah, it should prevent overflow. In that
> case I agree that failing insertion is the easiest solution.
Well, you have to consider the security aspect. These days root-only
is no longer an acceptable excuse, given things like namespaces.
If you don't fail insertions while the expansion is ongoing, and
assuming a dump can postpone expansions, then you can essentially
insert entries into the hash table at will, which is an easy DoS
attack.
Note that you don't have to fail the insertion right away. I
think waiting until you reach max * 2 would be fine.
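Something along these lines (names are assumptions; the point is only
the bound):

static bool example_insert_allowed(unsigned int nelems, unsigned int max_elems)
{
	/* Tolerate overshoot while a deferred expansion catches up,
	 * but refuse further inserts once we reach max * 2 so a
	 * stalled resize cannot be abused to grow the table without
	 * bound.
	 */
	return nelems < 2 * max_elems;
}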
Cheers,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-17 9:32 ` Herbert Xu
@ 2015-01-17 9:51 ` Patrick McHardy
2015-01-17 10:13 ` Herbert Xu
0 siblings, 1 reply; 36+ messages in thread
From: Patrick McHardy @ 2015-01-17 9:51 UTC (permalink / raw)
To: Herbert Xu
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 17 January 2015, 09:32:28 GMT+00:00, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>On Sat, Jan 17, 2015 at 08:06:21AM +0000, Patrick McHardy wrote:
>>
>> Resizing might also fail because of memory allocation problems, but
>> I'd argue that its better to continue with a non-optimal sized table
>> and retry later than to completely fail, at least unless the API
>> user has explicitly requested this behaviour.
>>
>> As for the element counter, yeah, it should prevent overflow. In that
>> case I agree that failing insertion is the easiest solution.
>
>Well you have to consider the security aspect. These days root-
>only is no longer an acceptable excuse given things like namespaces.
>
>If you don't fail the insertions while the expansion is ongoing,
>and assuming a dump can postpone expansions, then you can essentially
>insert entries into the hash table at will which is an easy DoS
>attack.
I agree; however, at least in the case of nftables you can easily do
the same thing by adding millions of rules.
It doesn't make things worse.
>Note that you don't have to fail the insertion right away. I
>think waiting until you reach max * 2 would be fine.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-17 9:51 ` Patrick McHardy
@ 2015-01-17 10:13 ` Herbert Xu
2015-01-17 11:56 ` Patrick McHardy
0 siblings, 1 reply; 36+ messages in thread
From: Herbert Xu @ 2015-01-17 10:13 UTC (permalink / raw)
To: Patrick McHardy
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On Sat, Jan 17, 2015 at 09:51:46AM +0000, Patrick McHardy wrote:
>
> I agree, however at least in the case of nftables you can easily do the same thing by adding millions of rules.
I think that's a problem in itself. If a single packet can kill
the CPU through millions of rules, then namespaces would be a joke.
There has to be a limit to the number of rules or the processing
has to be deferred into thread context (thus subject to scheduler
control) at some point.
> It doesn't make things worse.
So I don't think that's a valid justification for ignoring this
hash table problem.
Cheers,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-17 10:13 ` Herbert Xu
@ 2015-01-17 11:56 ` Patrick McHardy
0 siblings, 0 replies; 36+ messages in thread
From: Patrick McHardy @ 2015-01-17 11:56 UTC (permalink / raw)
To: Herbert Xu
Cc: Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 17.01, Herbert Xu wrote:
> On Sat, Jan 17, 2015 at 09:51:46AM +0000, Patrick McHardy wrote:
> >
> > I agree, however at least in the case of nftables you can easily do the same thing by adding millions of rules.
>
> I think that's a problem in itself. If a single packet can kill
> the CPU through millions of rules, then namespaces would be a joke.
> There has to be a limit to the number of rules or the processing
> has to be deferred into thread context (thus subject to scheduler
> control) at some point.
I think that's a problem that's unrelated to netfilter. It's quite easy
to configure something that will make the network stack eat up all
the CPU; consider, for instance:
bridge name bridge id STP enabled interfaces
br0 8000.625dda62a3d4 no veth0
veth1
Now think of bridging, veth, TC actions, iptables, routing, all in
combination. Sure, single cases can be caught and it might be possible
to restrict them, but I don't believe that in the near term we will
be able to handle this properly.
And even if all loops etc. are handled, what keeps the user from
creating a million veth devices and putting them into a long
chain?
This needs to be fixed on a different level in my opinion.
> > It doesn't make things worse.
>
> So I don't think that's a valid justification for ignoring this
> hash table problem.
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 20:46 ` Pablo Neira Ayuso
2015-01-16 20:53 ` Patrick McHardy
@ 2015-01-19 9:01 ` Paul E. McKenney
2015-01-21 5:23 ` Herbert Xu
1 sibling, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2015-01-19 9:01 UTC (permalink / raw)
To: Pablo Neira Ayuso
Cc: Patrick McHardy, Thomas Graf, David Laight, davem@davemloft.net,
netdev@vger.kernel.org, herbert@gondor.apana.org.au,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On Fri, Jan 16, 2015 at 09:46:44PM +0100, Pablo Neira Ayuso wrote:
> On Fri, Jan 16, 2015 at 07:35:57PM +0000, Patrick McHardy wrote:
> > On 16.01, Thomas Graf wrote:
> > > On 01/16/15 at 06:36pm, Patrick McHardy wrote:
> > > > On 16.01, Thomas Graf wrote:
> > > > > On 01/16/15 at 04:43pm, David Laight wrote:
> > > > > > The walker is unlikely to see items that get inserted early in the hash
> > > > > > table even without a resize.
> > > > >
> > > > > I don't follow, you have to explain this statement.
> > > > >
> > > > > Walkers which don't want to see duplicates or miss entries should
> > > > > just take the mutex.
> > > >
> > > > Well, we do have a problem with interrupted dumps. As you know once
> > > > the netlink message buffer is full, we return to userspace and
> > > > continue dumping during the next read. Expanding obviously changes
> > > > the order since we rehash from bucket N to N and 2N, so this will
> > > > indeed cause duplicate (doesn't matter) and missed entries.
> > >
> > > Right,but that's a Netlink dump issue and not specific to rhashtable.
> >
> > Well, rhashtable (or generally resizing) will make it a lot worse.
> > Usually we at worst miss entries which were added during the dump,
> > which is made up by the notifications.
> >
> > With resizing we might miss anything, its completely undeterministic.
> >
> > > Putting the sequence number check in place should be sufficient
> > > for sets, right?
> >
> > I don't see how. The problem is that the ordering of the hash changes
> > and it will skip different entries than those that have already been
> > dumped.
>
> I think the generation counter should catch up this sort of problems.
> The resizing is triggered by a new/deletion element, which bumps it
> once the transaction is handled.
One unconventional way of handling this is to associate the scan with
a one-to-one resize operation. This can be implemented to have the
effect of taking a snapshot of the table.
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* RE: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-16 21:36 ` Thomas Graf
2015-01-16 22:07 ` Patrick McHardy
@ 2015-01-19 9:45 ` David Laight
1 sibling, 0 replies; 36+ messages in thread
From: David Laight @ 2015-01-19 9:45 UTC (permalink / raw)
To: 'Thomas Graf', Patrick McHardy
Cc: davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
From: Thomas Graf
> On 01/16/15 at 07:35pm, Patrick McHardy wrote:
> > On 16.01, Thomas Graf wrote:
> > > Right,but that's a Netlink dump issue and not specific to rhashtable.
> >
> > Well, rhashtable (or generally resizing) will make it a lot worse.
> > Usually we at worst miss entries which were added during the dump,
> > which is made up by the notifications.
> >
> > With resizing we might miss anything, its completely undeterministic.
> >
> > > Putting the sequence number check in place should be sufficient
> > > for sets, right?
> >
> > I don't see how. The problem is that the ordering of the hash changes
> > and it will skip different entries than those that have already been
> > dumped.
>
> Since an resize can only be triggered through insert/remove you
> simply bump a sequence number when you insert/remove and have
> userspace restart the dump when NLM_F_DUMP_INTR is set.
You could suppress 'shrink' while a userspace dump is in progress
and force userspace to restart on 'grow'.
The suppression does rather require that you know when any userspace
'walk' gives up. OTOH it should be possible to defer the 'shrink' every
time a walk reads some more of the data.
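A sketch of the suppression (the walker counter is hypothetical;
something would have to bump it when a walk starts or reads more data
and drop it when the walk gives up):

static bool example_shrink_allowed(const atomic_t *walkers)
{
	/* Defer shrinking while any userspace dump is in progress;
	 * growing would still require the dump to restart.
	 */
	return atomic_read(walkers) == 0;
}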
David
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-17 8:02 ` Patrick McHardy
@ 2015-01-19 12:58 ` Thomas Graf
0 siblings, 0 replies; 36+ messages in thread
From: Thomas Graf @ 2015-01-19 12:58 UTC (permalink / raw)
To: Patrick McHardy
Cc: David Laight, davem@davemloft.net, netdev@vger.kernel.org,
herbert@gondor.apana.org.au, paulmck@linux.vnet.ibm.com,
edumazet@google.com, john.r.fastabend@intel.com,
josh@joshtriplett.org, netfilter-devel@vger.kernel.org
On 01/17/15 at 08:02am, Patrick McHardy wrote:
> On 16.01, Thomas Graf wrote:
> > Resize operations should be *really* rare as well unless you start
> > with really small hash table sizes and constantly add/remove at the
> > watermark.
>
> Which are far enough from each other that this should only happen
> in really unlucky cases.
>
> > Re-dumping on insert/remove is a different story of course. Do you
> > care about missed insert/removals for dumps? If not we can do the
> > sequence number consistency checking for resizing only.
>
> No, that has always been undeterministic with netlink. We want to
> dump everything that was present when the dump was started and is
> still present when it finishes. Anything else can be handled using
> notifications.
It looks like we want to provide two ways to resolve this:
1) Walker holds ht->mutex the entire time to block out resizes.
Optionally the walker can acquire all bucket locks. Such
scenarios would seem to benefit from either a single or a very
small number of bucket locks.
2) Walker holds ht->mutex during individual Netlink message
construction periods and releases it while user space reads the
message. rhashtable provides a hook which is called when a
resize operation is scheduled, allowing the walker code to
bump a sequence number and notify user space that the dump is
inconsistent, causing it to request a new dump.
I'll provide an API to achieve (2). (1) is already achievable with
the current API.
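A rough sketch of the hook for (2) (the field, the callback and the
my_set structure are assumptions, not the final API):

struct my_set {				/* hypothetical rhashtable user */
	u32 seq;			/* generation the netlink dump code checks */
	/* ... */
};

/* hypothetical addition to the table parameters */
struct example_params {
	/* ... existing rhashtable_params fields ... */
	void (*resize_notify)(void *priv);	/* called when a resize is scheduled */
	void *priv;
};

static void my_set_resize_notify(void *priv)
{
	struct my_set *set = priv;

	set->seq++;		/* forces NLM_F_DUMP_INTR on in-flight dumps */
}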
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-19 9:01 ` Paul E. McKenney
@ 2015-01-21 5:23 ` Herbert Xu
2015-01-21 5:29 ` Paul E. McKenney
0 siblings, 1 reply; 36+ messages in thread
From: Herbert Xu @ 2015-01-21 5:23 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Pablo Neira Ayuso, Patrick McHardy, Thomas Graf, David Laight,
davem@davemloft.net, netdev@vger.kernel.org, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org,
netfilter-devel@vger.kernel.org
On Mon, Jan 19, 2015 at 01:01:21AM -0800, Paul E. McKenney wrote:
>
> One unconventional way of handling this is to associate the scan with
> a one-to-one resize operation. This can be implemented to have the
> effect of taking a snapshot of the table.
The problem is that in general (not for netfilter, but certainly
other rhashtable users such as netlink) dumps/walks can be started
by ordinary users, and I don't think we can afford to create a
snapshot for each one, or can we?
Cheers,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-21 5:23 ` Herbert Xu
@ 2015-01-21 5:29 ` Paul E. McKenney
2015-01-21 5:30 ` Herbert Xu
0 siblings, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2015-01-21 5:29 UTC (permalink / raw)
To: Herbert Xu
Cc: Pablo Neira Ayuso, Patrick McHardy, Thomas Graf, David Laight,
davem@davemloft.net, netdev@vger.kernel.org, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org,
netfilter-devel@vger.kernel.org
On Wed, Jan 21, 2015 at 04:23:33PM +1100, Herbert Xu wrote:
> On Mon, Jan 19, 2015 at 01:01:21AM -0800, Paul E. McKenney wrote:
> >
> > One unconventional way of handling this is to associate the scan with
> > a one-to-one resize operation. This can be implemented to have the
> > effect of taking a snapshot of the table.
>
> The problem is that in general (not for netfilter, but certainly
> other rhashtable users such as netlink) dumps/walks can be started
> by ordinary users and I don't think we can afford creating a
> snapshot for each one, or can we?
Well, you -could- batch them up, so that a single snapshot covered
several users, and once that set was done and memory reclaimed, a second
snapshot could cover any additional users that requested dumps/walks in
the meantime. Or are users allowed to walk arbitrarily slowly through
the table?
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-21 5:29 ` Paul E. McKenney
@ 2015-01-21 5:30 ` Herbert Xu
2015-01-21 5:36 ` Paul E. McKenney
0 siblings, 1 reply; 36+ messages in thread
From: Herbert Xu @ 2015-01-21 5:30 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Pablo Neira Ayuso, Patrick McHardy, Thomas Graf, David Laight,
davem@davemloft.net, netdev@vger.kernel.org, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org,
netfilter-devel@vger.kernel.org
On Tue, Jan 20, 2015 at 09:29:07PM -0800, Paul E. McKenney wrote:
>
> Well, you -could- batch them up, so that a single snapshot covered
> several users, and once that set was done and memory reclaimed, a second
> snapshot could cover any additional users that requested dumps/walks in
> the meantime. Or are users allowed to walk arbitrarily slowly through
> the table?
Yes they can. You could CTRL-Z a dumper quite easily.
Cheers,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 36+ messages in thread
* Re: [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking
2015-01-21 5:30 ` Herbert Xu
@ 2015-01-21 5:36 ` Paul E. McKenney
0 siblings, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2015-01-21 5:36 UTC (permalink / raw)
To: Herbert Xu
Cc: Pablo Neira Ayuso, Patrick McHardy, Thomas Graf, David Laight,
davem@davemloft.net, netdev@vger.kernel.org, edumazet@google.com,
john.r.fastabend@intel.com, josh@joshtriplett.org,
netfilter-devel@vger.kernel.org
On Wed, Jan 21, 2015 at 04:30:25PM +1100, Herbert Xu wrote:
> On Tue, Jan 20, 2015 at 09:29:07PM -0800, Paul E. McKenney wrote:
> >
> > Well, you -could- batch them up, so that a single snapshot covered
> > several users, and once that set was done and memory reclaimed, a second
> > snapshot could cover any additional users that requested dumps/walks in
> > the meantime. Or are users allowed to walk arbitrarily slowly through
> > the table?
>
> Yes they can. You could CTRL-Z a dumper quite easily.
OK, never mind, then! ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 36+ messages in thread
end of thread, other threads:[~2015-01-21 5:36 UTC | newest]
Thread overview: 36+ messages
[not found] <cover.1418647641.git.tgraf@suug.ch>
2014-12-15 12:51 ` [PATCH 1/9] rhashtable: Do hashing inside of rhashtable_lookup_compare() Thomas Graf
2014-12-15 12:51 ` [PATCH 5/9] nft_hash: Remove rhashtable_remove_pprev() Thomas Graf
2014-12-15 12:51 ` [PATCH 7/9] rhashtable: Per bucket locks & deferred expansion/shrinking Thomas Graf
2015-01-16 15:34 ` Patrick McHardy
2015-01-16 15:58 ` Thomas Graf
2015-01-16 16:03 ` Patrick McHardy
2015-01-16 16:15 ` Thomas Graf
2015-01-16 16:32 ` Patrick McHardy
2015-01-16 16:38 ` Thomas Graf
2015-01-16 16:51 ` Patrick McHardy
2015-01-16 16:43 ` David Laight
2015-01-16 16:53 ` Thomas Graf
2015-01-16 18:36 ` Patrick McHardy
2015-01-16 19:18 ` Thomas Graf
2015-01-16 19:35 ` Patrick McHardy
2015-01-16 20:46 ` Pablo Neira Ayuso
2015-01-16 20:53 ` Patrick McHardy
2015-01-19 9:01 ` Paul E. McKenney
2015-01-21 5:23 ` Herbert Xu
2015-01-21 5:29 ` Paul E. McKenney
2015-01-21 5:30 ` Herbert Xu
2015-01-21 5:36 ` Paul E. McKenney
2015-01-16 20:49 ` Herbert Xu
2015-01-16 21:31 ` Patrick McHardy
2015-01-17 0:33 ` Herbert Xu
2015-01-17 8:06 ` Patrick McHardy
2015-01-17 9:32 ` Herbert Xu
2015-01-17 9:51 ` Patrick McHardy
2015-01-17 10:13 ` Herbert Xu
2015-01-17 11:56 ` Patrick McHardy
2015-01-16 21:36 ` Thomas Graf
2015-01-16 22:07 ` Patrick McHardy
2015-01-16 23:34 ` Thomas Graf
2015-01-17 8:02 ` Patrick McHardy
2015-01-19 12:58 ` Thomas Graf
2015-01-19 9:45 ` David Laight