Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 17/18] rhashtable: rename rht_for_each*continue as *from.
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

The pattern set by list.h is that for_each..continue()
iterators start at the next entry after the given one,
while for_each..next() iterators start at the given
entry.

The rht_for_each*continue() iterates are documented as though the
start at the 'next' entry, but actually start at the given entry,
and they are used expecting that behaviour.
So fix the documentation and change the names to *from for consistency
with list.h

Signed-off-by: NeilBrown <neilb@suse.com>
---
 .clang-format              |    8 ++++----
 include/linux/rhashtable.h |   38 +++++++++++++++++++-------------------
 lib/rhashtable.c           |    4 ++--
 3 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/.clang-format b/.clang-format
index faffc0d5af4e..53a5188df01a 100644
--- a/.clang-format
+++ b/.clang-format
@@ -332,14 +332,14 @@ ForEachMacros:
   - 'rhl_for_each_entry_rcu'
   - 'rhl_for_each_rcu'
   - 'rht_for_each'
-  - 'rht_for_each_continue'
+  - 'rht_for_each_from'
   - 'rht_for_each_entry'
-  - 'rht_for_each_entry_continue'
+  - 'rht_for_each_entry_from'
   - 'rht_for_each_entry_rcu'
-  - 'rht_for_each_entry_rcu_continue'
+  - 'rht_for_each_entry_rcu_from'
   - 'rht_for_each_entry_safe'
   - 'rht_for_each_rcu'
-  - 'rht_for_each_rcu_continue'
+  - 'rht_for_each_rcu_from'
   - '__rq_for_each_bio'
   - 'rq_for_each_segment'
   - 'scsi_for_each_prot_sg'
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index f58cb62dce05..e148f72d4a89 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -418,13 +418,13 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
 }
 
 /**
- * rht_for_each_continue - continue iterating over hash chain
+ * rht_for_each_from - iterate over hash chain from given head
  * @pos:	the &struct rhash_head to use as a loop cursor.
- * @head:	the previous &struct rhash_head to continue from
+ * @head:	the &struct rhash_head to start from
  * @tbl:	the &struct bucket_table
  * @hash:	the hash value / bucket index
  */
-#define rht_for_each_continue(pos, head, tbl, hash) \
+#define rht_for_each_from(pos, head, tbl, hash) \
 	for (pos = rht_dereference_bucket(head, tbl, hash); \
 	     !rht_is_a_nulls(pos); \
 	     pos = rht_dereference_bucket((pos)->next, tbl, hash))
@@ -436,18 +436,18 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each(pos, tbl, hash) \
-	rht_for_each_continue(pos, rht_ptr(*rht_bucket(tbl, hash)), tbl, hash)
+	rht_for_each_from(pos, rht_ptr(*rht_bucket(tbl, hash)), tbl, hash)
 
 /**
- * rht_for_each_entry_continue - continue iterating over hash chain
+ * rht_for_each_entry_from - iterate over hash chain from given head
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct rhash_head to use as a loop cursor.
- * @head:	the previous &struct rhash_head to continue from
+ * @head:	the &struct rhash_head to start from
  * @tbl:	the &struct bucket_table
  * @hash:	the hash value / bucket index
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
-#define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member)	\
+#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member)	\
 	for (pos = rht_dereference_bucket(head, tbl, hash);		\
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	\
 	     pos = rht_dereference_bucket((pos)->next, tbl, hash))
@@ -461,7 +461,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_continue(tpos, pos, rht_ptr(*rht_bucket(tbl, hash)), \
+	rht_for_each_entry_from(tpos, pos, rht_ptr(*rht_bucket(tbl, hash)), \
 				    tbl, hash, member)
 
 /**
@@ -487,9 +487,9 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
 		       rht_dereference_bucket(pos->next, tbl, hash) : NULL)
 
 /**
- * rht_for_each_rcu_continue - continue iterating over rcu hash chain
+ * rht_for_each_rcu_from - iterate over rcu hash chain from given head
  * @pos:	the &struct rhash_head to use as a loop cursor.
- * @head:	the previous &struct rhash_head to continue from
+ * @head:	the &struct rhash_head to start from
  * @tbl:	the &struct bucket_table
  * @hash:	the hash value / bucket index
  *
@@ -497,7 +497,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * the _rcu mutation primitives such as rhashtable_insert() as long as the
  * traversal is guarded by rcu_read_lock().
  */
-#define rht_for_each_rcu_continue(pos, head, tbl, hash)			\
+#define rht_for_each_rcu_from(pos, head, tbl, hash)			\
 	for (({barrier(); }),						\
 		pos = rht_ptr(rht_dereference_bucket_rcu(head, tbl, hash)); \
 	     !rht_is_a_nulls(pos);					\
@@ -521,10 +521,10 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
 	     pos = rcu_dereference_raw(pos->next))
 
 /**
- * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain
+ * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct rhash_head to use as a loop cursor.
- * @head:	the previous &struct rhash_head to continue from
+ * @head:	the &struct rhash_head to start from
  * @tbl:	the &struct bucket_table
  * @hash:	the hash value / bucket index
  * @member:	name of the &struct rhash_head within the hashable struct.
@@ -533,7 +533,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * the _rcu mutation primitives such as rhashtable_insert() as long as the
  * traversal is guarded by rcu_read_lock().
  */
-#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \
+#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \
 	for (({barrier(); }),						    \
 	     pos = rht_dereference_bucket_rcu(head, tbl, hash);		    \
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	    \
@@ -552,7 +552,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
-	rht_for_each_entry_rcu_continue(tpos, pos,			   \
+	rht_for_each_entry_rcu_from(tpos, pos,			   \
 					rht_ptr(*rht_bucket(tbl, hash)),   \
 					tbl, hash, member)
 
@@ -609,7 +609,7 @@ static inline struct rhash_head *__rhashtable_lookup(
 	hash = rht_key_hashfn(ht, tbl, key, params);
 	head = rht_bucket(tbl, hash);
 	do {
-		rht_for_each_rcu_continue(he, *head, tbl, hash) {
+		rht_for_each_rcu_from(he, *head, tbl, hash) {
 			if (params.obj_cmpfn ?
 			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
 			    rhashtable_compare(&arg, rht_obj(ht, he)))
@@ -744,7 +744,7 @@ static inline void *__rhashtable_insert_fast(
 	}
 
 
-	rht_for_each_continue(head, rht_ptr(*headp), tbl, hash) {
+	rht_for_each_from(head, rht_ptr(*headp), tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -1027,7 +1027,7 @@ static inline int __rhashtable_remove_fast_one(
 	lock = pprev;
 	rht_lock(lock);
 
-	rht_for_each_continue(he, rht_ptr(*pprev), tbl, hash) {
+	rht_for_each_from(he, rht_ptr(*pprev), tbl, hash) {
 		struct rhlist_head *list;
 
 		list = container_of(he, struct rhlist_head, rhead);
@@ -1189,7 +1189,7 @@ static inline int __rhashtable_replace_fast(
 	lock = pprev;
 	rht_lock(lock);
 
-	rht_for_each_continue(he, rht_ptr(*pprev), tbl, hash) {
+	rht_for_each_from(he, rht_ptr(*pprev), tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
 			continue;
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 1b9820787cd5..84288142ddf6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -232,7 +232,7 @@ static int rhashtable_rehash_one(struct rhashtable *ht,
 
 	err = -ENOENT;
 
-	rht_for_each_continue(entry, rht_ptr(*pprev), old_tbl, old_hash) {
+	rht_for_each_from(entry, rht_ptr(*pprev), old_tbl, old_hash) {
 		err = 0;
 		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
 
@@ -527,7 +527,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 	int elasticity;
 
 	elasticity = RHT_ELASTICITY;
-	rht_for_each_continue(head, rht_ptr(*pprev), tbl, hash) {
+	rht_for_each_from(head, rht_ptr(*pprev), tbl, hash) {
 		struct rhlist_head *list;
 		struct rhlist_head *plist;
 

^ permalink raw reply related

* [PATCH 18/18] rhashtable: add rhashtable_walk_delay_rehash()
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

This function allows an rhashtable walk to complete
without any restarts as long as it progresses reasonably
quickly.
Any current rehash is waited for, then a counter is incremented
to stop any further rehash from happening until the 100%
occupancy mark is reached.
This particularly ensures that restarts won't happen due to
the hash table shrinking as shrinking is delayed indefinitely.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable-types.h |    3 ++
 include/linux/rhashtable.h       |   14 ++++++++--
 lib/rhashtable.c                 |   54 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index c08c3bcfb5ad..43de8d10638e 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -76,6 +76,7 @@ struct rhashtable_params {
  * @max_elems: Maximum number of elements in table
  * @p: Configuration parameters
  * @rhlist: True if this is an rhltable
+ * @rehash_delayers: number of walkers asking for rehash to be delayed.
  * @run_work: Deferred worker to expand/shrink asynchronously
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
@@ -90,6 +91,7 @@ struct rhashtable {
 	unsigned int			max_elems;
 	struct rhashtable_params	p;
 	bool				rhlist;
+	atomic_t			rehash_delayers;
 	struct work_struct		run_work;
 	struct mutex                    mutex;
 	spinlock_t			lock;
@@ -134,6 +136,7 @@ struct rhashtable_iter {
 	unsigned int skip;
 	bool p_is_unsafe;
 	bool end_of_table;
+	bool delay_rehash;
 };
 
 int rhashtable_init(struct rhashtable *ht,
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index e148f72d4a89..42b5d037ad2e 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -233,7 +233,8 @@ static inline bool __rht_grow_above_75(const struct rhashtable *ht,
 				       unsigned int nelems)
 {
 	return nelems > (tbl->size / 4 * 3) &&
-	       (!ht->p.max_size || tbl->size < ht->p.max_size);
+	       (!ht->p.max_size || tbl->size < ht->p.max_size) &&
+		atomic_read(&ht->rehash_delayers) == 0;
 }
 /**
  * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
@@ -263,7 +264,8 @@ static inline bool __rht_shrink_below_30(const struct rhashtable *ht,
 {
 	/* Shrink table beneath 30% load */
 	return nelems < (tbl->size * 3 / 10) &&
-	       tbl->size > ht->p.min_size;
+	       tbl->size > ht->p.min_size &&
+	       atomic_read(&ht->rehash_delayers) == 0;
 }
 /**
  * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
@@ -285,9 +287,14 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht,
 	return __rht_shrink_below_30(ht, tbl, nelems);
 }
 
+/**
+ * rht_grow_above_100 - returns true if nelems > table-size
+ * @ht:		hash table
+ * @tbl:	current table
+ */
 static inline bool __rht_grow_above_100(const struct rhashtable *ht,
 					const struct bucket_table *tbl,
-					unsigned int nelems)
+					const unsigned int nelems)
 {
 	return nelems > tbl->size &&
 		(!ht->p.max_size || tbl->size < ht->p.max_size);
@@ -357,6 +364,7 @@ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 void rhashtable_walk_enter(struct rhashtable *ht,
 			   struct rhashtable_iter *iter);
 void rhashtable_walk_exit(struct rhashtable_iter *iter);
+void rhashtable_walk_delay_rehash(struct rhashtable_iter *iter);
 int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU);
 
 static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 84288142ddf6..aa1e7be8fc5b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -436,8 +436,16 @@ static void rht_deferred_worker(struct work_struct *work)
 	tbl = rhashtable_last_table(ht, tbl);
 
 	nelems = check_nelems(ht);
+	if (atomic_read(&ht->rehash_delayers) &&
+	    tbl == ht->tbl &&
+	    !__rht_grow_above_100(ht, tbl, nelems))
+		goto out;
 
-	if (__rht_grow_above_75(ht, tbl, nelems)) {
+	/* Need to test both _75 and _100 and _75 always
+	 * says "no need to grow" if ->rehash_delayers
+	 */
+	if (__rht_grow_above_75(ht, tbl, nelems) ||
+	    __rht_grow_above_100(ht, tbl, nelems)) {
 		unsigned int size = roundup_pow_of_two(nelems * 4 / 3);
 
 		if (ht->p.max_size && size > ht->p.max_size)
@@ -453,6 +461,7 @@ static void rht_deferred_worker(struct work_struct *work)
 	if (!err)
 		err = rhashtable_rehash_table(ht);
 
+out:
 	mutex_unlock(&ht->mutex);
 
 	if (err)
@@ -711,6 +720,7 @@ void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
 	iter->slot = 0;
 	iter->skip = 0;
 	iter->end_of_table = 0;
+	iter->delay_rehash = false;
 
 	spin_lock(&ht->lock);
 	iter->walker.tbl =
@@ -732,9 +742,50 @@ void rhashtable_walk_exit(struct rhashtable_iter *iter)
 	if (iter->walker.tbl)
 		list_del(&iter->walker.list);
 	spin_unlock(&iter->ht->lock);
+	if (iter->delay_rehash &&
+	    atomic_dec_and_test(&iter->ht->rehash_delayers))
+		schedule_work(&iter->ht->run_work);
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
 
+/**
+ * rhashtable_walk_delay_rehash - ask for rehash to be delayed
+ * @iter:	Hash table Iterator
+ *
+ * If a rehash happens during a table walk, the walk can
+ * see duplicate entries and need to restart.
+ * If the walk calls this function immediately after
+ * rhashtable_walk_enter(), the chance of this happening is
+ * substantially reduced.  It waits for any
+ * current rehash to complete, then temporarily disables
+ * shinking and blocks and growth of the table until it
+ * reaches 100% occupancy, rather than the normal 70%.
+ *
+ * This function can be useful when walking a table to
+ * produce a debugfs file.  It should probably *not* be
+ * used when the file is access by an unprivileged process
+ * as that could conceivably result in a minor denial for service.
+ */
+void rhashtable_walk_delay_rehash(struct rhashtable_iter *iter)
+{
+	if (!iter->delay_rehash) {
+		struct rhashtable *ht = iter->ht;
+
+		atomic_inc(&ht->rehash_delayers);
+		iter->delay_rehash = true;
+		/* make sure any schedule rehash has finished */
+		flush_work(&ht->run_work);
+		mutex_lock(&ht->mutex);
+		/* Make double-sure there is only one table */
+		while (rhashtable_rehash_table(ht) == -EAGAIN)
+			;
+		mutex_unlock(&ht->mutex);
+		/* Need to swallow any -EAGAIN */
+		rhashtable_walk_start(iter);
+		rhashtable_walk_stop(iter);
+	}
+}
+
 /**
  * rhashtable_walk_start_check - Start a hash table walk
  * @iter:	Hash table iterator
@@ -1113,6 +1164,7 @@ int rhashtable_init(struct rhashtable *ht,
 			bucket_table_free(tbl);
 			return -ENOMEM;
 		}
+	atomic_set(&ht->rehash_delayers, 0);
 
 	RCU_INIT_POINTER(ht->tbl, tbl);
 

^ permalink raw reply related

* [PATCH 16/18] rhashtable: allow percpu element counter
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

When an rhashtable is receiving a high rate of updates on multiple
CPUs, the atomic update to the nelems counter can become a
measurable contention point.

We can use a per-cpu counter instead of an atomic_t.  This reduces the
contention on updates, but reduces the precision for grow/shrink
decisions.  This results in a performance trade-off between updates
and lookups.  Updates should be faster due to reduced contention,
lookups could be slower if a resize is delayed.  This trade-off will
only be appropriate on rhashtables which have a high proportion of
updates compared to lookups.

A fast read of a percpu counter can have an error of as much
as 32 times the number of CPUs.  A more precise count can be
extracted, but this takes longer.  It seems sensible to
use the fast read to schedule a rehash, and the slow read
to choose the new size of the table.  This has the drawback
that if the fast read is sufficiently higher than the slow read,
we might repeatedly schedule a rehash which never happens.
To avoid this, we record the most recently noticed difference
between the two reads and use this to adjust the estimate returned
by the fast read.  If the rehash thread determines that a rehash is
not needed, it will update the recored error so that updaters
will not schedule the thread again until there is substantially
more growth.

When the fast-read shows that the table is over 100% full,
we need to double check with a slow read at that point too,
and update the recorded error to ensure we only resize
when it is really appropriate, but don't keep performing slow
tests to see exactly when it is appropriate.

The error we keep (in pnelems_error) is 1.5 time the absolute
difference between percpu_counter_read_positive() and
percpu_counter_sum_positive().  The extra 50% reduces the frequency
with which we have to do the slow read, at a cost of allowing
the occupancy rate to grow well above 1 (but usually less than 2).

This patch includes a change to include/linux/percpu_counter.h
to make some functions accept a const pointer.

It also adds a module parameter to test_rhashtable.ko to allow
testing of percpu counters.

Signed-off-by: NeilBrown <neil@brown.name>
---
 include/linux/percpu_counter.h   |    4 +
 include/linux/rhashtable-types.h |    8 ++
 include/linux/rhashtable.h       |  126 +++++++++++++++++++++++++++++++++-----
 lib/rhashtable.c                 |   70 +++++++++++++++++----
 lib/test_rhashtable.c            |   14 ++++
 5 files changed, 189 insertions(+), 33 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 4f052496cdfd..a541a6c49a88 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -66,7 +66,7 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 	return __percpu_counter_sum(fbc);
 }
 
-static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+static inline s64 percpu_counter_read(const struct percpu_counter *fbc)
 {
 	return fbc->count;
 }
@@ -76,7 +76,7 @@ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
  * number for some counter which should never be negative.
  *
  */
-static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
+static inline s64 percpu_counter_read_positive(const struct percpu_counter *fbc)
 {
 	s64 ret = fbc->count;
 
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 39e5e1fb9b65..c08c3bcfb5ad 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -13,6 +13,7 @@
 #include <linux/compiler.h>
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
+#include <linux/percpu_counter.h>
 
 struct rhash_head {
 	struct rhash_head __rcu		*next;
@@ -49,6 +50,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
  * @automatic_shrinking: Enable automatic shrinking of tables
+ * @percpu_count: Use a per-cpu counter instead of atomic_t
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
  * @obj_cmpfn: Function to compare key with object
@@ -61,6 +63,7 @@ struct rhashtable_params {
 	unsigned int		max_size;
 	u16			min_size;
 	bool			automatic_shrinking;
+	bool			percpu_count;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
 	rht_obj_cmpfn_t		obj_cmpfn;
@@ -77,6 +80,9 @@ struct rhashtable_params {
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
  * @nelems: Number of elements in table
+ * @pnelems: Alternate per-cpu nelems counter
+ * @pnelems_error: absolute difference between approximate and actual
+ *          percpu count at last check: 1.5 * abs("read" - "sum").
  */
 struct rhashtable {
 	struct bucket_table __rcu	*tbl;
@@ -88,6 +94,8 @@ struct rhashtable {
 	struct mutex                    mutex;
 	spinlock_t			lock;
 	atomic_t			nelems;
+	struct percpu_counter		pnelems;
+	unsigned int			pnelems_error;
 };
 
 /**
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 8e4d483ddf22..f58cb62dce05 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -27,6 +27,8 @@
 #include <linux/bit_spinlock.h>
 
 #include <linux/rhashtable-types.h>
+#include <linux/percpu_counter.h>
+
 /*
  * The end of the chain is marked with a special nulls marks which has
  * the least significant bit set.
@@ -200,30 +202,95 @@ static inline unsigned int rht_head_hashfn(
 	       rht_key_hashfn(ht, tbl, ptr + params.key_offset, params);
 }
 
+/*
+ * Per-CPU element counting.
+ * During insert/remove we check the percpu element counter (if active)
+ * with percpu_counter_read_positive().  This is fast but has a possible
+ * error of as much as 32*NUM_CPUS.
+ * In the rehash thread, we determine the number of elements with
+ * percpu_counter_sum_positive().  This is slower and more precise.
+ * We chose a new table size based on this precise count.
+ * If the approximate count is more than 70% of the table size, while
+ * the precise count is below that, the above rules would cause the
+ * rehash thread to be continually scheduled, which is not ideal.
+ * So we record an error, being 1.5 times the absolute difference between
+ * the approximate and precise element counts.  When testing if we need to grow,
+ * we subtract the error from the approximate count, and test that.
+ * When testing if we need to shrink, we add the error.
+ * This ensures we don't thrash the rehash thread, at the cost of delaying
+ * resize operations.  This is the cost trade-off for using per-cpu
+ * element counting:  insert/remove will be faster by avoiding an atomic
+ * operation 31/32 of the time, but lookup might be slower as hash chains
+ * can be expected to be slightly longer on average.  If the error is
+ * so bad that hash chains grow to 16 (could happen when the table is small
+ * and lots of CPUs are adding items), that will trigger a rehash which will
+ * correct the table size update the recorded error.  Possibly the '16' should
+ * be reduce to 8 when per-cpu element counting is active.
+ */
+
+static inline bool __rht_grow_above_75(const struct rhashtable *ht,
+				       const struct bucket_table *tbl,
+				       unsigned int nelems)
+{
+	return nelems > (tbl->size / 4 * 3) &&
+	       (!ht->p.max_size || tbl->size < ht->p.max_size);
+}
 /**
  * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
  * @ht:		hash table
  * @tbl:	current table
  */
 static inline bool rht_grow_above_75(const struct rhashtable *ht,
-				     const struct bucket_table *tbl)
+				     const struct bucket_table *tbl,
+				     const struct rhashtable_params params)
 {
 	/* Expand table when exceeding 75% load */
-	return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) &&
-	       (!ht->p.max_size || tbl->size < ht->p.max_size);
+	unsigned int nelems;
+
+	if (!params.percpu_count)
+		nelems = atomic_read(&ht->nelems);
+	else {
+		nelems = percpu_counter_read_positive(&ht->pnelems);
+		if (nelems > ht->pnelems_error)
+			nelems -= ht->pnelems_error;
+	}
+	return __rht_grow_above_75(ht, tbl, nelems);
 }
 
+static inline bool __rht_shrink_below_30(const struct rhashtable *ht,
+					 const struct bucket_table *tbl,
+					 const unsigned int nelems)
+{
+	/* Shrink table beneath 30% load */
+	return nelems < (tbl->size * 3 / 10) &&
+	       tbl->size > ht->p.min_size;
+}
 /**
  * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
  * @ht:		hash table
  * @tbl:	current table
  */
 static inline bool rht_shrink_below_30(const struct rhashtable *ht,
-				       const struct bucket_table *tbl)
+				       const struct bucket_table *tbl,
+				       const struct rhashtable_params params)
 {
 	/* Shrink table beneath 30% load */
-	return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) &&
-	       tbl->size > ht->p.min_size;
+	unsigned int nelems;
+
+	if (params.percpu_count)
+		nelems = percpu_counter_read_positive(&ht->pnelems) +
+			ht->pnelems_error;
+	else
+		nelems = atomic_read(&ht->nelems);
+	return __rht_shrink_below_30(ht, tbl, nelems);
+}
+
+static inline bool __rht_grow_above_100(const struct rhashtable *ht,
+					const struct bucket_table *tbl,
+					unsigned int nelems)
+{
+	return nelems > tbl->size &&
+		(!ht->p.max_size || tbl->size < ht->p.max_size);
 }
 
 /**
@@ -232,10 +299,19 @@ static inline bool rht_shrink_below_30(const struct rhashtable *ht,
  * @tbl:	current table
  */
 static inline bool rht_grow_above_100(const struct rhashtable *ht,
-				      const struct bucket_table *tbl)
+				      const struct bucket_table *tbl,
+				      const struct rhashtable_params params)
 {
-	return atomic_read(&ht->nelems) > tbl->size &&
-		(!ht->p.max_size || tbl->size < ht->p.max_size);
+	unsigned int nelems;
+
+	if (!params.percpu_count)
+		nelems = atomic_read(&ht->nelems);
+	else {
+		nelems = percpu_counter_read_positive(&ht->pnelems);
+		if (nelems > ht->pnelems_error)
+			nelems -= ht->pnelems_error;
+	}
+	return __rht_grow_above_100(ht, tbl, nelems);
 }
 
 /**
@@ -244,9 +320,19 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht,
  * @tbl:	current table
  */
 static inline bool rht_grow_above_max(const struct rhashtable *ht,
-				      const struct bucket_table *tbl)
+				      const struct bucket_table *tbl,
+				      const struct rhashtable_params params)
 {
-	return atomic_read(&ht->nelems) >= ht->max_elems;
+	unsigned int nelems;
+
+	if (!params.percpu_count)
+		nelems = atomic_read(&ht->nelems);
+	else {
+		nelems = percpu_counter_read_positive(&ht->pnelems);
+		if (nelems > ht->pnelems_error)
+			nelems -= ht->pnelems_error;
+	}
+	return nelems >= ht->max_elems;
 }
 
 #ifdef CONFIG_PROVE_LOCKING
@@ -699,10 +785,10 @@ static inline void *__rhashtable_insert_fast(
 		goto slow_path;
 
 	data = ERR_PTR(-E2BIG);
-	if (unlikely(rht_grow_above_max(ht, tbl)))
+	if (unlikely(rht_grow_above_max(ht, tbl, params)))
 		goto out;
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl, params)))
 		goto slow_path;
 
 	head = rht_dereference_bucket(*headp, tbl, hash);
@@ -717,8 +803,11 @@ static inline void *__rhashtable_insert_fast(
 
 	rcu_assign_pointer(*headp, obj);
 
-	atomic_inc(&ht->nelems);
-	if (rht_grow_above_75(ht, tbl))
+	if (params.percpu_count)
+		percpu_counter_inc(&ht->pnelems);
+	else
+		atomic_inc(&ht->nelems);
+	if (rht_grow_above_75(ht, tbl, params))
 		schedule_work(&ht->run_work);
 
 good:
@@ -990,9 +1079,12 @@ static inline int __rhashtable_remove_fast_one(
 	rht_unlock(lock);
 unlocked:
 	if (err > 0) {
-		atomic_dec(&ht->nelems);
+		if (params.percpu_count)
+			percpu_counter_dec(&ht->pnelems);
+		else
+			atomic_dec(&ht->nelems);
 		if (unlikely(ht->p.automatic_shrinking &&
-			     rht_shrink_below_30(ht, tbl)))
+			     rht_shrink_below_30(ht, tbl, params)))
 			schedule_work(&ht->run_work);
 		err = 0;
 	}
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 381acab7a909..1b9820787cd5 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -386,10 +386,9 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht,
  * It is valid to have concurrent insertions and deletions protected by per
  * bucket locks or concurrent RCU protected lookups and traversals.
  */
-static int rhashtable_shrink(struct rhashtable *ht)
+static int rhashtable_shrink(struct rhashtable *ht, const unsigned int nelems)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
-	unsigned int nelems = atomic_read(&ht->nelems);
 	unsigned int size = 0;
 
 	if (nelems)
@@ -406,10 +405,28 @@ static int rhashtable_shrink(struct rhashtable *ht)
 	return rhashtable_rehash_alloc(ht, old_tbl, size);
 }
 
+static int check_nelems(struct rhashtable *ht)
+{
+	unsigned int nelems;
+
+	if (ht->p.percpu_count) {
+		unsigned int approx = percpu_counter_read_positive(&ht->pnelems);
+
+		nelems = percpu_counter_sum_positive(&ht->pnelems);
+		if (approx > nelems)
+			ht->pnelems_error = (approx - nelems) * 3 / 2;
+		else
+			ht->pnelems_error = (nelems - approx) * 3 / 2;
+	} else
+		nelems = atomic_read(&ht->nelems);
+	return nelems;
+}
+
 static void rht_deferred_worker(struct work_struct *work)
 {
 	struct rhashtable *ht;
 	struct bucket_table *tbl;
+	unsigned int nelems;
 	int err = 0;
 
 	ht = container_of(work, struct rhashtable, run_work);
@@ -418,10 +435,18 @@ static void rht_deferred_worker(struct work_struct *work)
 	tbl = rht_dereference(ht->tbl, ht);
 	tbl = rhashtable_last_table(ht, tbl);
 
-	if (rht_grow_above_75(ht, tbl))
-		err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
-	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
-		err = rhashtable_shrink(ht);
+	nelems = check_nelems(ht);
+
+	if (__rht_grow_above_75(ht, tbl, nelems)) {
+		unsigned int size = roundup_pow_of_two(nelems * 4 / 3);
+
+		if (ht->p.max_size && size > ht->p.max_size)
+			size = ht->p.max_size;
+
+		err = rhashtable_rehash_alloc(ht, tbl, size);
+	} else if (ht->p.automatic_shrinking &&
+		   __rht_shrink_below_30(ht, tbl, nelems))
+		err = rhashtable_shrink(ht, nelems);
 	else if (tbl->nest)
 		err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
 
@@ -439,6 +464,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
 {
 	struct bucket_table *old_tbl;
 	struct bucket_table *new_tbl;
+	unsigned int nelems;
 	unsigned int size;
 	int err;
 
@@ -448,7 +474,12 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
 
 	err = -EBUSY;
 
-	if (rht_grow_above_75(ht, tbl))
+	/* Now is a good time to ensure ->pnelems_error is up to date,
+	 * and to use the exact count
+	 */
+	nelems = check_nelems(ht);
+
+	if (__rht_grow_above_75(ht, tbl, nelems))
 		size *= 2;
 	/* Do not schedule more than one rehash */
 	else if (old_tbl != tbl)
@@ -556,11 +587,15 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 	if (PTR_ERR(data) != -ENOENT)
 		return ERR_CAST(data);
 
-	if (unlikely(rht_grow_above_max(ht, tbl)))
+	if (unlikely(rht_grow_above_max(ht, tbl, ht->p)))
 		return ERR_PTR(-E2BIG);
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
-		return ERR_PTR(-EAGAIN);
+	if (unlikely(rht_grow_above_100(ht, tbl, ht->p))) {
+		unsigned int nelems = check_nelems(ht);
+
+		if (__rht_grow_above_100(ht, tbl, nelems))
+			return ERR_PTR(-EAGAIN);
+	}
 
 	head = rht_ptr(rht_dereference_bucket(*pprev, tbl, hash));
 	while (!ht->rhlist && !rht_is_a_nulls(head) && head < obj) {
@@ -581,8 +616,11 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		obj = (void *)(2UL | (unsigned long)obj);
 	rcu_assign_pointer(*pprev, obj);
 
-	atomic_inc(&ht->nelems);
-	if (rht_grow_above_75(ht, tbl))
+	if (ht->p.percpu_count)
+		percpu_counter_inc(&ht->pnelems);
+	else
+		atomic_inc(&ht->nelems);
+	if (rht_grow_above_75(ht, tbl, ht->p))
 		schedule_work(&ht->run_work);
 
 	return NULL;
@@ -1069,6 +1107,12 @@ int rhashtable_init(struct rhashtable *ht,
 		return -ENOMEM;
 
 	atomic_set(&ht->nelems, 0);
+	ht->pnelems_error = 0;
+	if (params->percpu_count)
+		if (percpu_counter_init(&ht->pnelems, 0, GFP_KERNEL)) {
+			bucket_table_free(tbl);
+			return -ENOMEM;
+		}
 
 	RCU_INIT_POINTER(ht->tbl, tbl);
 
@@ -1159,6 +1203,8 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 	}
 
 	bucket_table_free(tbl);
+	if (ht->p.percpu_count)
+		percpu_counter_destroy(&ht->pnelems);
 	mutex_unlock(&ht->mutex);
 }
 EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index b428a9c7522a..0902db84a37d 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -57,6 +57,10 @@ static bool enomem_retry = false;
 module_param(enomem_retry, bool, 0);
 MODULE_PARM_DESC(enomem_retry, "Retry insert even if -ENOMEM was returned (default: off)");
 
+static bool percpu_count = false;
+module_param(percpu_count, bool, 0);
+MODULE_PARM_DESC(percpu_count, "Use less-precise percpu counter for counting elements (default: off)");
+
 struct test_obj_val {
 	int	id;
 	int	tid;
@@ -180,6 +184,7 @@ static void test_bucket_stats(struct rhashtable *ht, unsigned int entries)
 	unsigned int err, total = 0, chain_len = 0;
 	struct rhashtable_iter hti;
 	struct rhash_head *pos;
+	unsigned int nelems;
 
 	err = rhashtable_walk_init(ht, &hti, GFP_KERNEL);
 	if (err) {
@@ -206,10 +211,14 @@ static void test_bucket_stats(struct rhashtable *ht, unsigned int entries)
 	rhashtable_walk_stop(&hti);
 	rhashtable_walk_exit(&hti);
 
+	if (ht->p.percpu_count)
+		nelems = percpu_counter_sum(&ht->pnelems);
+	else
+		nelems = atomic_read(&ht->nelems);
 	pr_info("  Traversal complete: counted=%u, nelems=%u, entries=%d, table-jumps=%u\n",
-		total, atomic_read(&ht->nelems), entries, chain_len);
+		total, nelems, entries, chain_len);
 
-	if (total != atomic_read(&ht->nelems) || total != entries)
+	if (total != nelems || total != entries)
 		pr_warn("Test failed: Total count mismatch ^^^");
 }
 
@@ -705,6 +714,7 @@ static int __init test_rht_init(void)
 	test_rht_params.automatic_shrinking = shrinking;
 	test_rht_params.max_size = max_size ? : roundup_pow_of_two(entries);
 	test_rht_params.nelem_hint = size;
+	test_rht_params.percpu_count = percpu_count;
 
 	objs = vzalloc((test_rht_params.max_size + 1) * sizeof(struct test_obj));
 	if (!objs)

^ permalink raw reply related

* [PATCH 15/18] rhashtable: use bit_spin_locks to protect hash bucket.
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

This patch changes rhashtables to use a bit_spin_lock (BIT(1))
the bucket pointer to lock the hash chain for that bucket.

The benefits of a bit spin_lock are:
 - no need to allocate a separate array of locks.
 - no need to have a configuration option to guide the
   choice of the size of this array
 - locking cost if often a single test-and-set in a cache line
   that will have to be loaded anyway.  When inserting at, or removing
   from, the head of the chain, the unlock is free - writing the new
   address in the bucket head implicitly clears the lock bit.
 - even when lockings costs 2 updates (lock and unlock), they are
   in a cacheline that needs to be read anyway.

The cost of using a bit spin_lock is a little bit of code complexity,
which I think is quite manageable.

Bit spin_locks are sometimes inappropriate because they are not fair -
if multiple CPUs repeatedly contend of the same lock, one CPU can
easily be starved.  This is not a credible situation with rhashtable.
Multiple CPUs may want to repeatedly add or remove objects, but they
will typically do so at different buckets, so they will attempt to
acquire different locks.

As we have more bit-locks than we previously had spinlocks (by at
least a factor of two) we can expect slightly less contention to
go with the slightly better cache behavior and reduced memory
consumption.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable-types.h |    2 
 include/linux/rhashtable.h       |  190 +++++++++++++++++++++++++-------------
 ipc/util.c                       |    1 
 lib/rhashtable.c                 |  118 ++++++++++++------------
 net/bridge/br_fdb.c              |    1 
 net/bridge/br_vlan.c             |    1 
 net/bridge/br_vlan_tunnel.c      |    1 
 net/ipv4/ipmr.c                  |    1 
 net/ipv6/ip6mr.c                 |    1 
 9 files changed, 184 insertions(+), 132 deletions(-)

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index bc3e84547ba7..39e5e1fb9b65 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -48,7 +48,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @head_offset: Offset of rhash_head in struct to be hashed
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
- * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
  * @automatic_shrinking: Enable automatic shrinking of tables
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
@@ -62,7 +61,6 @@ struct rhashtable_params {
 	unsigned int		max_size;
 	u16			min_size;
 	bool			automatic_shrinking;
-	u8			locks_mul;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
 	rht_obj_cmpfn_t		obj_cmpfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 81ca3ed2927b..8e4d483ddf22 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -24,6 +24,7 @@
 #include <linux/list_nulls.h>
 #include <linux/workqueue.h>
 #include <linux/rculist.h>
+#include <linux/bit_spinlock.h>
 
 #include <linux/rhashtable-types.h>
 /*
@@ -52,8 +53,6 @@
  * @nest: Number of bits of first-level nested table.
  * @rehash: Current bucket being rehashed
  * @hash_rnd: Random seed to fold into hash
- * @locks_mask: Mask to apply before accessing locks[]
- * @locks: Array of spinlocks protecting individual buckets
  * @walkers: List of active walkers
  * @rcu: RCU structure for freeing the table
  * @future_tbl: Table under construction during rehashing
@@ -64,8 +63,6 @@ struct bucket_table {
 	unsigned int		size;
 	unsigned int		nest;
 	u32			hash_rnd;
-	unsigned int		locks_mask;
-	spinlock_t		*locks;
 	struct list_head	walkers;
 	struct rcu_head		rcu;
 
@@ -74,6 +71,61 @@ struct bucket_table {
 	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
+/*
+ * We lock a bucket by setting BIT(1) in the pointer - this is always
+ * zero in real pointers and in the nulls marker.
+ * bit_spin_locks do not handle contention well, but the whole point
+ * of the hashtable design is to achieve minimum per-bucket contention.
+ * A nested hash table might not have a bucket pointer.  In that case
+ * we cannot get a lock.  For remove and replace the bucket cannot be
+ * interesting and doesn't need locking.
+ * For insert we allocate the bucket if this is the last bucket_table,
+ * and then take the lock.
+ * Sometimes we unlock a bucket by writing a new pointer there.  In that
+ * case we don't need to unlock, but we do need to reset state such as
+ * local_bh. For that we have rht_unlocked().  This doesn't include
+ * the memory barrier that bit_spin_unlock() provides, but rcu_assign_pointer()
+ * will have provided that.
+ */
+
+static inline void rht_lock(struct rhash_head **bucket)
+{
+	local_bh_disable();
+	bit_spin_lock(1, (unsigned long *)bucket);
+}
+
+static inline void rht_unlock(struct rhash_head **bucket)
+{
+	bit_spin_unlock(1, (unsigned long *)bucket);
+	local_bh_enable();
+}
+
+static inline void rht_unlocked(void)
+{
+	preempt_enable();
+	__release(bitlock);
+	local_bh_enable();
+}
+
+/*
+ * If 'p' is a bucket head and might be locked, rht_ptr returns
+ * the address without the lock bit.
+ */
+static inline struct rhash_head __rcu *rht_ptr(const struct rhash_head *p)
+{
+	return (void *)(((unsigned long)p) & ~2UL);
+}
+
+static inline struct rhash_head __rcu *rht_ptr_locked(const struct rhash_head *p)
+{
+	return (void *)(((unsigned long)p) | 2UL);
+}
+
+static inline bool rht_is_locked(const struct rhash_head *p)
+{
+	return rht_ptr_locked(p) == p;
+}
+
 #define	RHT_NULLS_MARKER(ptr)	\
 	((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
 #define INIT_RHT_NULLS_HEAD(ptr)	\
@@ -197,25 +249,6 @@ static inline bool rht_grow_above_max(const struct rhashtable *ht,
 	return atomic_read(&ht->nelems) >= ht->max_elems;
 }
 
-/* The bucket lock is selected based on the hash and protects mutations
- * on a group of hash buckets.
- *
- * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
- * a single lock always covers both buckets which may both contains
- * entries which link to the same bucket of the old table during resizing.
- * This allows to simplify the locking as locking the bucket in both
- * tables during resize always guarantee protection.
- *
- * IMPORTANT: When holding the bucket lock of both the old and new table
- * during expansions and shrinking, the old bucket lock must always be
- * acquired first.
- */
-static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl,
-					  unsigned int hash)
-{
-	return &tbl->locks[hash & tbl->locks_mask];
-}
-
 #ifdef CONFIG_PROVE_LOCKING
 int lockdep_rht_mutex_is_held(struct rhashtable *ht);
 int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash);
@@ -317,7 +350,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @hash:	the hash value / bucket index
  */
 #define rht_for_each(pos, tbl, hash) \
-	rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
+	rht_for_each_continue(pos, rht_ptr(*rht_bucket(tbl, hash)), tbl, hash)
 
 /**
  * rht_for_each_entry_continue - continue iterating over hash chain
@@ -342,7 +375,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * @member:	name of the &struct rhash_head within the hashable struct.
  */
 #define rht_for_each_entry(tpos, pos, tbl, hash, member)		\
-	rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash),	\
+	rht_for_each_entry_continue(tpos, pos, rht_ptr(*rht_bucket(tbl, hash)), \
 				    tbl, hash, member)
 
 /**
@@ -358,7 +391,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * remove the loop cursor from the list.
  */
 #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member)	      \
-	for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \
+	for (pos = rht_dereference_bucket(rht_ptr(*rht_bucket(tbl, hash)),    \
+					  tbl, hash),			      \
 	     next = !rht_is_a_nulls(pos) ?				      \
 		       rht_dereference_bucket(pos->next, tbl, hash) : NULL;   \
 	     (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member);	      \
@@ -379,7 +413,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  */
 #define rht_for_each_rcu_continue(pos, head, tbl, hash)			\
 	for (({barrier(); }),						\
-	     pos = rht_dereference_bucket_rcu(head, tbl, hash);		\
+		pos = rht_ptr(rht_dereference_bucket_rcu(head, tbl, hash)); \
 	     !rht_is_a_nulls(pos);					\
 	     pos = rcu_dereference_raw(pos->next))
 
@@ -394,7 +428,11 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_rcu(pos, tbl, hash)				\
-	rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash)
+	for (({barrier(); }),						\
+	     pos = rht_ptr(rht_dereference_bucket_rcu(*rht_bucket(tbl, hash), \
+						      tbl, hash));	\
+	     !rht_is_a_nulls(pos);					\
+	     pos = rcu_dereference_raw(pos->next))
 
 /**
  * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain
@@ -428,7 +466,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert(
  * traversal is guarded by rcu_read_lock().
  */
 #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member)		   \
-	rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \
+	rht_for_each_entry_rcu_continue(tpos, pos,			   \
+					rht_ptr(*rht_bucket(tbl, hash)),   \
 					tbl, hash, member)
 
 /**
@@ -592,9 +631,9 @@ static inline void *__rhashtable_insert_fast(
 	};
 	struct rhash_head __rcu **headp;
 	struct rhash_head __rcu **pprev;
+	struct rhash_head __rcu **lock;
 	struct bucket_table *tbl;
 	struct rhash_head *head;
-	spinlock_t *lock;
 	unsigned int hash;
 	int elasticity;
 	void *data;
@@ -603,24 +642,23 @@ static inline void *__rhashtable_insert_fast(
 
 	tbl = rht_dereference_rcu(ht->tbl, ht);
 	hash = rht_head_hashfn(ht, tbl, obj, params);
-	lock = rht_bucket_lock(tbl, hash);
-	spin_lock_bh(lock);
+	elasticity = RHT_ELASTICITY;
+	headp = rht_bucket_insert(ht, tbl, hash);
+	data = ERR_PTR(-ENOMEM);
+	if (!headp)
+		goto out;
+	lock = pprev = headp;
+	rht_lock(lock);
 
 	if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
 slow_path:
-		spin_unlock_bh(lock);
+		rht_unlock(lock);
 		rcu_read_unlock();
 		return rhashtable_insert_slow(ht, key, obj);
 	}
 
-	elasticity = RHT_ELASTICITY;
-	headp = rht_bucket_insert(ht, tbl, hash);
-	pprev = headp;
-	data = ERR_PTR(-ENOMEM);
-	if (!pprev)
-		goto out;
 
-	rht_for_each_continue(head, *headp, tbl, hash) {
+	rht_for_each_continue(head, rht_ptr(*headp), tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -651,6 +689,8 @@ static inline void *__rhashtable_insert_fast(
 		head = rht_dereference_bucket(head->next, tbl, hash);
 		RCU_INIT_POINTER(list->rhead.next, head);
 		rcu_assign_pointer(*pprev, obj);
+		/* This is where we inserted */
+		headp = pprev;
 
 		goto good;
 	}
@@ -667,7 +707,7 @@ static inline void *__rhashtable_insert_fast(
 
 	head = rht_dereference_bucket(*headp, tbl, hash);
 
-	RCU_INIT_POINTER(obj->next, head);
+	RCU_INIT_POINTER(obj->next, rht_ptr(head));
 	if (rhlist) {
 		struct rhlist_head *list;
 
@@ -684,8 +724,15 @@ static inline void *__rhashtable_insert_fast(
 good:
 	data = NULL;
 
+	if (headp == lock) {
+		/* Assigning to *headp unlocked the chain, so we
+		 * don't need to do it again.
+		 */
+		rht_unlocked();
+	} else {
 out:
-	spin_unlock_bh(lock);
+		rht_unlock(lock);
+	}
 	rcu_read_unlock();
 
 	return data;
@@ -697,9 +744,9 @@ static inline void *__rhashtable_insert_fast(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
  * on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
  *
  * It is safe to call this function from atomic context.
  *
@@ -726,9 +773,9 @@ static inline int rhashtable_insert_fast(
  * @list:	pointer to hash list head inside object
  * @params:	hash table parameters
  *
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
  * on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
  *
  * It is safe to call this function from atomic context.
  *
@@ -749,9 +796,9 @@ static inline int rhltable_insert_key(
  * @list:	pointer to hash list head inside object
  * @params:	hash table parameters
  *
- * Will take a per bucket spinlock to protect against mutual mutations
+ * Will take the per bucket bitlock to protect against mutual mutations
  * on the same bucket. Multiple insertions may occur in parallel unless
- * they map to the same bucket lock.
+ * they map to the same bucket.
  *
  * It is safe to call this function from atomic context.
  *
@@ -879,20 +926,19 @@ static inline int __rhashtable_remove_fast_one(
 	bool rhlist)
 {
 	struct rhash_head __rcu **pprev;
+	struct rhash_head __rcu **lock;
 	struct rhash_head *he;
-	spinlock_t * lock;
 	unsigned int hash;
 	int err = -ENOENT;
 
 	hash = rht_head_hashfn(ht, tbl, obj, params);
-	lock = rht_bucket_lock(tbl, hash);
-
-	spin_lock_bh(lock);
-
 	pprev = rht_bucket_var(tbl, hash);
 	if (!pprev)
-		goto out;
-	rht_for_each_continue(he, *pprev, tbl, hash) {
+		return -ENOENT;
+	lock = pprev;
+	rht_lock(lock);
+
+	rht_for_each_continue(he, rht_ptr(*pprev), tbl, hash) {
 		struct rhlist_head *list;
 
 		list = container_of(he, struct rhlist_head, rhead);
@@ -933,12 +979,16 @@ static inline int __rhashtable_remove_fast_one(
 		}
 
 		rcu_assign_pointer(*pprev, obj);
+		if (lock == pprev) {
+			/* That rcu_assign_pointer() unlocked the chain */
+			rht_unlocked();
+			goto unlocked;
+		}
 		break;
 	}
 
-out:
-	spin_unlock_bh(lock);
-
+	rht_unlock(lock);
+unlocked:
 	if (err > 0) {
 		atomic_dec(&ht->nelems);
 		if (unlikely(ht->p.automatic_shrinking &&
@@ -1028,8 +1078,8 @@ static inline int __rhashtable_replace_fast(
 	const struct rhashtable_params params)
 {
 	struct rhash_head __rcu **pprev;
+	struct rhash_head __rcu **lock;
 	struct rhash_head *he;
-	spinlock_t *lock;
 	unsigned int hash;
 	int err = -ENOENT;
 
@@ -1040,14 +1090,14 @@ static inline int __rhashtable_replace_fast(
 	if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
 		return -EINVAL;
 
-	lock = rht_bucket_lock(tbl, hash);
-
-	spin_lock_bh(lock);
-
 	pprev = rht_bucket_var(tbl, hash);
 	if (!pprev)
-		goto out;
-	rht_for_each_continue(he, *pprev, tbl, hash) {
+		return -ENOENT;
+
+	lock = pprev;
+	rht_lock(lock);
+
+	rht_for_each_continue(he, rht_ptr(*pprev), tbl, hash) {
 		if (he != obj_old) {
 			pprev = &he->next;
 			continue;
@@ -1056,11 +1106,17 @@ static inline int __rhashtable_replace_fast(
 		rcu_assign_pointer(obj_new->next, obj_old->next);
 		rcu_assign_pointer(*pprev, obj_new);
 		err = 0;
+		if (pprev == lock) {
+			/* We just unlocked the chain by assigning to *pprev */
+			rht_unlocked();
+			goto unlocked;
+		}
 		break;
 	}
-out:
-	spin_unlock_bh(lock);
 
+	rht_unlock(lock);
+
+unlocked:
 	return err;
 }
 
diff --git a/ipc/util.c b/ipc/util.c
index fdffff41f65b..cc78eb76df8b 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -105,7 +105,6 @@ static const struct rhashtable_params ipc_kht_params = {
 	.head_offset		= offsetof(struct kern_ipc_perm, khtnode),
 	.key_offset		= offsetof(struct kern_ipc_perm, key),
 	.key_len		= FIELD_SIZEOF(struct kern_ipc_perm, key),
-	.locks_mul		= 1,
 	.automatic_shrinking	= true,
 };
 
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index bac2493808f0..381acab7a909 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -32,7 +32,6 @@
 
 #define HASH_DEFAULT_SIZE	64UL
 #define HASH_MIN_SIZE		4U
-#define BUCKET_LOCKS_PER_CPU	32UL
 
 union nested_table {
 	union nested_table __rcu *table;
@@ -57,9 +56,11 @@ EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
 
 int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
 {
-	spinlock_t *lock = rht_bucket_lock(tbl, hash);
-
-	return (debug_locks) ? lockdep_is_held(lock) : 1;
+	if (!debug_locks)
+		return 1;
+	if (unlikely(tbl->nest))
+		return 1;
+	return bit_spin_is_locked(1, (unsigned long *)&tbl->buckets[hash]);
 }
 EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
 #else
@@ -105,7 +106,6 @@ static void bucket_table_free(const struct bucket_table *tbl)
 	if (tbl->nest)
 		nested_bucket_table_free(tbl);
 
-	free_bucket_spinlocks(tbl->locks);
 	kvfree(tbl);
 }
 
@@ -172,7 +172,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 					       gfp_t gfp)
 {
 	struct bucket_table *tbl = NULL;
-	size_t size, max_locks;
+	size_t size;
 	int i;
 
 	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
@@ -192,16 +192,6 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 
 	tbl->size = size;
 
-	max_locks = size >> 1;
-	if (tbl->nest)
-		max_locks = min_t(size_t, max_locks, 1U << tbl->nest);
-
-	if (alloc_bucket_spinlocks(&tbl->locks, &tbl->locks_mask, max_locks,
-				   ht->p.locks_mul, gfp) < 0) {
-		bucket_table_free(tbl);
-		return NULL;
-	}
-
 	INIT_LIST_HEAD(&tbl->walkers);
 
 	tbl->hash_rnd = get_random_u32();
@@ -225,25 +215,24 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
 	return new_tbl;
 }
 
-static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
+static int rhashtable_rehash_one(struct rhashtable *ht,
+				 struct rhash_head __rcu **pprev,
+				 unsigned int old_hash)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
-	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
 	struct rhash_head __rcu **inspos;
+	struct rhash_head __rcu **lock;
 	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
-	spinlock_t *new_bucket_lock;
 	unsigned int new_hash;
 
 	if (new_tbl->nest)
 		goto out;
 
 	err = -ENOENT;
-	if (!pprev)
-		goto out;
 
-	rht_for_each_continue(entry, *pprev, old_tbl, old_hash) {
+	rht_for_each_continue(entry, rht_ptr(*pprev), old_tbl, old_hash) {
 		err = 0;
 		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
 
@@ -258,11 +247,11 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 
 	new_hash = head_hashfn(ht, new_tbl, entry);
 
-	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
-
-	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
 	inspos = &new_tbl->buckets[new_hash];
-	head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+	lock = inspos;
+	rht_lock(lock);
+
+	head = rht_ptr(rht_dereference_bucket(*inspos, new_tbl, new_hash));
 	while (!rht_is_a_nulls(head) && head < entry) {
 		inspos = &head->next;
 		head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
@@ -270,7 +259,14 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	RCU_INIT_POINTER(entry->next, head);
 
 	rcu_assign_pointer(*inspos, entry);
-	spin_unlock(new_bucket_lock);
+	if (inspos != lock)
+		rht_unlock(lock);
+	else
+		rht_unlocked();
+
+	/* Need to preserved the bit lock. */
+	if (rht_is_locked(*pprev))
+		next = rht_ptr_locked(next);
 
 	rcu_assign_pointer(*pprev, next);
 
@@ -282,19 +278,19 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
 				    unsigned int old_hash)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
-	spinlock_t *old_bucket_lock;
+	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
 	int err;
 
-	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
+	if (!pprev)
+		return 0;
+	rht_lock(pprev);
 
-	spin_lock_bh(old_bucket_lock);
-	while (!(err = rhashtable_rehash_one(ht, old_hash)))
+	while (!(err = rhashtable_rehash_one(ht, pprev, old_hash)))
 		;
 
 	if (err == -ENOENT)
 		err = 0;
-
-	spin_unlock_bh(old_bucket_lock);
+	rht_unlock(pprev);
 
 	return err;
 }
@@ -487,6 +483,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
 }
 
 static void *rhashtable_lookup_one(struct rhashtable *ht,
+				   struct rhash_head __rcu **pprev,
 				   struct bucket_table *tbl, unsigned int hash,
 				   const void *key, struct rhash_head *obj)
 {
@@ -494,15 +491,12 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		.ht = ht,
 		.key = key,
 	};
-	struct rhash_head __rcu **pprev;
+	struct rhash_head **lock = pprev;
 	struct rhash_head *head;
 	int elasticity;
 
 	elasticity = RHT_ELASTICITY;
-	pprev = rht_bucket_var(tbl, hash);
-	if (!pprev)
-		return ERR_PTR(-ENOENT);
-	rht_for_each_continue(head, *pprev, tbl, hash) {
+	rht_for_each_continue(head, rht_ptr(*pprev), tbl, hash) {
 		struct rhlist_head *list;
 		struct rhlist_head *plist;
 
@@ -524,6 +518,9 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		RCU_INIT_POINTER(list->next, plist);
 		head = rht_dereference_bucket(head->next, tbl, hash);
 		RCU_INIT_POINTER(list->rhead.next, head);
+		if (pprev == lock)
+			/* Need to preserve the bit lock */
+			obj = rht_ptr_locked(obj);
 		rcu_assign_pointer(*pprev, obj);
 
 		return NULL;
@@ -536,12 +533,13 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 }
 
 static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
+						  struct rhash_head __rcu **pprev,
 						  struct bucket_table *tbl,
 						  unsigned int hash,
 						  struct rhash_head *obj,
 						  void *data)
 {
-	struct rhash_head __rcu **pprev;
+	struct rhash_head **lock = pprev;
 	struct bucket_table *new_tbl;
 	struct rhash_head *head;
 
@@ -564,11 +562,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		return ERR_PTR(-EAGAIN);
 
-	pprev = rht_bucket_insert(ht, tbl, hash);
-	if (!pprev)
-		return ERR_PTR(-ENOMEM);
-
-	head = rht_dereference_bucket(*pprev, tbl, hash);
+	head = rht_ptr(rht_dereference_bucket(*pprev, tbl, hash));
 	while (!ht->rhlist && !rht_is_a_nulls(head) && head < obj) {
 		pprev = &head->next;
 		head = rht_dereference_bucket(*pprev, tbl, hash);
@@ -582,6 +576,9 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
+	if (pprev == lock)
+		/* Need to preserve the bit lock */
+		obj = (void *)(2UL | (unsigned long)obj);
 	rcu_assign_pointer(*pprev, obj);
 
 	atomic_inc(&ht->nelems);
@@ -596,6 +593,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 {
 	struct bucket_table *new_tbl;
 	struct bucket_table *tbl;
+	struct rhash_head __rcu **pprev;
 	unsigned int hash;
 	void *data;
 
@@ -604,14 +602,25 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 	do {
 		tbl = new_tbl;
 		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		spin_lock(rht_bucket_lock(tbl, hash));
-
-		data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
-		new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
-		if (PTR_ERR(new_tbl) != -EEXIST)
-			data = ERR_CAST(new_tbl);
-
-		spin_unlock(rht_bucket_lock(tbl, hash));
+		if (rcu_access_pointer(tbl->future_tbl))
+			/* Failure is OK */
+			pprev = rht_bucket_var(tbl, hash);
+		else
+			pprev = rht_bucket_insert(ht, tbl, hash);
+		if (pprev == NULL) {
+			new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+			data = ERR_PTR(-EAGAIN);
+		} else {
+			rht_lock(pprev);
+			data = rhashtable_lookup_one(ht, pprev, tbl,
+						     hash, key, obj);
+			new_tbl = rhashtable_insert_one(ht, pprev, tbl,
+							hash, obj, data);
+			if (PTR_ERR(new_tbl) != -EEXIST)
+				data = ERR_CAST(new_tbl);
+
+			rht_unlock(pprev);
+		}
 	} while (!IS_ERR_OR_NULL(new_tbl));
 
 	if (PTR_ERR(data) == -EAGAIN)
@@ -1045,11 +1054,6 @@ int rhashtable_init(struct rhashtable *ht,
 	if (params->nelem_hint)
 		size = rounded_hashtable_size(&ht->p);
 
-	if (params->locks_mul)
-		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
-	else
-		ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
-
 	ht->key_len = ht->p.key_len;
 	if (!params->hashfn) {
 		ht->p.hashfn = jhash;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index d9e69e4514be..aa989318162f 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -33,7 +33,6 @@ static const struct rhashtable_params br_fdb_rht_params = {
 	.key_offset = offsetof(struct net_bridge_fdb_entry, key),
 	.key_len = sizeof(struct net_bridge_fdb_key),
 	.automatic_shrinking = true,
-	.locks_mul = 1,
 };
 
 static struct kmem_cache *br_fdb_cache __read_mostly;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 9896f4975353..7569c3c04585 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -21,7 +21,6 @@ static const struct rhashtable_params br_vlan_rht_params = {
 	.key_offset = offsetof(struct net_bridge_vlan, vid),
 	.key_len = sizeof(u16),
 	.nelem_hint = 3,
-	.locks_mul = 1,
 	.max_size = VLAN_N_VID,
 	.obj_cmpfn = br_vlan_cmp,
 	.automatic_shrinking = true,
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index 6d2c4eed2dc8..758151863669 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -34,7 +34,6 @@ static const struct rhashtable_params br_vlan_tunnel_rht_params = {
 	.key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
 	.key_len = sizeof(__be64),
 	.nelem_hint = 3,
-	.locks_mul = 1,
 	.obj_cmpfn = br_vlan_tunid_cmp,
 	.automatic_shrinking = true,
 };
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index adbc3d3a560b..1995520518ab 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -371,7 +371,6 @@ static const struct rhashtable_params ipmr_rht_params = {
 	.key_offset = offsetof(struct mfc_cache, cmparg),
 	.key_len = sizeof(struct mfc_cache_cmp_arg),
 	.nelem_hint = 3,
-	.locks_mul = 1,
 	.obj_cmpfn = ipmr_hash_cmp,
 	.automatic_shrinking = true,
 };
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index dfb339f79bde..f7dff3f219d3 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -340,7 +340,6 @@ static const struct rhashtable_params ip6mr_rht_params = {
 	.key_offset = offsetof(struct mfc6_cache, cmparg),
 	.key_len = sizeof(struct mfc6_cache_cmp_arg),
 	.nelem_hint = 3,
-	.locks_mul = 1,
 	.obj_cmpfn = ip6mr_hash_cmp,
 	.automatic_shrinking = true,
 };

^ permalink raw reply related

* [PATCH 13/18] rhashtable: don't hold lock on first table throughout insertion.
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

rhashtable_try_insert() currently hold a lock on the bucket in
the first table, while also locking buckets in subsequent tables.
This is unnecessary and looks like a hold-over from some earlier
version of the implementation.

As insert and remove always lock a bucket in each table in turn, and
as insert only inserts in the final table, there cannot be any races
that are not covered by simply locking a bucket in each table in turn.

When an insert call reaches that last table it can be sure that there
is no match entry in any other table as it has searched them all, and
insertion never happens anywhere but in the last table.  The fact that
code tests for the existence of future_tbl while holding a lock on
the relevant bucket ensures that two threads inserting the same key
will make compatible decisions about which is the "last" table.

This simplifies the code and allows the ->rehash field to be
discarded.

We still need a way to ensure that a dead bucket_table is never
re-linked by rhashtable_walk_stop().  This can be achieved by
calling call_rcu() inside the locked region, and checking
->rcu.func in rhashtable_walk_stop().  If it is not NULL, then
the bucket table is empty and dead.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   13 -------------
 lib/rhashtable.c           |   44 +++++++++++---------------------------------
 2 files changed, 11 insertions(+), 46 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 1b70d690ab65..5f0511bd5a39 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -63,7 +63,6 @@
 struct bucket_table {
 	unsigned int		size;
 	unsigned int		nest;
-	unsigned int		rehash;
 	u32			hash_rnd;
 	unsigned int		locks_mask;
 	spinlock_t		*locks;
@@ -774,12 +773,6 @@ static inline int rhltable_insert(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
  * This lookup function may only be used for fixed key hash table (key_len
  * parameter set). It will BUG() if used inappropriately.
  *
@@ -835,12 +828,6 @@ static inline void *rhashtable_lookup_get_insert_fast(
  * @obj:	pointer to hash head inside object
  * @params:	hash table parameters
  *
- * Locks down the bucket chain in both the old and new table if a resize
- * is in progress to ensure that writers can't remove from the old table
- * and can't insert to the new table during the atomic operation of search
- * and insertion. Searches for duplicates in both the old and new table if
- * a resize is in progress.
- *
  * Lookups may occur in parallel with hashtable mutations and resizing.
  *
  * Will trigger an automatic deferred table resizing if residency in the
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index a185f5a8b34d..919eebd6757d 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -289,10 +289,9 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
 	while (!(err = rhashtable_rehash_one(ht, old_hash)))
 		;
 
-	if (err == -ENOENT) {
-		old_tbl->rehash++;
+	if (err == -ENOENT)
 		err = 0;
-	}
+
 	spin_unlock_bh(old_bucket_lock);
 
 	return err;
@@ -339,13 +338,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	spin_lock(&ht->lock);
 	list_for_each_entry(walker, &old_tbl->walkers, list)
 		walker->tbl = NULL;
-	spin_unlock(&ht->lock);
 
 	/* Wait for readers. All new readers will see the new
 	 * table, and thus no references to the old table will
 	 * remain.
+	 * We do this inside the locked region so that
+	 * rhashtable_walk_stop() can check ->rcu.func and know
+	 * not to re-link the table.
 	 */
 	call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+	spin_unlock(&ht->lock);
 
 	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 }
@@ -591,36 +593,14 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 	struct bucket_table *new_tbl;
 	struct bucket_table *tbl;
 	unsigned int hash;
-	spinlock_t *lock;
 	void *data;
 
-	tbl = rcu_dereference(ht->tbl);
-
-	/* All insertions must grab the oldest table containing
-	 * the hashed bucket that is yet to be rehashed.
-	 */
-	for (;;) {
-		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		lock = rht_bucket_lock(tbl, hash);
-		spin_lock_bh(lock);
-
-		if (tbl->rehash <= hash)
-			break;
-
-		spin_unlock_bh(lock);
-		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
-	}
+	new_tbl = rcu_dereference(ht->tbl);
 
-	data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
-	new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
-	if (PTR_ERR(new_tbl) != -EEXIST)
-		data = ERR_CAST(new_tbl);
-
-	while (!IS_ERR_OR_NULL(new_tbl)) {
+	do {
 		tbl = new_tbl;
 		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		spin_lock_nested(rht_bucket_lock(tbl, hash),
-				 SINGLE_DEPTH_NESTING);
+		spin_lock(rht_bucket_lock(tbl, hash));
 
 		data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
 		new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
@@ -628,9 +608,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 			data = ERR_CAST(new_tbl);
 
 		spin_unlock(rht_bucket_lock(tbl, hash));
-	}
-
-	spin_unlock_bh(lock);
+	} while (!IS_ERR_OR_NULL(new_tbl));
 
 	if (PTR_ERR(data) == -EAGAIN)
 		data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
@@ -965,7 +943,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 	ht = iter->ht;
 
 	spin_lock(&ht->lock);
-	if (tbl->rehash < tbl->size)
+	if (tbl->rcu.func == NULL)
 		list_add(&iter->walker.list, &tbl->walkers);
 	else
 		iter->walker.tbl = NULL;

^ permalink raw reply related

* [PATCH 12/18] rhashtable: add rhashtable_walk_prev()
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

rhashtable_walk_prev() returns the object returned by
the previous rhashtable_walk_next(), providing it is still in the
table (or was during this grace period).
This works even if rhashtable_walk_stop() and rhashtable_talk_start()
have been called since the last rhashtable_walk_next().

If there have been no calls to rhashtable_walk_next(), or if the
object is gone from the table, then NULL is returned.

This can usefully be used in a seq_file ->start() function.
If the pos is the same as was returned by the last ->next() call,
then rhashtable_walk_prev() can be used to re-establish the
current location in the table.  If it returns NULL, then
rhashtable_walk_next() should be used.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    1 +
 lib/rhashtable.c           |   31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index c612d2446d90..1b70d690ab65 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -247,6 +247,7 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
 }
 
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
+void *rhashtable_walk_prev(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
 void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 5ab0f4825271..a185f5a8b34d 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -915,6 +915,37 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 
+/**
+ * rhashtable_walk_prev - Return the previously returned object, if available
+ * @iter:	Hash table iterator
+ *
+ * If rhashtable_walk_next() has previously been called and the object
+ * it returned is still in the hash table, that object is returned again,
+ * otherwise %NULL is returned.
+ *
+ * If the recent rhashtable_walk_next() call was since the most recent
+ * rhashtable_walk_start() call then the returned object may not, strictly
+ * speaking, still be in the table.  It will be safe to dereference.
+ *
+ * Note that the iterator is not changed and in particular it does not
+ * step backwards.
+ */
+void *rhashtable_walk_prev(struct rhashtable_iter *iter)
+{
+	struct rhashtable *ht = iter->ht;
+	struct rhash_head *p = iter->p;
+
+	if (!p)
+		return NULL;
+	if (!iter->p_is_unsafe || ht->rhlist)
+		return p;
+	rht_for_each_rcu(p, iter->walker.tbl, iter->slot)
+		if (p == iter->p)
+			return p;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rhashtable_walk_prev);
+
 /**
  * rhashtable_walk_stop - Finish a hash table walk
  * @iter:	Hash table iterator

^ permalink raw reply related

* [PATCH 10/18] rhashtable: remove rhashtable_walk_peek()
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

This function has a somewhat confused behavior that is not properly
described by the documentation.
Sometimes is returns the previous object, sometimes it returns the
next one.
Sometimes it changes the iterator, sometimes it doesn't.

This function is not currently used and is not worth keeping, so
remove it.

A future patch will introduce a new function with a
simpler interface which can meet the same need that
this was added for.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    1 -
 lib/rhashtable.c           |   34 ----------------------------------
 2 files changed, 35 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 10435a77b156..4e2fc97667e0 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -247,7 +247,6 @@ static inline void rhashtable_walk_start(struct rhashtable_iter *iter)
 }
 
 void *rhashtable_walk_next(struct rhashtable_iter *iter);
-void *rhashtable_walk_peek(struct rhashtable_iter *iter);
 void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU);
 
 void rhashtable_free_and_destroy(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 86c801d04d4a..120382b0bad0 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -891,40 +891,6 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 
-/**
- * rhashtable_walk_peek - Return the next object but don't advance the iterator
- * @iter:	Hash table iterator
- *
- * Returns the next object or NULL when the end of the table is reached.
- *
- * Returns -EAGAIN if resize event occurred.  Note that the iterator
- * will rewind back to the beginning and you may continue to use it.
- */
-void *rhashtable_walk_peek(struct rhashtable_iter *iter)
-{
-	struct rhlist_head *list = iter->list;
-	struct rhashtable *ht = iter->ht;
-	struct rhash_head *p = iter->p;
-
-	if (p)
-		return rht_obj(ht, ht->rhlist ? &list->rhead : p);
-
-	/* No object found in current iter, find next one in the table. */
-
-	if (iter->skip) {
-		/* A nonzero skip value points to the next entry in the table
-		 * beyond that last one that was found. Decrement skip so
-		 * we find the current value. __rhashtable_walk_find_next
-		 * will restore the original value of skip assuming that
-		 * the table hasn't changed.
-		 */
-		iter->skip--;
-	}
-
-	return __rhashtable_walk_find_next(iter);
-}
-EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
-
 /**
  * rhashtable_walk_stop - Finish a hash table walk
  * @iter:	Hash table iterator

^ permalink raw reply related

* [PATCH 09/18] rhashtable: use cmpxchg() in nested_table_alloc()
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

nested_table_alloc() relies on the fact that there is
at most one spinlock allocated for every slot in the top
level nested table, so it is not possible for two threads
to try to allocate the same table at the same time.

A future patch will change the locking and invalidate
this assumption.  So change the code to protect against
a race using cmpxchg() - if it loses, it frees the table
that it allocated.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 1737fbd049da..86c801d04d4a 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -132,9 +132,11 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
 			INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
 	}
 
-	rcu_assign_pointer(*prev, ntbl);
-
-	return ntbl;
+	if (cmpxchg(prev, NULL, ntbl) == NULL)
+		return ntbl;
+	/* Raced with another thread. */
+	kfree(ntbl);
+	return rcu_dereference(*prev);
 }
 
 static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,

^ permalink raw reply related

* [PATCH 08/18] rhashtable: clean up dereference of ->future_tbl.
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

Using rht_dereference_bucket() to dereference
->future_tbl looks like a type error, and could be confusing.
Using rht_dereference_rcu() to test a pointer for NULL
adds an unnecessary barrier - rcu_access_pointer() is preferred
for NULL tests when no lock is held.

This uses 3 different ways to access ->future_tbl.
- if we know the mutex is held, use rht_dereference()
- if we don't hold the mutex, and are only testing for NULL,
  use rcu_access_pointer()
- otherwise (using RCU protection for true dereference),
  use rht_dereference_rcu().

Note that this includes a simplification of the call to
rhashtable_last_table() - we don't do an extra dereference
before the call any more.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |    2 +-
 lib/rhashtable.c           |    9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 49f80506d02e..10435a77b156 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -605,7 +605,7 @@ static inline void *__rhashtable_insert_fast(
 	lock = rht_bucket_lock(tbl, hash);
 	spin_lock_bh(lock);
 
-	if (unlikely(rht_dereference_bucket(tbl->future_tbl, tbl, hash))) {
+	if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
 slow_path:
 		spin_unlock_bh(lock);
 		rcu_read_unlock();
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index b5d17bce19ff..1737fbd049da 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -226,8 +226,7 @@ static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
 static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
-	struct bucket_table *new_tbl = rhashtable_last_table(ht,
-		rht_dereference_rcu(old_tbl->future_tbl, ht));
+	struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
 	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
 	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
@@ -467,7 +466,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
 
 fail:
 	/* Do not fail the insert if someone else did a rehash. */
-	if (likely(rcu_dereference_raw(tbl->future_tbl)))
+	if (likely(rcu_access_pointer(tbl->future_tbl)))
 		return 0;
 
 	/* Schedule async rehash to retry allocation in process context. */
@@ -540,7 +539,7 @@ static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 	if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
 		return ERR_CAST(data);
 
-	new_tbl = rcu_dereference(tbl->future_tbl);
+	new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 	if (new_tbl)
 		return new_tbl;
 
@@ -599,7 +598,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 			break;
 
 		spin_unlock_bh(lock);
-		tbl = rcu_dereference(tbl->future_tbl);
+		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 	}
 
 	data = rhashtable_lookup_one(ht, tbl, hash, key, obj);

^ permalink raw reply related

* [PATCH 07/18] rhashtable: use cmpxchg() to protect ->future_tbl.
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

Rather than borrowing one of the bucket locks to
protect ->future_tbl updates, use cmpxchg().
This gives more freedom to change how bucket locking
is implemented.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index e209069f1d74..b5d17bce19ff 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -297,21 +297,14 @@ static int rhashtable_rehash_attach(struct rhashtable *ht,
 				    struct bucket_table *old_tbl,
 				    struct bucket_table *new_tbl)
 {
-	/* Protect future_tbl using the first bucket lock. */
-	spin_lock_bh(old_tbl->locks);
-
-	/* Did somebody beat us to it? */
-	if (rcu_access_pointer(old_tbl->future_tbl)) {
-		spin_unlock_bh(old_tbl->locks);
-		return -EEXIST;
-	}
-
 	/* Make insertions go into the new, empty table right away. Deletions
 	 * and lookups will be attempted in both tables until we synchronize.
+	 * As cmpxchg() provides strong barriers, we do not need
+	 * rcu_assign_pointer().
 	 */
-	rcu_assign_pointer(old_tbl->future_tbl, new_tbl);
 
-	spin_unlock_bh(old_tbl->locks);
+	if (cmpxchg(&old_tbl->future_tbl, NULL, new_tbl) != NULL)
+		return -EEXIST;
 
 	return 0;
 }

^ permalink raw reply related

* [PATCH 06/18] rhashtable: simplify nested_table_alloc() and rht_bucket_nested_insert()
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

Now that we don't use the hash value or shift in nested_table_alloc()
there is room for simplification.
We only need to pass a "is this a leaf" flag to nested_table_alloc(),
and don't need to track as much information in
rht_bucket_nested_insert().

Note there is another minor cleanup in nested_table_alloc() here.
The number of elements in a page of "union nested_tables" is most naturally

  PAGE_SIZE / sizeof(ntbl[0])

The previous code had

  PAGE_SIZE / sizeof(ntbl[0].bucket)

which happens to be the correct value only because the bucket uses all
the space in the union.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/rhashtable.c |   18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 8582e1916c2d..e209069f1d74 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -116,7 +116,7 @@ static void bucket_table_free_rcu(struct rcu_head *head)
 
 static union nested_table *nested_table_alloc(struct rhashtable *ht,
 					      union nested_table __rcu **prev,
-					      unsigned int shifted)
+					      bool leaf)
 {
 	union nested_table *ntbl;
 	int i;
@@ -127,8 +127,8 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
 
 	ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
 
-	if (ntbl && shifted) {
-		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++)
+	if (ntbl && leaf) {
+		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
 			INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
 	}
 
@@ -155,7 +155,7 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
 		return NULL;
 
 	if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
-				0)) {
+				false)) {
 		kfree(tbl);
 		return NULL;
 	}
@@ -1209,24 +1209,18 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
 	unsigned int index = hash & ((1 << tbl->nest) - 1);
 	unsigned int size = tbl->size >> tbl->nest;
 	union nested_table *ntbl;
-	unsigned int shifted;
-	unsigned int nhash;
 
 	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
 	hash >>= tbl->nest;
-	nhash = index;
-	shifted = tbl->nest;
 	ntbl = nested_table_alloc(ht, &ntbl[index].table,
-				  size <= (1 << shift) ? shifted : 0);
+				  size <= (1 << shift));
 
 	while (ntbl && size > (1 << shift)) {
 		index = hash & ((1 << shift) - 1);
 		size >>= shift;
 		hash >>= shift;
-		nhash |= index << shifted;
-		shifted += shift;
 		ntbl = nested_table_alloc(ht, &ntbl[index].table,
-					  size <= (1 << shift) ? shifted : 0);
+					  size <= (1 << shift));
 	}
 
 	if (!ntbl)

^ permalink raw reply related

* [PATCH 04/18] rhashtable: detect when object movement might have invalidated a lookup
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

Some users of rhashtable might need to change the key
of an object and move it to a different location in the table.
Other users might want to allocate objects using
SLAB_TYPESAFE_BY_RCU which can result in the same memory allocation
being used for a different (type-compatible) purpose and similarly
end up in a different hash-chain.

To support these, we store a unique NULLS_MARKER at the end of
each chain, and when a search fails to find a match, we check
if the NULLS marker found was the expected one.  If not,
the search is repeated.

The unique NULLS_MARKER is derived from the address of the
head of the chain.

If an object is removed and re-added to the same hash chain, we won't
notice by looking that the NULLS marker.  In this case we must be sure
that it was not re-added *after* its original location, or a lookup may
incorrectly fail.  The easiest solution is to ensure it is inserted at
the start of the chain.  insert_slow() already does that,
insert_fast() does not.  So this patch changes insert_fast to always
insert at the head of the chain.

Note that such a user must do their own double-checking of
the object found by rhashtable_lookup_fast() after ensuring
mutual exclusion which anything that might change the key, such as
successfully taking a new reference.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable.h |   35 +++++++++++++++++++++++------------
 lib/rhashtable.c           |    8 +++++---
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index d9f719af7936..25d839881ae5 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -75,8 +75,10 @@ struct bucket_table {
 	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
+#define	RHT_NULLS_MARKER(ptr)	\
+	((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1))
 #define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
-	((ptr) = (typeof(ptr)) NULLS_MARKER(0))
+	((ptr) = RHT_NULLS_MARKER(&(ptr)))
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
 {
@@ -471,6 +473,7 @@ static inline struct rhash_head *__rhashtable_lookup(
 		.ht = ht,
 		.key = key,
 	};
+	struct rhash_head __rcu * const *head;
 	struct bucket_table *tbl;
 	struct rhash_head *he;
 	unsigned int hash;
@@ -478,13 +481,19 @@ static inline struct rhash_head *__rhashtable_lookup(
 	tbl = rht_dereference_rcu(ht->tbl, ht);
 restart:
 	hash = rht_key_hashfn(ht, tbl, key, params);
-	rht_for_each_rcu(he, tbl, hash) {
-		if (params.obj_cmpfn ?
-		    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
-		    rhashtable_compare(&arg, rht_obj(ht, he)))
-			continue;
-		return he;
-	}
+	head = rht_bucket(tbl, hash);
+	do {
+		rht_for_each_rcu_continue(he, *head, tbl, hash) {
+			if (params.obj_cmpfn ?
+			    params.obj_cmpfn(&arg, rht_obj(ht, he)) :
+			    rhashtable_compare(&arg, rht_obj(ht, he)))
+				continue;
+			return he;
+		}
+		/* An object might have been moved to a different hash chain,
+		 * while we walk along it - better check and retry.
+		 */
+	} while (he != RHT_NULLS_MARKER(head));
 
 	/* Ensure we see any new tables. */
 	smp_rmb();
@@ -580,6 +589,7 @@ static inline void *__rhashtable_insert_fast(
 		.ht = ht,
 		.key = key,
 	};
+	struct rhash_head __rcu **headp;
 	struct rhash_head __rcu **pprev;
 	struct bucket_table *tbl;
 	struct rhash_head *head;
@@ -603,12 +613,13 @@ static inline void *__rhashtable_insert_fast(
 	}
 
 	elasticity = RHT_ELASTICITY;
-	pprev = rht_bucket_insert(ht, tbl, hash);
+	headp = rht_bucket_insert(ht, tbl, hash);
+	pprev = headp;
 	data = ERR_PTR(-ENOMEM);
 	if (!pprev)
 		goto out;
 
-	rht_for_each_continue(head, *pprev, tbl, hash) {
+	rht_for_each_continue(head, *headp, tbl, hash) {
 		struct rhlist_head *plist;
 		struct rhlist_head *list;
 
@@ -648,7 +659,7 @@ static inline void *__rhashtable_insert_fast(
 	if (unlikely(rht_grow_above_100(ht, tbl)))
 		goto slow_path;
 
-	head = rht_dereference_bucket(*pprev, tbl, hash);
+	head = rht_dereference_bucket(*headp, tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (rhlist) {
@@ -658,7 +669,7 @@ static inline void *__rhashtable_insert_fast(
 		RCU_INIT_POINTER(list->next, NULL);
 	}
 
-	rcu_assign_pointer(*pprev, obj);
+	rcu_assign_pointer(*headp, obj);
 
 	atomic_inc(&ht->nelems);
 	if (rht_grow_above_75(ht, tbl))
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 688693c919be..69f05cf9e9e8 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -1174,8 +1174,7 @@ struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
 					    unsigned int hash)
 {
 	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
-	static struct rhash_head __rcu *rhnull =
-		(struct rhash_head __rcu *)NULLS_MARKER(0);
+	static struct rhash_head __rcu *rhnull;
 	unsigned int index = hash & ((1 << tbl->nest) - 1);
 	unsigned int size = tbl->size >> tbl->nest;
 	unsigned int subhash = hash;
@@ -1193,8 +1192,11 @@ struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
 		subhash >>= shift;
 	}
 
-	if (!ntbl)
+	if (!ntbl) {
+		if (!rhnull)
+			INIT_RHT_NULLS_HEAD(rhnull, NULL, 0);
 		return &rhnull;
+	}
 
 	return &ntbl[subhash].bucket;
 

^ permalink raw reply related

* [PATCH 03/18] rhashtable: remove nulls_base and related code.
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

This "feature" is unused, undocumented, and untested and so
doesn't really belong.  Next patch will introduce support
to detect when a search gets diverted down a different chain,
which the common purpose of nulls markers.

This patch actually fixes a bug too.  The table resizing allows a
table to grow to 2^31 buckets, but the hash is truncated to 27 bits -
any growth beyond 2^27 is wasteful an ineffective.

This patch results in NULLS_MARKER(0) being used for all chains,
and leaves the use of rht_is_a_null() to test for it.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable-types.h |    2 --
 include/linux/rhashtable.h       |   33 +++------------------------------
 lib/rhashtable.c                 |    8 --------
 lib/test_rhashtable.c            |    5 +----
 4 files changed, 4 insertions(+), 44 deletions(-)

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 9740063ff13b..763d613ce2c2 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -50,7 +50,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @min_size: Minimum size while shrinking
  * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
  * @automatic_shrinking: Enable automatic shrinking of tables
- * @nulls_base: Base value to generate nulls marker
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
  * @obj_cmpfn: Function to compare key with object
@@ -64,7 +63,6 @@ struct rhashtable_params {
 	u16			min_size;
 	bool			automatic_shrinking;
 	u8			locks_mul;
-	u32			nulls_base;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
 	rht_obj_cmpfn_t		obj_cmpfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 48754ab07cdf..d9f719af7936 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -28,25 +28,8 @@
 #include <linux/rhashtable-types.h>
 /*
  * The end of the chain is marked with a special nulls marks which has
- * the following format:
- *
- * +-------+-----------------------------------------------------+-+
- * | Base  |                      Hash                           |1|
- * +-------+-----------------------------------------------------+-+
- *
- * Base (4 bits) : Reserved to distinguish between multiple tables.
- *                 Specified via &struct rhashtable_params.nulls_base.
- * Hash (27 bits): Full hash (unmasked) of first element added to bucket
- * 1 (1 bit)     : Nulls marker (always set)
- *
- * The remaining bits of the next pointer remain unused for now.
+ * the least significant bit set.
  */
-#define RHT_BASE_BITS		4
-#define RHT_HASH_BITS		27
-#define RHT_BASE_SHIFT		RHT_HASH_BITS
-
-/* Base bits plus 1 bit for nulls marker */
-#define RHT_HASH_RESERVED_SPACE	(RHT_BASE_BITS + 1)
 
 /* Maximum chain length before rehash
  *
@@ -92,24 +75,14 @@ struct bucket_table {
 	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
-static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
-{
-	return NULLS_MARKER(ht->p.nulls_base + hash);
-}
-
 #define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \
-	((ptr) = (typeof(ptr)) rht_marker(ht, hash))
+	((ptr) = (typeof(ptr)) NULLS_MARKER(0))
 
 static inline bool rht_is_a_nulls(const struct rhash_head *ptr)
 {
 	return ((unsigned long) ptr & 1);
 }
 
-static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr)
-{
-	return ((unsigned long) ptr) >> 1;
-}
-
 static inline void *rht_obj(const struct rhashtable *ht,
 			    const struct rhash_head *he)
 {
@@ -119,7 +92,7 @@ static inline void *rht_obj(const struct rhashtable *ht,
 static inline unsigned int rht_bucket_index(const struct bucket_table *tbl,
 					    unsigned int hash)
 {
-	return (hash >> RHT_HASH_RESERVED_SPACE) & (tbl->size - 1);
+	return hash & (tbl->size - 1);
 }
 
 static inline unsigned int rht_key_get_hash(struct rhashtable *ht,
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index c9fafea7dc6e..688693c919be 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -995,7 +995,6 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
  *	.key_offset = offsetof(struct test_obj, key),
  *	.key_len = sizeof(int),
  *	.hashfn = jhash,
- *	.nulls_base = (1U << RHT_BASE_SHIFT),
  * };
  *
  * Configuration Example 2: Variable length keys
@@ -1029,9 +1028,6 @@ int rhashtable_init(struct rhashtable *ht,
 	    (params->obj_hashfn && !params->obj_cmpfn))
 		return -EINVAL;
 
-	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
-		return -EINVAL;
-
 	memset(ht, 0, sizeof(*ht));
 	mutex_init(&ht->mutex);
 	spin_lock_init(&ht->lock);
@@ -1096,10 +1092,6 @@ int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
 {
 	int err;
 
-	/* No rhlist NULLs marking for now. */
-	if (params->nulls_base)
-		return -EINVAL;
-
 	err = rhashtable_init(&hlt->ht, params);
 	hlt->ht.rhlist = true;
 	return err;
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index bf92b7aa2a49..b428a9c7522a 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -83,7 +83,7 @@ static u32 my_hashfn(const void *data, u32 len, u32 seed)
 {
 	const struct test_obj_rhl *obj = data;
 
-	return (obj->value.id % 10) << RHT_HASH_RESERVED_SPACE;
+	return (obj->value.id % 10);
 }
 
 static int my_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
@@ -99,7 +99,6 @@ static struct rhashtable_params test_rht_params = {
 	.key_offset = offsetof(struct test_obj, value),
 	.key_len = sizeof(struct test_obj_val),
 	.hashfn = jhash,
-	.nulls_base = (3U << RHT_BASE_SHIFT),
 };
 
 static struct rhashtable_params test_rht_params_dup = {
@@ -294,8 +293,6 @@ static int __init test_rhltable(unsigned int entries)
 	if (!obj_in_table)
 		goto out_free;
 
-	/* nulls_base not supported in rhlist interface */
-	test_rht_params.nulls_base = 0;
 	err = rhltable_init(&rhlt, &test_rht_params);
 	if (WARN_ON(err))
 		goto out_free;

^ permalink raw reply related

* [PATCH 02/18] rhashtable: split rhashtable.h
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

Due to the use of rhashtables in net namespaces,
rhashtable.h is included in lots of the kernel,
so a small changes can required a large recompilation.
This makes development painful.

This patch splits out rhashtable-types.h which just includes
the major type declarations, and does not include (non-trivial)
inline code.  rhashtable.h is no longer included by anything
in the include/ directory.
Common include files only include rhashtable-types.h so a large
recompilation is only triggered when that changes.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 MAINTAINERS                                       |    2 
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h        |    1 
 drivers/staging/lustre/lustre/fid/fid_request.c   |    2 
 drivers/staging/lustre/lustre/fld/fld_request.c   |    1 
 drivers/staging/lustre/lustre/include/lu_object.h |    1 
 include/linux/ipc.h                               |    2 
 include/linux/ipc_namespace.h                     |    2 
 include/linux/mroute_base.h                       |    2 
 include/linux/rhashtable-types.h                  |  139 +++++++++++++++++++++
 include/linux/rhashtable.h                        |  127 -------------------
 include/net/inet_frag.h                           |    2 
 include/net/netfilter/nf_flow_table.h             |    2 
 include/net/sctp/structs.h                        |    2 
 include/net/seg6.h                                |    2 
 include/net/seg6_hmac.h                           |    2 
 ipc/msg.c                                         |    1 
 ipc/sem.c                                         |    1 
 ipc/shm.c                                         |    1 
 ipc/util.c                                        |    1 
 lib/rhashtable.c                                  |    1 
 net/ipv4/inet_fragment.c                          |    1 
 net/ipv4/ipmr.c                                   |    1 
 net/ipv4/ipmr_base.c                              |    1 
 net/ipv6/ip6mr.c                                  |    1 
 net/ipv6/seg6.c                                   |    1 
 net/ipv6/seg6_hmac.c                              |    1 
 net/netfilter/nf_tables_api.c                     |    1 
 net/sctp/input.c                                  |    1 
 net/sctp/socket.c                                 |    1 
 29 files changed, 169 insertions(+), 134 deletions(-)
 create mode 100644 include/linux/rhashtable-types.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 66985d05bced..a3a4f44d3ce1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12012,7 +12012,9 @@ M:	Herbert Xu <herbert@gondor.apana.org.au>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	lib/rhashtable.c
+F:	lib/test_rhashtable.c
 F:	include/linux/rhashtable.h
+F:	include/linux/rhashtable-types.h
 
 RICOH R5C592 MEMORYSTICK DRIVER
 M:	Maxim Levitsky <maximlevitsky@gmail.com>
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 688f95440af2..66a7a02d616a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -46,6 +46,7 @@
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 #include <linux/vmalloc.h>
+#include <linux/rhashtable.h>
 #include <linux/etherdevice.h>
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c
index c674652af03a..a9af68178eff 100644
--- a/drivers/staging/lustre/lustre/fid/fid_request.c
+++ b/drivers/staging/lustre/lustre/fid/fid_request.c
@@ -40,7 +40,7 @@
 #define DEBUG_SUBSYSTEM S_FID
 
 #include <linux/module.h>
-
+#include <linux/rhashtable.h>
 #include <obd.h>
 #include <obd_class.h>
 #include <obd_support.h>
diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c
index 7b7ba93a4db6..292ff5756243 100644
--- a/drivers/staging/lustre/lustre/fld/fld_request.c
+++ b/drivers/staging/lustre/lustre/fld/fld_request.c
@@ -40,6 +40,7 @@
 #define DEBUG_SUBSYSTEM S_FLD
 
 #include <linux/module.h>
+#include <linux/rhashtable.h>
 #include <asm/div64.h>
 
 #include <obd.h>
diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h
index 1b5284d2416c..205463c47bda 100644
--- a/drivers/staging/lustre/lustre/include/lu_object.h
+++ b/drivers/staging/lustre/lustre/include/lu_object.h
@@ -37,6 +37,7 @@
 #include <stdarg.h>
 #include <linux/percpu_counter.h>
 #include <linux/libcfs/libcfs.h>
+#include <linux/rhashtable-types.h>
 #include <uapi/linux/lustre/lustre_idl.h>
 #include <lu_ref.h>
 
diff --git a/include/linux/ipc.h b/include/linux/ipc.h
index 6cc2df7f7ac9..e1c9eea6015b 100644
--- a/include/linux/ipc.h
+++ b/include/linux/ipc.h
@@ -4,7 +4,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/uidgid.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 #include <uapi/linux/ipc.h>
 #include <linux/refcount.h>
 
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index b5630c8eb2f3..6cea726612b7 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -9,7 +9,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ns_common.h>
 #include <linux/refcount.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 
 struct user_namespace;
 
diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h
index d617fe45543e..fd673be398ff 100644
--- a/include/linux/mroute_base.h
+++ b/include/linux/mroute_base.h
@@ -2,7 +2,7 @@
 #define __LINUX_MROUTE_BASE_H
 
 #include <linux/netdevice.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 #include <linux/spinlock.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
new file mode 100644
index 000000000000..9740063ff13b
--- /dev/null
+++ b/include/linux/rhashtable-types.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Resizable, Scalable, Concurrent Hash Table
+ *
+ * Simple structures that might be needed in include
+ * files.
+ */
+
+#ifndef _LINUX_RHASHTABLE_TYPES_H
+#define _LINUX_RHASHTABLE_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+
+struct rhash_head {
+	struct rhash_head __rcu		*next;
+};
+
+struct rhlist_head {
+	struct rhash_head		rhead;
+	struct rhlist_head __rcu	*next;
+};
+
+struct bucket_table;
+
+/**
+ * struct rhashtable_compare_arg - Key for the function rhashtable_compare
+ * @ht: Hash table
+ * @key: Key to compare against
+ */
+struct rhashtable_compare_arg {
+	struct rhashtable *ht;
+	const void *key;
+};
+
+typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
+typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
+			       const void *obj);
+
+/**
+ * struct rhashtable_params - Hash table construction parameters
+ * @nelem_hint: Hint on number of elements, should be 75% of desired size
+ * @key_len: Length of key
+ * @key_offset: Offset of key in struct to be hashed
+ * @head_offset: Offset of rhash_head in struct to be hashed
+ * @max_size: Maximum size while expanding
+ * @min_size: Minimum size while shrinking
+ * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
+ * @automatic_shrinking: Enable automatic shrinking of tables
+ * @nulls_base: Base value to generate nulls marker
+ * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
+ * @obj_hashfn: Function to hash object
+ * @obj_cmpfn: Function to compare key with object
+ */
+struct rhashtable_params {
+	u16			nelem_hint;
+	u16			key_len;
+	u16			key_offset;
+	u16			head_offset;
+	unsigned int		max_size;
+	u16			min_size;
+	bool			automatic_shrinking;
+	u8			locks_mul;
+	u32			nulls_base;
+	rht_hashfn_t		hashfn;
+	rht_obj_hashfn_t	obj_hashfn;
+	rht_obj_cmpfn_t		obj_cmpfn;
+};
+
+/**
+ * struct rhashtable - Hash table handle
+ * @tbl: Bucket table
+ * @key_len: Key length for hashfn
+ * @max_elems: Maximum number of elements in table
+ * @p: Configuration parameters
+ * @rhlist: True if this is an rhltable
+ * @run_work: Deferred worker to expand/shrink asynchronously
+ * @mutex: Mutex to protect current/future table swapping
+ * @lock: Spin lock to protect walker list
+ * @nelems: Number of elements in table
+ */
+struct rhashtable {
+	struct bucket_table __rcu	*tbl;
+	unsigned int			key_len;
+	unsigned int			max_elems;
+	struct rhashtable_params	p;
+	bool				rhlist;
+	struct work_struct		run_work;
+	struct mutex                    mutex;
+	spinlock_t			lock;
+	atomic_t			nelems;
+};
+
+/**
+ * struct rhltable - Hash table with duplicate objects in a list
+ * @ht: Underlying rhtable
+ */
+struct rhltable {
+	struct rhashtable ht;
+};
+
+/**
+ * struct rhashtable_walker - Hash table walker
+ * @list: List entry on list of walkers
+ * @tbl: The table that we were walking over
+ */
+struct rhashtable_walker {
+	struct list_head list;
+	struct bucket_table *tbl;
+};
+
+/**
+ * struct rhashtable_iter - Hash table iterator
+ * @ht: Table to iterate through
+ * @p: Current pointer
+ * @list: Current hash list pointer
+ * @walker: Associated rhashtable walker
+ * @slot: Current slot
+ * @skip: Number of entries to skip in slot
+ */
+struct rhashtable_iter {
+	struct rhashtable *ht;
+	struct rhash_head *p;
+	struct rhlist_head *list;
+	struct rhashtable_walker walker;
+	unsigned int slot;
+	unsigned int skip;
+	bool end_of_table;
+};
+
+int rhashtable_init(struct rhashtable *ht,
+		    const struct rhashtable_params *params);
+int rhltable_init(struct rhltable *hlt,
+		  const struct rhashtable_params *params);
+
+#endif /* _LINUX_RHASHTABLE_TYPES_H */
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 4e1f535c2034..48754ab07cdf 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Resizable, Scalable, Concurrent Hash Table
  *
@@ -17,16 +18,14 @@
 #ifndef _LINUX_RHASHTABLE_H
 #define _LINUX_RHASHTABLE_H
 
-#include <linux/atomic.h>
-#include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/errno.h>
 #include <linux/jhash.h>
 #include <linux/list_nulls.h>
 #include <linux/workqueue.h>
-#include <linux/mutex.h>
 #include <linux/rculist.h>
 
+#include <linux/rhashtable-types.h>
 /*
  * The end of the chain is marked with a special nulls marks which has
  * the following format:
@@ -64,15 +63,6 @@
  */
 #define RHT_ELASTICITY	16u
 
-struct rhash_head {
-	struct rhash_head __rcu		*next;
-};
-
-struct rhlist_head {
-	struct rhash_head		rhead;
-	struct rhlist_head __rcu	*next;
-};
-
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
@@ -102,114 +92,6 @@ struct bucket_table {
 	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
 };
 
-/**
- * struct rhashtable_compare_arg - Key for the function rhashtable_compare
- * @ht: Hash table
- * @key: Key to compare against
- */
-struct rhashtable_compare_arg {
-	struct rhashtable *ht;
-	const void *key;
-};
-
-typedef u32 (*rht_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef u32 (*rht_obj_hashfn_t)(const void *data, u32 len, u32 seed);
-typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
-			       const void *obj);
-
-struct rhashtable;
-
-/**
- * struct rhashtable_params - Hash table construction parameters
- * @nelem_hint: Hint on number of elements, should be 75% of desired size
- * @key_len: Length of key
- * @key_offset: Offset of key in struct to be hashed
- * @head_offset: Offset of rhash_head in struct to be hashed
- * @max_size: Maximum size while expanding
- * @min_size: Minimum size while shrinking
- * @locks_mul: Number of bucket locks to allocate per cpu (default: 32)
- * @automatic_shrinking: Enable automatic shrinking of tables
- * @nulls_base: Base value to generate nulls marker
- * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
- * @obj_hashfn: Function to hash object
- * @obj_cmpfn: Function to compare key with object
- */
-struct rhashtable_params {
-	u16			nelem_hint;
-	u16			key_len;
-	u16			key_offset;
-	u16			head_offset;
-	unsigned int		max_size;
-	u16			min_size;
-	bool			automatic_shrinking;
-	u8			locks_mul;
-	u32			nulls_base;
-	rht_hashfn_t		hashfn;
-	rht_obj_hashfn_t	obj_hashfn;
-	rht_obj_cmpfn_t		obj_cmpfn;
-};
-
-/**
- * struct rhashtable - Hash table handle
- * @tbl: Bucket table
- * @key_len: Key length for hashfn
- * @max_elems: Maximum number of elements in table
- * @p: Configuration parameters
- * @rhlist: True if this is an rhltable
- * @run_work: Deferred worker to expand/shrink asynchronously
- * @mutex: Mutex to protect current/future table swapping
- * @lock: Spin lock to protect walker list
- * @nelems: Number of elements in table
- */
-struct rhashtable {
-	struct bucket_table __rcu	*tbl;
-	unsigned int			key_len;
-	unsigned int			max_elems;
-	struct rhashtable_params	p;
-	bool				rhlist;
-	struct work_struct		run_work;
-	struct mutex                    mutex;
-	spinlock_t			lock;
-	atomic_t			nelems;
-};
-
-/**
- * struct rhltable - Hash table with duplicate objects in a list
- * @ht: Underlying rhtable
- */
-struct rhltable {
-	struct rhashtable ht;
-};
-
-/**
- * struct rhashtable_walker - Hash table walker
- * @list: List entry on list of walkers
- * @tbl: The table that we were walking over
- */
-struct rhashtable_walker {
-	struct list_head list;
-	struct bucket_table *tbl;
-};
-
-/**
- * struct rhashtable_iter - Hash table iterator
- * @ht: Table to iterate through
- * @p: Current pointer
- * @list: Current hash list pointer
- * @walker: Associated rhashtable walker
- * @slot: Current slot
- * @skip: Number of entries to skip in slot
- */
-struct rhashtable_iter {
-	struct rhashtable *ht;
-	struct rhash_head *p;
-	struct rhlist_head *list;
-	struct rhashtable_walker walker;
-	unsigned int slot;
-	unsigned int skip;
-	bool end_of_table;
-};
-
 static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash)
 {
 	return NULLS_MARKER(ht->p.nulls_base + hash);
@@ -376,11 +258,6 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl,
 }
 #endif /* CONFIG_PROVE_LOCKING */
 
-int rhashtable_init(struct rhashtable *ht,
-		    const struct rhashtable_params *params);
-int rhltable_init(struct rhltable *hlt,
-		  const struct rhashtable_params *params);
-
 void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 			     struct rhash_head *obj);
 
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index ed07e3786d98..f4272a29dc44 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -2,7 +2,7 @@
 #ifndef __NET_FRAG_H__
 #define __NET_FRAG_H__
 
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 
 struct netns_frags {
 	/* sysctls */
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 833752dd0c58..3bb75491482f 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -4,7 +4,7 @@
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/netdevice.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 #include <linux/rcupdate.h>
 #include <net/dst.h>
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a0ec462bc1a9..e5ac430c8717 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -48,7 +48,7 @@
 #define __sctp_structs_h__
 
 #include <linux/ktime.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 #include <linux/socket.h>	/* linux/in.h needs this!!    */
 #include <linux/in.h>		/* We get struct sockaddr_in. */
 #include <linux/in6.h>		/* We get struct in6_addr     */
diff --git a/include/net/seg6.h b/include/net/seg6.h
index 099bad59dc90..85ee569c8306 100644
--- a/include/net/seg6.h
+++ b/include/net/seg6.h
@@ -18,7 +18,7 @@
 #include <linux/ipv6.h>
 #include <net/lwtunnel.h>
 #include <linux/seg6.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 
 static inline void update_csum_diff4(struct sk_buff *skb, __be32 from,
 				     __be32 to)
diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h
index 69c3a106056b..7fda469e2758 100644
--- a/include/net/seg6_hmac.h
+++ b/include/net/seg6_hmac.h
@@ -22,7 +22,7 @@
 #include <linux/route.h>
 #include <net/seg6.h>
 #include <linux/seg6_hmac.h>
-#include <linux/rhashtable.h>
+#include <linux/rhashtable-types.h>
 
 #define SEG6_HMAC_MAX_DIGESTSIZE	160
 #define SEG6_HMAC_RING_SIZE		256
diff --git a/ipc/msg.c b/ipc/msg.c
index 56fd1c73eedc..8fa2cac6830a 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -38,6 +38,7 @@
 #include <linux/rwsem.h>
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
+#include <linux/rhashtable.h>
 
 #include <asm/current.h>
 #include <linux/uaccess.h>
diff --git a/ipc/sem.c b/ipc/sem.c
index 06be75d9217a..f0c4a0ed91c7 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -84,6 +84,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
 #include <linux/sched/wake_q.h>
+#include <linux/rhashtable.h>
 
 #include <linux/uaccess.h>
 #include "util.h"
diff --git a/ipc/shm.c b/ipc/shm.c
index d73269381ec7..5824a7f3253e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -43,6 +43,7 @@
 #include <linux/nsproxy.h>
 #include <linux/mount.h>
 #include <linux/ipc_namespace.h>
+#include <linux/rhashtable.h>
 
 #include <linux/uaccess.h>
 
diff --git a/ipc/util.c b/ipc/util.c
index 4e81182fa0ac..fdffff41f65b 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -63,6 +63,7 @@
 #include <linux/rwsem.h>
 #include <linux/memory.h>
 #include <linux/ipc_namespace.h>
+#include <linux/rhashtable.h>
 
 #include <asm/unistd.h>
 
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 9427b5766134..c9fafea7dc6e 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -28,6 +28,7 @@
 #include <linux/rhashtable.h>
 #include <linux/err.h>
 #include <linux/export.h>
+#include <linux/rhashtable.h>
 
 #define HASH_DEFAULT_SIZE	64UL
 #define HASH_MIN_SIZE		4U
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index c9e35b81d093..316518f87294 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -20,6 +20,7 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
+#include <linux/rhashtable.h>
 
 #include <net/sock.h>
 #include <net/inet_frag.h>
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 2fb4de3f7f66..adbc3d3a560b 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -60,6 +60,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/compat.h>
 #include <linux/export.h>
+#include <linux/rhashtable.h>
 #include <net/ip_tunnels.h>
 #include <net/checksum.h>
 #include <net/netlink.h>
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 30221701614c..4f39b27b5084 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -2,6 +2,7 @@
  * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
  */
 
+#include <linux/rhashtable.h>
 #include <linux/mroute_base.h>
 
 /* Sets everything common except 'dev', since that is done under locking */
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 298fd8b6ed17..dfb339f79bde 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -32,6 +32,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/compat.h>
+#include <linux/rhashtable.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
 #include <net/raw.h>
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 7f5621d09571..e24a91c7892a 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -17,6 +17,7 @@
 #include <linux/net.h>
 #include <linux/in6.h>
 #include <linux/slab.h>
+#include <linux/rhashtable.h>
 
 #include <net/ipv6.h>
 #include <net/protocol.h>
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 33fb35cbfac1..b1791129a875 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -22,6 +22,7 @@
 #include <linux/icmpv6.h>
 #include <linux/mroute6.h>
 #include <linux/slab.h>
+#include <linux/rhashtable.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 91e80aa852d6..85077c2f3379 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -14,6 +14,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/vmalloc.h>
+#include <linux/rhashtable.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
diff --git a/net/sctp/input.c b/net/sctp/input.c
index ba8a6e6c36fa..9bbc5f92c941 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -56,6 +56,7 @@
 #include <net/sctp/sm.h>
 #include <net/sctp/checksum.h>
 #include <net/net_namespace.h>
+#include <linux/rhashtable.h>
 
 /* Forward declarations for internal helpers. */
 static int sctp_rcv_ootb(struct sk_buff *);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index ae7e7c606f72..0adce7b22675 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -66,6 +66,7 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/compat.h>
+#include <linux/rhashtable.h>
 
 #include <net/ip.h>
 #include <net/icmp.h>

^ permalink raw reply related

* [PATCH 01/18] rhashtable: silence RCU warning in rhashtable_test.
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel
In-Reply-To: <152782754287.30340.4395718227884933670.stgit@noble>

print_ht in rhashtable_test calls rht_dereference() with neither
RCU protection or the mutex.  This triggers an RCU warning.
So take the mutex to silence the warning.

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/test_rhashtable.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index f4000c137dbe..bf92b7aa2a49 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -499,6 +499,8 @@ static unsigned int __init print_ht(struct rhltable *rhlt)
 	unsigned int i, cnt = 0;

 	ht = &rhlt->ht;
+	/* Take the mutex to avoid RCU warning */
+	mutex_lock(&ht->mutex);
 	tbl = rht_dereference(ht->tbl, ht);
 	for (i = 0; i < tbl->size; i++) {
 		struct rhash_head *pos, *next;
@@ -532,6 +534,7 @@ static unsigned int __init print_ht(struct rhltable *rhlt)
 		}
 	}
 	printk(KERN_ERR "\n---- ht: ----%s\n-------------\n", buff);
+	mutex_unlock(&ht->mutex);

 	return cnt;
 }

^ permalink raw reply related

* [RFC PATCH 00/18] Assorted rhashtable improvements
From: NeilBrown @ 2018-06-01  4:44 UTC (permalink / raw)
  To: Thomas Graf, Herbert Xu; +Cc: netdev, linux-kernel

Hi,
 the following is my current set of rhashtable improvements.
 Some have been seen before, some have been improved,
 others are new.

 They include:
   working list-nulls support
   stability improvements for rhashtable_walk
   bit-spin-locks for simplicity and reduced cache footprint
      during modification
   optional per-cpu locks to improve scalability for modificiation
   various cleanups

 If I get suitable acks I will send more focused subsets to Davem for
 inclusion.

 I had said previously that I thought there was a way to provide
 stable walking of an rhl table in the face of concurrent
 insert/delete.  Having tried, I no longer think this can be
 done without substantial impact to lookups and/or other operations.
 The idea of attaching a marker to the list is incompatible with
 the normal rules for working with rcu-protected lists ("attaching"
 might be manageable.  "moving" or "removing" is the problematic part).

 The last patch is the one I'm least certain of.  It seems like a good
 idea to improve the chance of a walk avoiding any rehash, but it
 cannot provide a solid guarantee without risking a denial-of-service.
 My compromise is to guarantee no rehashes caused by shrinkage, and
 discourage rehashes caused by growth.  I'm not yet sure if that is
 sufficiently valuable, but I thought I would include the patch in the
 RFC anyway.

Thanks,
NeilBrown


---

NeilBrown (18):
      rhashtable: silence RCU warning in rhashtable_test.
      rhashtable: split rhashtable.h
      rhashtable: remove nulls_base and related code.
      rhashtable: detect when object movement might have invalidated a lookup
      rhashtable: simplify INIT_RHT_NULLS_HEAD()
      rhashtable: simplify nested_table_alloc() and rht_bucket_nested_insert()
      rhashtable: use cmpxchg() to protect ->future_tbl.
      rhashtable: clean up dereference of ->future_tbl.
      rhashtable: use cmpxchg() in nested_table_alloc()
      rhashtable: remove rhashtable_walk_peek()
      rhashtable: further improve stability of rhashtable_walk
      rhashtable: add rhashtable_walk_prev()
      rhashtable: don't hold lock on first table throughout insertion.
      rhashtable: allow rht_bucket_var to return NULL.
      rhashtable: use bit_spin_locks to protect hash bucket.
      rhashtable: allow percpu element counter
      rhashtable: rename rht_for_each*continue as *from.
      rhashtable: add rhashtable_walk_delay_rehash()


 .clang-format                                     |    8 
 MAINTAINERS                                       |    2 
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h        |    1 
 drivers/staging/lustre/lustre/fid/fid_request.c   |    2 
 drivers/staging/lustre/lustre/fld/fld_request.c   |    1 
 drivers/staging/lustre/lustre/include/lu_object.h |    1 
 include/linux/ipc.h                               |    2 
 include/linux/ipc_namespace.h                     |    2 
 include/linux/mroute_base.h                       |    2 
 include/linux/percpu_counter.h                    |    4 
 include/linux/rhashtable-types.h                  |  147 ++++++
 include/linux/rhashtable.h                        |  537 +++++++++++----------
 include/net/inet_frag.h                           |    2 
 include/net/netfilter/nf_flow_table.h             |    2 
 include/net/sctp/structs.h                        |    2 
 include/net/seg6.h                                |    2 
 include/net/seg6_hmac.h                           |    2 
 ipc/msg.c                                         |    1 
 ipc/sem.c                                         |    1 
 ipc/shm.c                                         |    1 
 ipc/util.c                                        |    2 
 lib/rhashtable.c                                  |  481 +++++++++++--------
 lib/test_rhashtable.c                             |   22 +
 net/bridge/br_fdb.c                               |    1 
 net/bridge/br_vlan.c                              |    1 
 net/bridge/br_vlan_tunnel.c                       |    1 
 net/ipv4/inet_fragment.c                          |    1 
 net/ipv4/ipmr.c                                   |    2 
 net/ipv4/ipmr_base.c                              |    1 
 net/ipv6/ip6mr.c                                  |    2 
 net/ipv6/seg6.c                                   |    1 
 net/ipv6/seg6_hmac.c                              |    1 
 net/netfilter/nf_tables_api.c                     |    1 
 net/sctp/input.c                                  |    1 
 net/sctp/socket.c                                 |    1 
 35 files changed, 760 insertions(+), 481 deletions(-)
 create mode 100644 include/linux/rhashtable-types.h

--
Signature

^ permalink raw reply

* Re: linux-next: build failure after merge of the net-next tree
From: Stephen Rothwell @ 2018-06-01  3:59 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List,
	Jakub Kicinski, Alexei Starovoitov, Daniel Borkmann
In-Reply-To: <20180531073855.29c23fce@canb.auug.org.au>

[-- Attachment #1: Type: text/plain, Size: 1995 bytes --]

Hi all,

On Thu, 31 May 2018 07:38:55 +1000 Stephen Rothwell <sfr@canb.auug.org.au> wrote:
>
> On Tue, 29 May 2018 13:25:48 +1000 Stephen Rothwell <sfr@canb.auug.org.au> wrote:
> >
> > After merging the net-next tree, today's linux-next build (x86_64
> > allmodconfig) failed like this:
> > 
> > x86_64-linux-ld: unknown architecture of input file `net/bpfilter/bpfilter_umh.o' is incompatible with i386:x86-64 output
> > 
> > Caused by commit
> > 
> >   d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
> > 
> > In my builds, the host is PowerPC 64 LE ...
> > 
> > I have reverted that commit along with
> > 
> >   61a552eb487f ("bpfilter: fix build dependency")
> >   13405468f49d ("bpfilter: don't pass O_CREAT when opening console for debug")
> > 
> > for today.  
> 
> I am still getting this failure (well, at least yesterday I did).

Still happened today.  My guess is that bpfilter_umh needs to be built
with the kernel compiler (not the host compiler - since ir is meant to
run on the some machine as the kernel, right?) but will require the
kernel architecture libc etc


I replaced the reverts above with the following:

From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 1 Jun 2018 13:33:28 +1000
Subject: [PATCH] net: bpfilter: mark as BROKEN for now

This does not build in a cross compile environment

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 net/bpfilter/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index a948b072c28f..ea4be72fdf6e 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -2,6 +2,7 @@ menuconfig BPFILTER
 	bool "BPF based packet filtering framework (BPFILTER)"
 	default n
 	depends on NET && BPF && INET
+	depends on BROKEN
 	help
 	  This builds experimental bpfilter framework that is aiming to
 	  provide netfilter compatible functionality via BPF
-- 
2.17.0

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply related

* Re: [PATCH net-next] qed: Add srq core support for RoCE and iWARP
From: kbuild test robot @ 2018-06-01  3:41 UTC (permalink / raw)
  To: Yuval Bason
  Cc: kbuild-all, yuval.bason, davem, netdev, jgg, dledford, linux-rdma,
	Michal Kalderon, Ariel Elior
In-Reply-To: <20180530131137.4653-1-yuval.bason@cavium.com>

Hi Yuval,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Yuval-Bason/qed-Add-srq-core-support-for-RoCE-and-iWARP/20180601-073407
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

   drivers/net/ethernet/qlogic/qed/qed_rdma.c:137:5: sparse: symbol 'qed_rdma_get_sb_id' was not declared. Should it be static?
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:448:32: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:448:32: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:452:36: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:452:36: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:459:27: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:459:27: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:471:19: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:471:19: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:544:30: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:709:5: sparse: symbol 'qed_rdma_stop' was not declared. Should it be static?
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:796:33: sparse: cast removes address space of expression
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:899:16: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:899:16: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:921:16: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:921:16: sparse: expression using sizeof(void)
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1063:31: sparse: incorrect type in assignment (different base types) @@    expected restricted __le16 [usertype] int_timeout @@    got unsignedrestricted __le16 [usertype] int_timeout @@
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1063:31:    expected restricted __le16 [usertype] int_timeout
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1063:31:    got unsigned short [unsigned] [usertype] int_timeout
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1165:21: sparse: incorrect type in assignment (different base types) @@    expected unsigned short [unsigned] [short] [usertype] <noident> @@    got unsigned] [short] [usertype] <noident> @@
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1165:21:    expected unsigned short [unsigned] [short] [usertype] <noident>
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1165:21:    got restricted __le16 [usertype] <noident>
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1166:21: sparse: incorrect type in assignment (different base types) @@    expected unsigned short [unsigned] [short] [usertype] <noident> @@    got unsigned] [short] [usertype] <noident> @@
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1166:21:    expected unsigned short [unsigned] [short] [usertype] <noident>
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1166:21:    got restricted __le16 [usertype] <noident>
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1167:21: sparse: incorrect type in assignment (different base types) @@    expected unsigned short [unsigned] [short] [usertype] <noident> @@    got unsigned] [short] [usertype] <noident> @@
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1167:21:    expected unsigned short [unsigned] [short] [usertype] <noident>
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1167:21:    got restricted __le16 [usertype] <noident>
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1458:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1458:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1458:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1458:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1458:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1458:9:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1462:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1462:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1462:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1462:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1462:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1462:9:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1465:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1465:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1465:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1465:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1465:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1465:9:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1470:17: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1470:17:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1470:17:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1470:17: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1470:17:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1470:17:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1474:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1474:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1474:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1474:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1474:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1474:9:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1478:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1478:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1478:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1478:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1478:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1478:9:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1482:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1482:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1482:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1482:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1482:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1482:9:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1486:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1486:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1486:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1486:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1486:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1486:9:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1490:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1490:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1490:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1490:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1490:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1490:9:    right side has type unsigned long long
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1493:9: sparse: invalid assignment: &=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1493:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1493:9:    right side has type int
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1493:9: sparse: invalid assignment: |=
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1493:9:    left side has type restricted __le16
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1493:9:    right side has type unsigned long long
>> drivers/net/ethernet/qlogic/qed/qed_rdma.c:1679:29: sparse: incorrect type in assignment (different base types) @@    expected restricted __le32 [usertype] wqe_limit @@    got restricted __le32 [usertype] wqe_limit @@
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1679:29:    expected restricted __le32 [usertype] wqe_limit
   drivers/net/ethernet/qlogic/qed/qed_rdma.c:1679:29:    got restricted __le16 [usertype] <noident>
>> drivers/net/ethernet/qlogic/qed/qed_rdma.c:1655:5: sparse: symbol 'qed_rdma_modify_srq' was not declared. Should it be static?
>> drivers/net/ethernet/qlogic/qed/qed_rdma.c:1691:5: sparse: symbol 'qed_rdma_destroy_srq' was not declared. Should it be static?
>> drivers/net/ethernet/qlogic/qed/qed_rdma.c:1734:5: sparse: symbol 'qed_rdma_create_srq' was not declared. Should it be static?

Please review and possibly fold the followup patch.

vim +1679 drivers/net/ethernet/qlogic/qed/qed_rdma.c

  1425	
  1426	static int
  1427	qed_rdma_register_tid(void *rdma_cxt,
  1428			      struct qed_rdma_register_tid_in_params *params)
  1429	{
  1430		struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt;
  1431		struct rdma_register_tid_ramrod_data *p_ramrod;
  1432		struct qed_sp_init_data init_data;
  1433		struct qed_spq_entry *p_ent;
  1434		enum rdma_tid_type tid_type;
  1435		u8 fw_return_code;
  1436		int rc;
  1437	
  1438		DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", params->itid);
  1439	
  1440		/* Get SPQ entry */
  1441		memset(&init_data, 0, sizeof(init_data));
  1442		init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
  1443		init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
  1444	
  1445		rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_REGISTER_MR,
  1446					 p_hwfn->p_rdma_info->proto, &init_data);
  1447		if (rc) {
  1448			DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc);
  1449			return rc;
  1450		}
  1451	
  1452		if (p_hwfn->p_rdma_info->last_tid < params->itid)
  1453			p_hwfn->p_rdma_info->last_tid = params->itid;
  1454	
  1455		p_ramrod = &p_ent->ramrod.rdma_register_tid;
  1456	
  1457		p_ramrod->flags = 0;
  1458		SET_FIELD(p_ramrod->flags,
  1459			  RDMA_REGISTER_TID_RAMROD_DATA_TWO_LEVEL_PBL,
  1460			  params->pbl_two_level);
  1461	
  1462		SET_FIELD(p_ramrod->flags,
  1463			  RDMA_REGISTER_TID_RAMROD_DATA_ZERO_BASED, params->zbva);
  1464	
  1465		SET_FIELD(p_ramrod->flags,
  1466			  RDMA_REGISTER_TID_RAMROD_DATA_PHY_MR, params->phy_mr);
  1467	
  1468		/* Don't initialize D/C field, as it may override other bits. */
  1469		if (!(params->tid_type == QED_RDMA_TID_FMR) && !(params->dma_mr))
  1470			SET_FIELD(p_ramrod->flags,
  1471				  RDMA_REGISTER_TID_RAMROD_DATA_PAGE_SIZE_LOG,
  1472				  params->page_size_log - 12);
  1473	
  1474		SET_FIELD(p_ramrod->flags,
  1475			  RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_READ,
  1476			  params->remote_read);
  1477	
  1478		SET_FIELD(p_ramrod->flags,
  1479			  RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_WRITE,
  1480			  params->remote_write);
  1481	
  1482		SET_FIELD(p_ramrod->flags,
  1483			  RDMA_REGISTER_TID_RAMROD_DATA_REMOTE_ATOMIC,
  1484			  params->remote_atomic);
  1485	
  1486		SET_FIELD(p_ramrod->flags,
  1487			  RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_WRITE,
  1488			  params->local_write);
  1489	
> 1490		SET_FIELD(p_ramrod->flags,
  1491			  RDMA_REGISTER_TID_RAMROD_DATA_LOCAL_READ, params->local_read);
  1492	
> 1493		SET_FIELD(p_ramrod->flags,
  1494			  RDMA_REGISTER_TID_RAMROD_DATA_ENABLE_MW_BIND,
  1495			  params->mw_bind);
  1496	
  1497		SET_FIELD(p_ramrod->flags1,
  1498			  RDMA_REGISTER_TID_RAMROD_DATA_PBL_PAGE_SIZE_LOG,
  1499			  params->pbl_page_size_log - 12);
  1500	
  1501		SET_FIELD(p_ramrod->flags2,
  1502			  RDMA_REGISTER_TID_RAMROD_DATA_DMA_MR, params->dma_mr);
  1503	
  1504		switch (params->tid_type) {
  1505		case QED_RDMA_TID_REGISTERED_MR:
  1506			tid_type = RDMA_TID_REGISTERED_MR;
  1507			break;
  1508		case QED_RDMA_TID_FMR:
  1509			tid_type = RDMA_TID_FMR;
  1510			break;
  1511		case QED_RDMA_TID_MW_TYPE1:
  1512			tid_type = RDMA_TID_MW_TYPE1;
  1513			break;
  1514		case QED_RDMA_TID_MW_TYPE2A:
  1515			tid_type = RDMA_TID_MW_TYPE2A;
  1516			break;
  1517		default:
  1518			rc = -EINVAL;
  1519			DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc);
  1520			return rc;
  1521		}
  1522		SET_FIELD(p_ramrod->flags1,
  1523			  RDMA_REGISTER_TID_RAMROD_DATA_TID_TYPE, tid_type);
  1524	
  1525		p_ramrod->itid = cpu_to_le32(params->itid);
  1526		p_ramrod->key = params->key;
  1527		p_ramrod->pd = cpu_to_le16(params->pd);
  1528		p_ramrod->length_hi = (u8)(params->length >> 32);
  1529		p_ramrod->length_lo = DMA_LO_LE(params->length);
  1530		if (params->zbva) {
  1531			/* Lower 32 bits of the registered MR address.
  1532			 * In case of zero based MR, will hold FBO
  1533			 */
  1534			p_ramrod->va.hi = 0;
  1535			p_ramrod->va.lo = cpu_to_le32(params->fbo);
  1536		} else {
  1537			DMA_REGPAIR_LE(p_ramrod->va, params->vaddr);
  1538		}
  1539		DMA_REGPAIR_LE(p_ramrod->pbl_base, params->pbl_ptr);
  1540	
  1541		/* DIF */
  1542		if (params->dif_enabled) {
  1543			SET_FIELD(p_ramrod->flags2,
  1544				  RDMA_REGISTER_TID_RAMROD_DATA_DIF_ON_HOST_FLG, 1);
  1545			DMA_REGPAIR_LE(p_ramrod->dif_error_addr,
  1546				       params->dif_error_addr);
  1547			DMA_REGPAIR_LE(p_ramrod->dif_runt_addr, params->dif_runt_addr);
  1548		}
  1549	
  1550		rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code);
  1551		if (rc)
  1552			return rc;
  1553	
  1554		if (fw_return_code != RDMA_RETURN_OK) {
  1555			DP_NOTICE(p_hwfn, "fw_return_code = %d\n", fw_return_code);
  1556			return -EINVAL;
  1557		}
  1558	
  1559		DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Register TID, rc = %d\n", rc);
  1560		return rc;
  1561	}
  1562	
  1563	static int qed_rdma_deregister_tid(void *rdma_cxt, u32 itid)
  1564	{
  1565		struct qed_hwfn *p_hwfn = (struct qed_hwfn *)rdma_cxt;
  1566		struct rdma_deregister_tid_ramrod_data *p_ramrod;
  1567		struct qed_sp_init_data init_data;
  1568		struct qed_spq_entry *p_ent;
  1569		struct qed_ptt *p_ptt;
  1570		u8 fw_return_code;
  1571		int rc;
  1572	
  1573		DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "itid = %08x\n", itid);
  1574	
  1575		/* Get SPQ entry */
  1576		memset(&init_data, 0, sizeof(init_data));
  1577		init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
  1578		init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
  1579	
  1580		rc = qed_sp_init_request(p_hwfn, &p_ent, RDMA_RAMROD_DEREGISTER_MR,
  1581					 p_hwfn->p_rdma_info->proto, &init_data);
  1582		if (rc) {
  1583			DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc);
  1584			return rc;
  1585		}
  1586	
  1587		p_ramrod = &p_ent->ramrod.rdma_deregister_tid;
  1588		p_ramrod->itid = cpu_to_le32(itid);
  1589	
  1590		rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code);
  1591		if (rc) {
  1592			DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d\n", rc);
  1593			return rc;
  1594		}
  1595	
  1596		if (fw_return_code == RDMA_RETURN_DEREGISTER_MR_BAD_STATE_ERR) {
  1597			DP_NOTICE(p_hwfn, "fw_return_code = %d\n", fw_return_code);
  1598			return -EINVAL;
  1599		} else if (fw_return_code == RDMA_RETURN_NIG_DRAIN_REQ) {
  1600			/* Bit indicating that the TID is in use and a nig drain is
  1601			 * required before sending the ramrod again
  1602			 */
  1603			p_ptt = qed_ptt_acquire(p_hwfn);
  1604			if (!p_ptt) {
  1605				rc = -EBUSY;
  1606				DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
  1607					   "Failed to acquire PTT\n");
  1608				return rc;
  1609			}
  1610	
  1611			rc = qed_mcp_drain(p_hwfn, p_ptt);
  1612			if (rc) {
  1613				qed_ptt_release(p_hwfn, p_ptt);
  1614				DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
  1615					   "Drain failed\n");
  1616				return rc;
  1617			}
  1618	
  1619			qed_ptt_release(p_hwfn, p_ptt);
  1620	
  1621			/* Resend the ramrod */
  1622			rc = qed_sp_init_request(p_hwfn, &p_ent,
  1623						 RDMA_RAMROD_DEREGISTER_MR,
  1624						 p_hwfn->p_rdma_info->proto,
  1625						 &init_data);
  1626			if (rc) {
  1627				DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
  1628					   "Failed to init sp-element\n");
  1629				return rc;
  1630			}
  1631	
  1632			rc = qed_spq_post(p_hwfn, p_ent, &fw_return_code);
  1633			if (rc) {
  1634				DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
  1635					   "Ramrod failed\n");
  1636				return rc;
  1637			}
  1638	
  1639			if (fw_return_code != RDMA_RETURN_OK) {
  1640				DP_NOTICE(p_hwfn, "fw_return_code = %d\n",
  1641					  fw_return_code);
  1642				return rc;
  1643			}
  1644		}
  1645	
  1646		DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "De-registered TID, rc = %d\n", rc);
  1647		return rc;
  1648	}
  1649	
  1650	static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev)
  1651	{
  1652		return QED_LEADING_HWFN(cdev);
  1653	}
  1654	
> 1655	int qed_rdma_modify_srq(void *rdma_cxt,
  1656				struct qed_rdma_modify_srq_in_params *in_params)
  1657	{
  1658		struct rdma_srq_modify_ramrod_data *p_ramrod;
  1659		struct qed_hwfn *p_hwfn = rdma_cxt;
  1660		struct qed_sp_init_data init_data;
  1661		struct qed_spq_entry *p_ent;
  1662		u16 opaque_fid;
  1663		int rc;
  1664	
  1665		memset(&init_data, 0, sizeof(init_data));
  1666		init_data.opaque_fid = p_hwfn->hw_info.opaque_fid;
  1667		init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
  1668	
  1669		rc = qed_sp_init_request(p_hwfn, &p_ent,
  1670					 RDMA_RAMROD_MODIFY_SRQ,
  1671					 p_hwfn->p_rdma_info->proto, &init_data);
  1672		if (rc)
  1673			return rc;
  1674	
  1675		p_ramrod = &p_ent->ramrod.rdma_modify_srq;
  1676		p_ramrod->srq_id.srq_idx = cpu_to_le16(in_params->srq_id);
  1677		opaque_fid = p_hwfn->hw_info.opaque_fid;
  1678		p_ramrod->srq_id.opaque_fid = cpu_to_le16(opaque_fid);
> 1679		p_ramrod->wqe_limit = cpu_to_le16(in_params->wqe_limit);
  1680	
  1681		rc = qed_spq_post(p_hwfn, p_ent, NULL);
  1682		if (rc)
  1683			return rc;
  1684	
  1685		DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "modified SRQ id = %x",
  1686			   in_params->srq_id);
  1687	
  1688		return rc;
  1689	}
  1690	
> 1691	int qed_rdma_destroy_srq(void *rdma_cxt,
  1692				 struct qed_rdma_destroy_srq_in_params *in_params)
  1693	{
  1694		struct rdma_srq_destroy_ramrod_data *p_ramrod;
  1695		struct qed_hwfn *p_hwfn = rdma_cxt;
  1696		struct qed_sp_init_data init_data;
  1697		struct qed_spq_entry *p_ent;
  1698		struct qed_bmap *bmap;
  1699		u16 opaque_fid;
  1700		int rc;
  1701	
  1702		opaque_fid = p_hwfn->hw_info.opaque_fid;
  1703	
  1704		memset(&init_data, 0, sizeof(init_data));
  1705		init_data.opaque_fid = opaque_fid;
  1706		init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
  1707	
  1708		rc = qed_sp_init_request(p_hwfn, &p_ent,
  1709					 RDMA_RAMROD_DESTROY_SRQ,
  1710					 p_hwfn->p_rdma_info->proto, &init_data);
  1711		if (rc)
  1712			return rc;
  1713	
  1714		p_ramrod = &p_ent->ramrod.rdma_destroy_srq;
  1715		p_ramrod->srq_id.srq_idx = cpu_to_le16(in_params->srq_id);
  1716		p_ramrod->srq_id.opaque_fid = cpu_to_le16(opaque_fid);
  1717	
  1718		rc = qed_spq_post(p_hwfn, p_ent, NULL);
  1719		if (rc)
  1720			return rc;
  1721	
  1722		bmap = &p_hwfn->p_rdma_info->srq_map;
  1723	
  1724		spin_lock_bh(&p_hwfn->p_rdma_info->lock);
  1725		qed_bmap_release_id(p_hwfn, bmap, in_params->srq_id);
  1726		spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
  1727	
  1728		DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "SRQ destroyed Id = %x",
  1729			   in_params->srq_id);
  1730	
  1731		return rc;
  1732	}
  1733	
> 1734	int qed_rdma_create_srq(void *rdma_cxt,
  1735				struct qed_rdma_create_srq_in_params *in_params,
  1736				struct qed_rdma_create_srq_out_params *out_params)
  1737	{
  1738		struct rdma_srq_create_ramrod_data *p_ramrod;
  1739		struct qed_hwfn *p_hwfn = rdma_cxt;
  1740		struct qed_sp_init_data init_data;
  1741		enum qed_cxt_elem_type elem_type;
  1742		struct qed_spq_entry *p_ent;
  1743		u16 opaque_fid, srq_id;
  1744		struct qed_bmap *bmap;
  1745		u32 returned_id;
  1746		int rc;
  1747	
  1748		bmap = &p_hwfn->p_rdma_info->srq_map;
  1749		spin_lock_bh(&p_hwfn->p_rdma_info->lock);
  1750		rc = qed_rdma_bmap_alloc_id(p_hwfn, bmap, &returned_id);
  1751		spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
  1752	
  1753		if (rc) {
  1754			DP_NOTICE(p_hwfn, "failed to allocate srq id\n");
  1755			return rc;
  1756		}
  1757	
  1758		elem_type = QED_ELEM_SRQ;
  1759		rc = qed_cxt_dynamic_ilt_alloc(p_hwfn, elem_type, returned_id);
  1760		if (rc)
  1761			goto err;
  1762		/* returned id is no greater than u16 */
  1763		srq_id = (u16)returned_id;
  1764		opaque_fid = p_hwfn->hw_info.opaque_fid;
  1765	
  1766		memset(&init_data, 0, sizeof(init_data));
  1767		opaque_fid = p_hwfn->hw_info.opaque_fid;
  1768		init_data.opaque_fid = opaque_fid;
  1769		init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
  1770	
  1771		rc = qed_sp_init_request(p_hwfn, &p_ent,
  1772					 RDMA_RAMROD_CREATE_SRQ,
  1773					 p_hwfn->p_rdma_info->proto, &init_data);
  1774		if (rc)
  1775			goto err;
  1776	
  1777		p_ramrod = &p_ent->ramrod.rdma_create_srq;
  1778		DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, in_params->pbl_base_addr);
  1779		p_ramrod->pages_in_srq_pbl = cpu_to_le16(in_params->num_pages);
  1780		p_ramrod->pd_id = cpu_to_le16(in_params->pd_id);
  1781		p_ramrod->srq_id.srq_idx = cpu_to_le16(srq_id);
  1782		p_ramrod->srq_id.opaque_fid = cpu_to_le16(opaque_fid);
  1783		p_ramrod->page_size = cpu_to_le16(in_params->page_size);
  1784		DMA_REGPAIR_LE(p_ramrod->producers_addr, in_params->prod_pair_addr);
  1785	
  1786		rc = qed_spq_post(p_hwfn, p_ent, NULL);
  1787		if (rc)
  1788			goto err;
  1789	
  1790		out_params->srq_id = srq_id;
  1791	
  1792		DP_VERBOSE(p_hwfn, QED_MSG_RDMA,
  1793			   "SRQ created Id = %x\n", out_params->srq_id);
  1794	
  1795		return rc;
  1796	
  1797	err:
  1798		spin_lock_bh(&p_hwfn->p_rdma_info->lock);
  1799		qed_bmap_release_id(p_hwfn, bmap, returned_id);
  1800		spin_unlock_bh(&p_hwfn->p_rdma_info->lock);
  1801	
  1802		return rc;
  1803	}
  1804	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* [RFC PATCH] qed: qed_rdma_modify_srq() can be static
From: kbuild test robot @ 2018-06-01  3:41 UTC (permalink / raw)
  To: Yuval Bason
  Cc: kbuild-all, yuval.bason, davem, netdev, jgg, dledford, linux-rdma,
	Michal Kalderon, Ariel Elior
In-Reply-To: <20180530131137.4653-1-yuval.bason@cavium.com>


Fixes: 27c50d39911b ("qed: Add srq core support for RoCE and iWARP")
Signed-off-by: kbuild test robot <fengguang.wu@intel.com>
---
 qed_rdma.c |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
index bd23659..f118328 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c
@@ -1652,8 +1652,8 @@ static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev)
 	return QED_LEADING_HWFN(cdev);
 }
 
-int qed_rdma_modify_srq(void *rdma_cxt,
-			struct qed_rdma_modify_srq_in_params *in_params)
+static int qed_rdma_modify_srq(void *rdma_cxt,
+			       struct qed_rdma_modify_srq_in_params *in_params)
 {
 	struct rdma_srq_modify_ramrod_data *p_ramrod;
 	struct qed_hwfn *p_hwfn = rdma_cxt;
@@ -1688,8 +1688,8 @@ int qed_rdma_modify_srq(void *rdma_cxt,
 	return rc;
 }
 
-int qed_rdma_destroy_srq(void *rdma_cxt,
-			 struct qed_rdma_destroy_srq_in_params *in_params)
+static int qed_rdma_destroy_srq(void *rdma_cxt,
+				struct qed_rdma_destroy_srq_in_params *in_params)
 {
 	struct rdma_srq_destroy_ramrod_data *p_ramrod;
 	struct qed_hwfn *p_hwfn = rdma_cxt;
@@ -1731,9 +1731,9 @@ int qed_rdma_destroy_srq(void *rdma_cxt,
 	return rc;
 }
 
-int qed_rdma_create_srq(void *rdma_cxt,
-			struct qed_rdma_create_srq_in_params *in_params,
-			struct qed_rdma_create_srq_out_params *out_params)
+static int qed_rdma_create_srq(void *rdma_cxt,
+			       struct qed_rdma_create_srq_in_params *in_params,
+			       struct qed_rdma_create_srq_out_params *out_params)
 {
 	struct rdma_srq_create_ramrod_data *p_ramrod;
 	struct qed_hwfn *p_hwfn = rdma_cxt;

^ permalink raw reply related

* Re: [PATCH bpf 1/2] bpf: fix alignment of netns_dev/netns_ino fields in bpf_{map,prog}_info
From: Dmitry V. Levin @ 2018-06-01  3:12 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Eugene Syromiatnikov, netdev, linux-kernel, Martin KaFai Lau,
	Daniel Borkmann, Alexei Starovoitov, David S. Miller, Jiri Olsa,
	Ingo Molnar, Lawrence Brakmo, Andrey Ignatov, Jakub Kicinski,
	John Fastabend
In-Reply-To: <20180530181857.GA6744@altlinux.org>

[-- Attachment #1: Type: text/plain, Size: 3127 bytes --]

Hi,

Looks like the ABI bug in bpf_map_info and bpf_prog info introduced
in 4.16 is going to slip into 4.17, causing extra pain to 32-bit
userspace.  I'm adding Linus to this thread in hope it might help
to get a fix applied before 4.17 is released.

On Wed, May 30, 2018 at 09:18:58PM +0300, Dmitry V. Levin wrote:
> On Sun, May 27, 2018 at 01:28:42PM +0200, Eugene Syromiatnikov wrote:
> > Recent introduction of netns_dev/netns_ino to bpf_map_info/bpf_prog info
> > has broken compat, as offsets of these fields are different in 32-bit
> > and 64-bit ABIs.  One fix (other than implementing compat support in
> > syscall in order to handle this discrepancy) is to use __aligned_u64
> > instead of __u64 for these fields.
> > 
> > Reported-by: Dmitry V. Levin <ldv@altlinux.org>
> > Fixes: 52775b33bb507 ("bpf: offload: report device information about
> > offloaded maps")
> > Fixes: 675fc275a3a2d ("bpf: offload: report device information for
> > offloaded programs")
> > 
> > Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
> 
> Reviewed-by: "Dmitry V. Levin" <ldv@altlinux.org>
> Cc: <stable@vger.kernel.org> # v4.16+
> 
> Thanks,
> 
> > ---
> >  include/uapi/linux/bpf.h       | 8 ++++----
> >  tools/include/uapi/linux/bpf.h | 8 ++++----
> >  2 files changed, 8 insertions(+), 8 deletions(-)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index c5ec897..903010a 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -1017,8 +1017,8 @@ struct bpf_prog_info {
> >  	__aligned_u64 map_ids;
> >  	char name[BPF_OBJ_NAME_LEN];
> >  	__u32 ifindex;
> > -	__u64 netns_dev;
> > -	__u64 netns_ino;
> > +	__aligned_u64 netns_dev;
> > +	__aligned_u64 netns_ino;
> >  } __attribute__((aligned(8)));
> >  
> >  struct bpf_map_info {
> > @@ -1030,8 +1030,8 @@ struct bpf_map_info {
> >  	__u32 map_flags;
> >  	char  name[BPF_OBJ_NAME_LEN];
> >  	__u32 ifindex;
> > -	__u64 netns_dev;
> > -	__u64 netns_ino;
> > +	__aligned_u64 netns_dev;
> > +	__aligned_u64 netns_ino;
> >  } __attribute__((aligned(8)));
> >  
> >  /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
> > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> > index c5ec897..903010a 100644
> > --- a/tools/include/uapi/linux/bpf.h
> > +++ b/tools/include/uapi/linux/bpf.h
> > @@ -1017,8 +1017,8 @@ struct bpf_prog_info {
> >  	__aligned_u64 map_ids;
> >  	char name[BPF_OBJ_NAME_LEN];
> >  	__u32 ifindex;
> > -	__u64 netns_dev;
> > -	__u64 netns_ino;
> > +	__aligned_u64 netns_dev;
> > +	__aligned_u64 netns_ino;
> >  } __attribute__((aligned(8)));
> >  
> >  struct bpf_map_info {
> > @@ -1030,8 +1030,8 @@ struct bpf_map_info {
> >  	__u32 map_flags;
> >  	char  name[BPF_OBJ_NAME_LEN];
> >  	__u32 ifindex;
> > -	__u64 netns_dev;
> > -	__u64 netns_ino;
> > +	__aligned_u64 netns_dev;
> > +	__aligned_u64 netns_ino;
> >  } __attribute__((aligned(8)));
> >  
> >  /* User bpf_sock_addr struct to access socket fields and sockaddr struct passed


-- 
ldv

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [net PATCH] net-sysfs: Fix memory leak in XPS configuration
From: David Miller @ 2018-06-01  3:03 UTC (permalink / raw)
  To: alexander.h.duyck; +Cc: netdev
In-Reply-To: <20180531195922.7571.86674.stgit@ahduyck-green-test.jf.intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Thu, 31 May 2018 15:59:46 -0400

> This patch reorders the error cases in showing the XPS configuration so
> that we hold off on memory allocation until after we have verified that we
> can support XPS on a given ring.
> 
> Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes")
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH v2 net] ixgbe: fix parsing of TC actions for HW offload
From: David Miller @ 2018-06-01  3:01 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: ohlavaty, netdev, andrewx.bowers
In-Reply-To: <b67b75aeb43afd4812e378673151a629f1681eea.camel@intel.com>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Thu, 31 May 2018 14:46:08 -0700

> On Thu, 2018-05-31 at 23:21 +0200, Ondřej Hlavatý wrote:
>> The previous code was optimistic, accepting the offload of whole
>> action
>> chain when there was a single known action (drop/redirect). This
>> results
>> in offloading a rule which should not be offloaded, because its
>> behavior
>> cannot be reproduced in the hardware.
>> 
>> For example:
>> 
>> $ tc filter add dev eno1 parent ffff: protocol ip \
>>     u32 ht 800: order 1 match tcp src 42 FFFF \
>>     action mirred egress mirror dev enp1s16 pipe \
>>     drop
>> 
>> The controller is unable to mirror the packet to a VF, but still
>> offloads the rule by dropping the packet.
>> 
>> Change the approach of the function to a pessimistic one, rejecting
>> the
>> chain when an unknown action is found. This is better suited for
>> future
>> extensions.
>> 
>> Note that both recognized actions always return TC_ACT_SHOT,
>> therefore
>> it is safe to ignore actions behind them.
>> 
>> Signed-off-by: Ondřej Hlavatý <ohlavaty@redhat.com>
> 
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> 
> Note- I am having our validation move to testing with GCC 8.1.1 or
> later so that we can catch warnings like Dave found in the future.
> 
> Dave- Please go ahead and pick this up.

Ok, applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next] virtio_net: fix error return code in virtnet_probe()
From: David Miller @ 2018-06-01  2:50 UTC (permalink / raw)
  To: weiyongjun1
  Cc: mst, jasowang, sridhar.samudrala, virtualization, netdev,
	kernel-janitors
In-Reply-To: <1527732307-145609-1-git-send-email-weiyongjun1@huawei.com>

From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 31 May 2018 02:05:07 +0000

> Fix to return a negative error code from the failover create fail error
> handling case instead of 0, as done elsewhere in this function.
> 
> Fixes: ba5e4426e80e ("virtio_net: Extend virtio to use VF datapath when available")
> Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>

Applied.

^ permalink raw reply

* Re: [PATCH] rtnetlink: Remove VLA usage
From: David Miller @ 2018-06-01  2:49 UTC (permalink / raw)
  To: keescook; +Cc: fw, dsahern, netdev, linux-kernel
In-Reply-To: <20180530222052.GA30622@beast>

From: Kees Cook <keescook@chromium.org>
Date: Wed, 30 May 2018 15:20:52 -0700

> In the quest to remove all stack VLA usage from the kernel[1], this
> allocates the maximum size expected for all possible types and adds
> sanity-checks at both registration and usage to make sure nothing gets
> out of sync. This matches the proposed VLA solution for nfnetlink[2]. The
> values chosen here were based on finding assignments for .maxtype and
> .slave_maxtype and manually counting the enums:
> 
> slave_maxtype (max 33):
 ...
> maxtype (max 45):
 ...
> 
> This additionally changes maxtype and slave_maxtype fields to unsigned,
> since they're only ever using positive values.
> 
> [1] https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qPXydAacU1RqZWA@mail.gmail.com
> [2] https://patchwork.kernel.org/patch/10439647/
> 
> Signed-off-by: Kees Cook <keescook@chromium.org>

Looks good, applied, thanks.

^ permalink raw reply

* [PATCH] net: wireless: brcmsmac: Remove unnecessary parentheses
From: Varsha Rao @ 2018-06-01  2:14 UTC (permalink / raw)
  To: Nicholas Mc Guire, Lukas Bulwahn, Arend van Spriel, Franky Lin,
	Hante Meuleman, Chi-Hsien Lin, Wright Feng, Kalle Valo,
	David S. Miller, linux-wireless, brcm80211-dev-list.pdl,
	brcm80211-dev-list, netdev, linux-kernel
  Cc: Varsha Rao

This patch fixes the clang warning of extraneous parentheses, with the
following coccinelle script.

@@
identifier i;
expression e;
statement s;
@@
if (
-(i == e)
+i == e
 )
s

Suggested-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Varsha Rao <rvarsha016@gmail.com>
---
 drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_cmn.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_cmn.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_cmn.c
index 3a13d176b221..35e3b101e5cf 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_cmn.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/phy/phy_cmn.c
@@ -159,7 +159,7 @@ u16 read_radio_reg(struct brcms_phy *pi, u16 addr)
 {
 	u16 data;
 
-	if ((addr == RADIO_IDCODE))
+	if (addr == RADIO_IDCODE)
 		return 0xffff;
 
 	switch (pi->pubpi.phy_type) {
-- 
2.17.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox