* [PATCH v2 1/5] tools: Add kernel stubs needed for rosebush
2024-06-25 21:17 [PATCH v2 0/5] Rosebush, a new hash table Matthew Wilcox (Oracle)
@ 2024-06-25 21:17 ` Matthew Wilcox (Oracle)
2024-06-25 21:17 ` [PATCH v2 2/5] rosebush: Add new data structure Matthew Wilcox (Oracle)
` (3 subsequent siblings)
4 siblings, 0 replies; 17+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-06-25 21:17 UTC (permalink / raw)
To: linux-kernel; +Cc: Matthew Wilcox (Oracle), linux-fsdevel, maple-tree
Various parts of the kernel API didn't need to exist before, so add
them:
- spin_trylock()
- DECLARE_FLEX_ARRAY
- vmalloc
- __vmalloc_node
- kvmalloc
- kvfree_rcu_mightsleep
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
tools/include/linux/compiler.h | 4 ++++
tools/include/linux/compiler_types.h | 2 ++
tools/include/linux/spinlock.h | 2 ++
tools/include/linux/stddef.h | 3 +++
tools/testing/radix-tree/linux/kernel.h | 2 ++
tools/testing/radix-tree/linux/vmalloc.h | 14 ++++++++++++++
6 files changed, 27 insertions(+)
create mode 100644 tools/include/linux/stddef.h
create mode 100644 tools/testing/radix-tree/linux/vmalloc.h
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 8a63a9913495..936b5153b0fe 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -62,6 +62,10 @@
#define __nocf_check __attribute__((nocf_check))
#endif
+#ifndef __refdata
+#define __refdata
+#endif
+
/* Are two types/vars the same type (ignoring qualifiers)? */
#ifndef __same_type
# define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h
index d09f9dc172a4..18bd8767cebc 100644
--- a/tools/include/linux/compiler_types.h
+++ b/tools/include/linux/compiler_types.h
@@ -17,6 +17,7 @@
/* context/locking */
# define __must_hold(x) __attribute__((context(x,1,1)))
# define __acquires(x) __attribute__((context(x,0,1)))
+# define __cond_acquires(x) __attribute__((context(x,0,-1)))
# define __releases(x) __attribute__((context(x,1,0)))
# define __acquire(x) __context__(x,1)
# define __release(x) __context__(x,-1)
@@ -27,6 +28,7 @@
# define __acquires(x)
# define __releases(x)
# define __acquire(x) (void)0
+# define __cond_acquires(x)
# define __release(x) (void)0
# define __cond_lock(x,c) (c)
#endif /* __CHECKER__ */
diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h
index a6cdf25b6b9d..871a6a305b9d 100644
--- a/tools/include/linux/spinlock.h
+++ b/tools/include/linux/spinlock.h
@@ -20,6 +20,8 @@
#define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x)
#define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x)
+#define spin_trylock(x) (pthread_mutex_trylock(x) == 0)
+
#define arch_spinlock_t pthread_mutex_t
#define __ARCH_SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER
diff --git a/tools/include/linux/stddef.h b/tools/include/linux/stddef.h
new file mode 100644
index 000000000000..337f520eba1d
--- /dev/null
+++ b/tools/include/linux/stddef.h
@@ -0,0 +1,3 @@
+#include <uapi/linux/stddef.h>
+
+#define DECLARE_FLEX_ARRAY(TYPE, NAME) __DECLARE_FLEX_ARRAY(TYPE, NAME)
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index c0a2bb785b92..72a95fd9708c 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -26,4 +26,6 @@
#define __must_hold(x)
#define EXPORT_PER_CPU_SYMBOL_GPL(x)
+
+#define PAGE_SIZE 4096
#endif /* _KERNEL_H */
diff --git a/tools/testing/radix-tree/linux/vmalloc.h b/tools/testing/radix-tree/linux/vmalloc.h
new file mode 100644
index 000000000000..2857c37472bb
--- /dev/null
+++ b/tools/testing/radix-tree/linux/vmalloc.h
@@ -0,0 +1,14 @@
+#include <malloc.h>
+
+#define vmalloc(x) malloc(x)
+#define __vmalloc_node(size, align, gfp, node, caller) memalign(align, size)
+#define vfree(x) free(x)
+
+#define kvmalloc(x, gfp) memalign(x, x)
+#define kvfree(x) free(x)
+
+/* A correct but slow implementation */
+#define kvfree_rcu_mightsleep(x) do { \
+ synchronize_rcu(); \
+ free(x); \
+} while (0)
--
2.43.0
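
A minimal usage sketch, separate from the patch itself, of how these stubs are
expected to be exercised by the userspace test harness. It assumes the usual
radix-tree test environment, where DEFINE_SPINLOCK comes from the pthread-based
tools spinlock.h wrapper and GFP_KERNEL and synchronize_rcu() are provided by
the existing test headers and liburcu; demo_lock and demo_stub_usage are purely
illustrative names:

	static DEFINE_SPINLOCK(demo_lock);

	static void demo_stub_usage(void)
	{
		/* kvmalloc stub: memalign() keeps the allocation size-aligned */
		void *p = kvmalloc(128, GFP_KERNEL);

		/* spin_trylock stub: pthread_mutex_trylock() == 0 on success */
		if (spin_trylock(&demo_lock)) {
			/* ... briefly use p under the lock ... */
			spin_unlock(&demo_lock);
		}

		/* waits for RCU readers to finish, then free()s the memory */
		kvfree_rcu_mightsleep(p);
	}
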
* [PATCH v2 2/5] rosebush: Add new data structure
2024-06-25 21:17 [PATCH v2 0/5] Rosebush, a new hash table Matthew Wilcox (Oracle)
2024-06-25 21:17 ` [PATCH v2 1/5] tools: Add kernel stubs needed for rosebush Matthew Wilcox (Oracle)
@ 2024-06-25 21:17 ` Matthew Wilcox (Oracle)
2024-06-25 23:36 ` Dr. David Alan Gilbert
` (3 more replies)
2024-06-25 21:17 ` [PATCH v2 3/5] rosebush: Add test suite Matthew Wilcox (Oracle)
` (2 subsequent siblings)
4 siblings, 4 replies; 17+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-06-25 21:17 UTC (permalink / raw)
To: linux-kernel; +Cc: Matthew Wilcox (Oracle), linux-fsdevel, maple-tree
Rosebush is a resizing hash table. See
Documentation/core-api/rosebush.rst for details.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
Documentation/core-api/index.rst | 1 +
Documentation/core-api/rosebush.rst | 121 +++++
MAINTAINERS | 8 +
include/linux/rosebush.h | 62 +++
lib/Makefile | 2 +-
lib/rosebush.c | 654 ++++++++++++++++++++++++++++
6 files changed, 847 insertions(+), 1 deletion(-)
create mode 100644 Documentation/core-api/rosebush.rst
create mode 100644 include/linux/rosebush.h
create mode 100644 lib/rosebush.c
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index f147854700e4..8984a7d6ad73 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -36,6 +36,7 @@ Library functionality that is used throughout the kernel.
kobject
kref
assoc_array
+ rosebush
xarray
maple_tree
idr
diff --git a/Documentation/core-api/rosebush.rst b/Documentation/core-api/rosebush.rst
new file mode 100644
index 000000000000..f0c0bc690e48
--- /dev/null
+++ b/Documentation/core-api/rosebush.rst
@@ -0,0 +1,121 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+========
+Rosebush
+========
+
+:Author: Matthew Wilcox
+
+Overview
+========
+
+Rosebush is a hashtable, different from the rhashtable. It is scalable
+(one spinlock per bucket), resizing in two dimensions (number and size
+of buckets), and concurrent (can be iterated under the RCU read lock).
+It is designed to minimise dependent cache misses, which can stall a
+modern CPU for thousands of instructions.
+
+Objects stored in a rosebush do not have an embedded linked list.
+They can therefore be placed into the same rosebush multiple times and
+be placed in multiple rosebushes. It is also possible to store pointers
+which have special meaning like ERR_PTR(). It is not possible to store
+a NULL pointer in a rosebush, as this is the return value that indicates
+the iteration has finished.
+
+The user of the rosebush is responsible for calculating their own hash.
+A high quality hash is desirable to keep the scalable properties of
+the rosebush, but a hash with poor distribution in the lower bits will
+not lead to a catastrophic breakdown. It may lead to excessive memory
+consumption and a lot of CPU time spent during lookup.
+
+Rosebush is not yet IRQ or BH safe. It can be iterated in interrupt
+context, but not modified.
+
+RCU Iteration
+-------------
+
+There is no rosebush_lookup() function. This is because the hash value
+may not be unique. Instead, the user should iterate the rosebush,
+which will return pointers which probably have matching hash values.
+It is the user's responsibility to determine if the returned pointer is
+one they are interested in.
+
+Rosebush iteration guarantees to return all pointers which have a
+matching hash, were in the rosebush before the iteration started and
+remain in the rosebush after iteration ends. It may return additional
+pointers, including pointers which do not have a matching hash value,
+but it guarantees not to skip any pointers, and it guarantees to only
+return pointers which have (at some point) been stored in the rosebush.
+
+If the rosebush is modified while the iteration is in progress, newly
+added entries may or may not be returned and removed entries may or may
+not be returned. Causality is not honoured; e.g. if Entry A is known
+to be inserted before Entry B, it is possible for an iteration to return
+Entry B and not Entry A.
+
+Functions and structures
+========================
+
+.. kernel-doc:: include/linux/rosebush.h
+.. kernel-doc:: lib/rosebush.c
+
+Internals
+=========
+
+The rosebush is organised into a top level table which contains pointers
+to buckets. Each bucket contains a spinlock (for modifications to the
+bucket), the number of entries in the bucket and a contention counter.
+
+The top level table is a power of two in size. The bottom M bits of the
+hash are used to index into this table. The bucket contains hash values
+followed by their corresponding pointers. We also track a contention
+count, which lets us know if this spinlock is overloaded and we should
+split the bucket to improve scalability.
+
+A bucket may be shared between multiple table entries. For simplicity,
+we require that all buckets are shared between a power-of-two number
+of slots. For example, a table with 8 entries might have entries that
+point to buckets A, B, A, B, A, C, A, D. If we were to then split bucket
+A, we would replace the first pair of pointers with pointers to bucket
+E and the second pair with pointers to bucket F. This is akin to the
+buddy page allocator.
+
+When we decide that the table needs to be resized, we allocate a new
+table, and duplicate the current contents of the table into each half
+of the new table. At this point, all buckets in the table are shared.
+We then split the bucket that we're trying to insert into.
+
+IRQ / BH safety
+---------------
+
+If we decide to make the rosebush modifiable in IRQ context, we need
+to take the locks in an irq-safe way; we need to figure out how to
+allocate the top level table without vmalloc(), and we need to manage
+without kvfree_rcu_mightsleep(). These all have solutions, but those
+solutions have a cost that isn't worth paying until we have users.
+
+Some of those problems go away if we limit our support to removal in IRQ
+context and only allow insertions in process context (as we do not need
+to reallocate the table or bucket when removing an item).
+
+Small rosebushes
+----------------
+
+As an optimisation, if the rosebush has no entries, the buckets pointer
+is NULL. If the rosebush has only a few entries, there are only two
+buckets (allocated as a single allocation) and the table pointer points
+directly to the first one instead of pointing to a table.
+
+Shrinking the rosebush
+----------------------
+
+We do not currently attempt either to join buckets or to shrink the table
+if sufficiently many buckets are shared. If this proves to be a desirable
+course of action, we can add support for it, with sufficient hysteresis
+that adding/removing a single entry will not cause bouncing.
+
+Rosebush statistics
+-------------------
+
+It would probably be wise to be able to gather statistics about bucket
+occupancy rates, but this work has not yet been done.
diff --git a/MAINTAINERS b/MAINTAINERS
index a39c237edb95..d4a8e99919a4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19467,6 +19467,14 @@ F: include/net/rose.h
F: include/uapi/linux/rose.h
F: net/rose/
+ROSEBUSH DATA STRUCTURE
+M: Matthew Wilcox <willy@infradead.org>
+L: maple-tree@lists.infradead.org
+S: Supported
+F: Documentation/core-api/rosebush.rst
+F: include/linux/rosebush.h
+F: lib/*rosebush.c
+
ROTATION DRIVER FOR ALLWINNER A83T
M: Jernej Skrabec <jernej.skrabec@gmail.com>
L: linux-media@vger.kernel.org
diff --git a/include/linux/rosebush.h b/include/linux/rosebush.h
new file mode 100644
index 000000000000..57f4dfa3f93d
--- /dev/null
+++ b/include/linux/rosebush.h
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* See lib/rosebush.c */
+
+#ifndef _LINUX_ROSEBUSH_H
+#define _LINUX_ROSEBUSH_H
+
+#include <linux/spinlock.h>
+
+/*
+ * Embed this struct in your struct, don't allocate it separately.
+ * None of this is for customers to use; treat it as opaque.
+ * In particular, taking the rbh_resize_lock will prevent only rbh_table
+ * from being reallocated; buckets can still be grown and split without
+ * the lock. But you will get incomprehensible lockdep warnings!
+ */
+struct rbh {
+ spinlock_t rbh_resize_lock;
+ unsigned long rbh_table; /* A tagged pointer */
+};
+
+#define DEFINE_ROSEBUSH(name) struct rbh name = \
+ { .rbh_resize_lock = __SPIN_LOCK_UNLOCKED(name.lock), }
+
+int rbh_insert(struct rbh *rbh, u32 hash, void *p);
+int rbh_reserve(struct rbh *rbh, u32 hash);
+int rbh_use(struct rbh *rbh, u32 hash, void *p);
+int rbh_remove(struct rbh *rbh, u32 hash, void *p);
+void rbh_destroy(struct rbh *rbh);
+
+/**
+ * rbh_release - Release a reserved slot in a rosebush.
+ * @rbh: The rosebush.
+ * @hash: The hash value that was reserved.
+ *
+ * If you decide that you do not need to use a reserved slot, call this
+ * function to release the slot.
+ *
+ * Return: 0 on success, -ENOENT if no reserved slot was found.
+ */
+
+static inline int rbh_release(struct rbh *rbh, u32 hash)
+{
+ return rbh_remove(rbh, hash, NULL);
+}
+
+struct rbh_iter {
+ struct rbh *rbh;
+ struct rbh_bucket *bucket;
+ u32 hash;
+ unsigned int index;
+};
+
+#define RBH_ITER(name, _rbh, _hash) \
+ struct rbh_iter name = { .rbh = _rbh, .hash = _hash, }
+
+void *rbh_next(struct rbh_iter *rbhi);
+
+void rbh_iter_remove(struct rbh_iter *rbhi, void *p);
+void rbh_iter_lock(struct rbh_iter *rbhi);
+void rbh_iter_unlock(struct rbh_iter *rbhi);
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index 3b1769045651..723e6c90b58d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -28,7 +28,7 @@ CFLAGS_string.o += -fno-stack-protector
endif
lib-y := ctype.o string.o vsprintf.o cmdline.o \
- rbtree.o radix-tree.o timerqueue.o xarray.o \
+ rosebush.o rbtree.o radix-tree.o timerqueue.o xarray.o \
maple_tree.o idr.o extable.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
diff --git a/lib/rosebush.c b/lib/rosebush.c
new file mode 100644
index 000000000000..47106a04d11d
--- /dev/null
+++ b/lib/rosebush.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Rosebush, a resizing bucket hash table
+ * Copyright (c) 2024 Oracle Corporation
+ * Author: Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/export.h>
+#include <linux/rcupdate.h>
+#include <linux/rosebush.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/stddef.h>
+#include <linux/vmalloc.h>
+
+#include <asm/barrier.h>
+
+/*
+ * The lock is held whenever we are modifying the contents of the bucket.
+ * The contention counter tracks whether we need to split the bucket due
+ * to contention on the spinlock.
+ * The bucket is 256 bytes in size (20 * 12 = 240, plus parent, lock, etc)
+ */
+#ifdef CONFIG_64BIT
+#define RBH_ENTRIES 20
+#else
+#define RBH_ENTRIES 30
+#endif
+
+struct rbh_bucket {
+ u32 rbh_hashes[RBH_ENTRIES];
+ void __rcu *rbh_slots[RBH_ENTRIES];
+ const struct rbh *rbh;
+ spinlock_t rbh_lock;
+ u32 rbh_contention;
+};
+
+struct rbh_table {
+ DECLARE_FLEX_ARRAY(struct rbh_bucket __rcu *, buckets);
+};
+
+struct rbh_initial_table {
+ struct rbh_bucket buckets[2];
+};
+
+/*
+ * As far as lockdep is concerned, all buckets in the same rosebush use
+ * the same lock. We use the classes to distinguish the rbh resize lock
+ * from the bucket locks. The only time we take two bucket locks is
+ * when we already hold the resize lock, so there is no need to define
+ * an order between bucket locks.
+ */
+#ifdef CONFIG_DEBUG_SPINLOCK
+#define bucket_lock_init(rbh, bucket) \
+ __raw_spin_lock_init(spinlock_check(&(bucket)->rbh_lock), \
+ "rbh", (rbh)->rbh_resize_lock.dep_map.key, LD_WAIT_SPIN)
+#else
+#define bucket_lock_init(rbh, bucket) \
+ spin_lock_init(&(bucket)->rbh_lock)
+#endif
+
+#define rbh_resize_lock(rbh) spin_lock_nested(&(rbh)->rbh_resize_lock, 2)
+#define rbh_resize_unlock(rbh) spin_unlock(&(rbh)->rbh_resize_lock)
+
+struct table_mask {
+ struct rbh_table *table;
+ u32 mask;
+};
+
+/*
+ * A very complicated way of spelling &rbh->bucket[hash].
+ *
+ * The first complication is that we encode the number of buckets
+ * in the pointer so that we can get both in an atomic load.
+ *
+ * The second complication is that small hash tables don't have a top
+ * level table; instead the buckets pointer points to a pair of buckets.
+ */
+static struct rbh_bucket *get_bucket(struct rbh *rbh, u32 hash,
+ struct table_mask *state)
+{
+ unsigned long tagged;
+ struct rbh_initial_table *initial_table;
+ unsigned int bnr;
+
+ /* rcu_dereference(), but not a pointer */
+ tagged = READ_ONCE(rbh->rbh_table);
+ if (!tagged)
+ return NULL;
+
+ /* The lowest bits indicate how many buckets the table holds */
+ state->table = (struct rbh_table *)(tagged & (tagged + 1));
+ state->mask = tagged - (unsigned long)state->table;
+ bnr = hash & state->mask;
+ if (state->mask != 1)
+ return rcu_dereference(state->table->buckets[bnr]);
+
+ initial_table = (struct rbh_initial_table *)state->table;
+ return &initial_table->buckets[bnr];
+}
+
+static struct rbh_bucket *lock_bucket(struct rbh *rbh, u32 hash)
+ __acquires(&bucket->rbh_lock)
+{
+ struct rbh_bucket *bucket, *new_bucket;
+ struct table_mask state;
+
+ bucket = get_bucket(rbh, hash, &state);
+ if (!bucket)
+ return bucket;
+again:
+ spin_lock(&bucket->rbh_lock);
+ new_bucket = get_bucket(rbh, hash, &state);
+ if (bucket == new_bucket)
+ return bucket;
+ spin_unlock(&bucket->rbh_lock);
+ bucket = new_bucket;
+ goto again;
+}
+
+static bool rbh_first(struct rbh *rbh, u32 hash)
+{
+ struct rbh_initial_table *table;
+ int i;
+
+//printk("%s: table size %zd\n", __func__, sizeof(*table));
+ table = kmalloc(sizeof(*table), GFP_KERNEL);
+ if (!table)
+ return false;
+
+ rbh_resize_lock(rbh);
+ if (rbh->rbh_table) {
+ rbh_resize_unlock(rbh);
+ /* Somebody else resized it for us */
+ kfree(table);
+ return true;
+ }
+
+ bucket_lock_init(rbh, &table->buckets[0]);
+ table->buckets[0].rbh = rbh;
+ table->buckets[0].rbh_contention = 0;
+ bucket_lock_init(rbh, &table->buckets[1]);
+ table->buckets[1].rbh = rbh;
+ table->buckets[1].rbh_contention = 0;
+ for (i = 0; i < RBH_ENTRIES; i++) {
+ table->buckets[0].rbh_hashes[i] = ~0;
+ table->buckets[1].rbh_hashes[i] = 0;
+ }
+ /* rcu_assign_pointer() but not a pointer */
+ smp_store_release(&rbh->rbh_table, (unsigned long)table | 1);
+ rbh_resize_unlock(rbh);
+
+//printk("%s: new table = %px\n", __func__, table);
+ return true;
+}
+
+static void copy_initial_buckets(const struct rbh *rbh,
+ struct rbh_table *table, struct rbh_initial_table *init_table)
+ __acquires(&init_table->buckets[0].rbh_lock)
+ __acquires(&init_table->buckets[1].rbh_lock)
+{
+ struct rbh_bucket *bucket;
+
+ bucket = (void __force *)table->buckets[0];
+ spin_lock(&init_table->buckets[0].rbh_lock);
+ memcpy(bucket, &init_table->buckets[0], sizeof(init_table->buckets[0]));
+ bucket_lock_init(rbh, bucket);
+
+ bucket = (void __force *)table->buckets[1];
+ spin_lock_nested(&init_table->buckets[1].rbh_lock, 1);
+ memcpy(bucket, &init_table->buckets[1], sizeof(init_table->buckets[1]));
+ bucket_lock_init(rbh, bucket);
+}
+
+/*
+ * When we grow the table, we duplicate the bucket pointers so this
+ * thread doesn't pay the entire cost of growing the table.
+ */
+static int rbh_grow_table(struct rbh *rbh, u32 hash, struct table_mask *state)
+ __releases(RCU)
+ __acquires(RCU)
+{
+ struct rbh_table *table;
+ struct rbh_initial_table *init_table;
+ u32 mask = state->mask;
+ unsigned long tagged = (unsigned long)state->table | mask;
+ size_t size;
+
+ rcu_read_unlock();
+
+ size = (mask + 1) * 2 * sizeof(void *);
+ if (size > 4 * PAGE_SIZE)
+ /* XXX: NUMA_NO_NODE doesn't necessarily interleave */
+ table = __vmalloc_node(size, size, GFP_KERNEL, NUMA_NO_NODE,
+ &table);
+ else
+ table = kvmalloc(size, GFP_KERNEL);
+ if (!table) {
+ rcu_read_lock();
+ /* Maybe somebody resized it for us */
+ if (READ_ONCE(rbh->rbh_table) != tagged)
+ return 0;
+ return -ENOMEM;
+ }
+ BUG_ON((unsigned long)table & (size - 1));
+
+ if (mask == 1) {
+ /* Don't need to bother with RCU until we publish the table */
+ table->buckets[0] = (void __rcu *)kmalloc(sizeof(struct rbh_bucket), GFP_KERNEL);
+ if (!table->buckets[0])
+ goto free_all;
+ table->buckets[1] = (void __rcu *)kmalloc(sizeof(struct rbh_bucket), GFP_KERNEL);
+ if (!table->buckets[1])
+ goto free_all;
+ }
+
+ rbh_resize_lock(rbh);
+ if (rbh->rbh_table != tagged) {
+ rbh_resize_unlock(rbh);
+ /* Somebody else resized it for us */
+ kvfree(table);
+ rcu_read_lock();
+ return 0;
+ }
+
+//printk("%s: replacing table %p with table %p mask %d\n", __func__, state->table, table, mask);
+ if (mask == 1) {
+ init_table = (void *)state->table;
+ copy_initial_buckets(rbh, table, init_table);
+ } else {
+ memcpy(&table->buckets, &state->table->buckets,
+ (mask + 1) * sizeof(void *));
+ }
+ memcpy(&table->buckets[mask + 1], &table->buckets[0],
+ (mask + 1) * sizeof(void *));
+
+ tagged = ((unsigned long)table) | (mask << 1) | 1;
+ /* rcu_assign_pointer() but not a pointer */
+ smp_store_release(&rbh->rbh_table, tagged);
+ rbh_resize_unlock(rbh);
+ if (mask == 1) {
+ spin_unlock(&init_table->buckets[0].rbh_lock);
+ spin_unlock(&init_table->buckets[1].rbh_lock);
+ }
+ kvfree_rcu_mightsleep(state->table);
+
+ rcu_read_lock();
+ return 0;
+free_all:
+ kfree((void __force *)table->buckets[0]);
+ kvfree(table);
+ rcu_read_lock();
+ return -ENOMEM;
+}
+
+static void bucket_copy(const struct rbh *rbh, struct rbh_bucket *buckets[2],
+ const struct rbh_bucket *old_bucket, u32 hash, u32 mask)
+{
+ unsigned int i;
+ unsigned int js[2] = {0, 0};
+
+ bucket_lock_init(rbh, buckets[0]);
+ buckets[0]->rbh_contention = 0;
+ buckets[0]->rbh = rbh;
+ bucket_lock_init(rbh, buckets[1]);
+ buckets[1]->rbh_contention = 0;
+ buckets[1]->rbh = rbh;
+ for (i = 0; i < RBH_ENTRIES; i++) {
+ unsigned int nr = !!(old_bucket->rbh_hashes[i] & mask);
+ buckets[nr]->rbh_hashes[js[nr]] = old_bucket->rbh_hashes[i];
+ buckets[nr]->rbh_slots[js[nr]++] = old_bucket->rbh_slots[i];
+ }
+//printk("%s: bucket:%p copied %d entries from %p hash:%x mask:%x\n", __func__, buckets[0], js[0], old_bucket, hash, mask);
+//printk("%s: bucket:%p copied %d entries from %p hash:%x mask:%x\n", __func__, buckets[1], js[1], old_bucket, hash, mask);
+
+ /* Fill the new buckets with deleted entries */
+ while (js[0] < RBH_ENTRIES)
+ buckets[0]->rbh_hashes[js[0]++] = ~hash;
+ while (js[1] < RBH_ENTRIES)
+ buckets[1]->rbh_hashes[js[1]++] = ~hash;
+}
+
+#define rbh_dereference_protected(p, rbh) \
+ rcu_dereference_protected(p, lockdep_is_held(&(rbh)->rbh_resize_lock))
+
+static int rbh_split_bucket(struct rbh *rbh, struct rbh_bucket *bucket,
+ u32 hash, struct table_mask *state)
+{
+ unsigned long tagged;
+ u32 bit, mask;
+ struct rbh_table *table;
+ struct rbh_bucket *buckets[2];
+ int err = 0;
+
+ if (state->mask == 1) {
+ err = rbh_grow_table(rbh, hash, state);
+ } else {
+ u32 buddy = (hash & state->mask) ^ ((state->mask + 1) / 2);
+ if (rcu_dereference(state->table->buckets[buddy]) != bucket)
+ err = rbh_grow_table(rbh, hash, state);
+ }
+ if (err < 0)
+ return err;
+
+ rcu_read_unlock();
+
+ /* XXX: use slab */
+ buckets[0] = kmalloc(sizeof(*bucket), GFP_KERNEL);
+ if (!buckets[0])
+ return -ENOMEM;
+ buckets[1] = kmalloc(sizeof(*bucket), GFP_KERNEL);
+ if (!buckets[1]) {
+ kfree(buckets[0]);
+ return -ENOMEM;
+ }
+
+//printk("%s: adding buckets %p %p for hash %d\n", __func__, buckets[0], buckets[1], hash);
+ rbh_resize_lock(rbh);
+ tagged = rbh->rbh_table;
+ table = (struct rbh_table *)(tagged & (tagged + 1));
+ mask = tagged - (unsigned long)table;
+ hash &= mask;
+ if (rbh_dereference_protected(table->buckets[hash], rbh) != bucket)
+ goto free;
+
+ /* Figure out which entries we need to take to the new bucket */
+ bit = mask + 1;
+ while (bit > 1) {
+ bit /= 2;
+//printk("hash:%d buddy:%d\n", hash, hash ^ bit);
+ if (rbh_dereference_protected(table->buckets[hash ^ bit], rbh)
+ != bucket)
+ break;
+ }
+ bit *= 2;
+ if (bit == mask + 1)
+ goto free;
+
+ spin_lock(&bucket->rbh_lock);
+ bucket_copy(rbh, buckets, bucket, hash, bit);
+
+//printk("hash:%d mask:%d bit:%d\n", hash, mask, bit);
+ hash &= (bit - 1);
+ do {
+//printk("assigning bucket %p to index %d\n", buckets[0], hash);
+ rcu_assign_pointer(table->buckets[hash], buckets[0]);
+ hash += bit;
+//printk("assigning bucket %p to index %d\n", buckets[1], hash);
+ rcu_assign_pointer(table->buckets[hash], buckets[1]);
+ hash += bit;
+ } while (hash < mask);
+//printk("owner:%px\n", bucket->rbh_lock.owner)
+ spin_unlock(&bucket->rbh_lock);
+ rbh_resize_unlock(rbh);
+ kvfree_rcu_mightsleep(bucket);
+
+ return 0;
+free:
+ rbh_resize_unlock(rbh);
+//printk("%s: freeing bucket %p\n", __func__, bucket);
+ kfree(buckets[0]);
+ kfree(buckets[1]);
+
+ return 0;
+}
+
+static int __rbh_insert(struct rbh *rbh, u32 hash, void *p)
+{
+ struct table_mask state;
+ struct rbh_bucket *bucket, *new_bucket;
+ unsigned int i;
+ int err;
+
+restart:
+ rcu_read_lock();
+ bucket = get_bucket(rbh, hash, &state);
+ if (unlikely(!bucket)) {
+ rcu_read_unlock();
+ if (!rbh_first(rbh, hash))
+ return -ENOMEM;
+ goto restart;
+ }
+
+again:
+ if (spin_trylock(&bucket->rbh_lock)) {
+ if (bucket->rbh_contention)
+ bucket->rbh_contention--;
+ } else {
+ spin_lock(&bucket->rbh_lock);
+ /* Numbers chosen ad-hoc */
+ bucket->rbh_contention += 10;
+ if (unlikely(bucket->rbh_contention > 5000)) {
+ spin_unlock(&bucket->rbh_lock);
+ /* OK if this fails; it's only contention */
+ rbh_split_bucket(rbh, bucket, hash, &state);
+
+ bucket = get_bucket(rbh, hash, &state);
+ spin_lock(&bucket->rbh_lock);
+ }
+ }
+
+ new_bucket = get_bucket(rbh, hash, &state);
+ if (bucket != new_bucket) {
+ spin_unlock(&bucket->rbh_lock);
+ bucket = new_bucket;
+ goto again;
+ }
+
+ /* Deleted elements differ in their bottom bit */
+ for (i = 0; i < RBH_ENTRIES; i++) {
+ u32 bhash = bucket->rbh_hashes[i];
+
+ if ((bhash & 1) == (hash & 1))
+ continue;
+ rcu_assign_pointer(bucket->rbh_slots[i], p);
+ /* This array is read under RCU */
+ WRITE_ONCE(bucket->rbh_hashes[i], hash);
+
+ spin_unlock(&bucket->rbh_lock);
+ rcu_read_unlock();
+ return 0;
+ }
+
+ /* No space in this bucket */
+ spin_unlock(&bucket->rbh_lock);
+
+ err = rbh_split_bucket(rbh, bucket, hash, &state);
+ rcu_read_unlock();
+ if (err)
+ return err;
+ goto restart;
+}
+
+/**
+ * rbh_insert - Add a pointer to a rosebush.
+ * @rbh: The rosebush.
+ * @hash: The hash value for this pointer.
+ * @p: The pointer to add.
+ *
+ * Return: 0 on success, -ENOMEM if memory allocation fails,
+ * -EINVAL if @p is NULL.
+ */
+int rbh_insert(struct rbh *rbh, u32 hash, void *p)
+{
+ if (p == NULL)
+ return -EINVAL;
+ return __rbh_insert(rbh, hash, p);
+}
+EXPORT_SYMBOL(rbh_insert);
+
+/**
+ * rbh_remove - Remove a pointer from a rosebush.
+ * @rbh: The rosebush.
+ * @hash: The hash value for this pointer.
+ * @p: The pointer to remove.
+ *
+ * Return: 0 on success, -ENOENT if this pointer could not be found.
+ */
+int rbh_remove(struct rbh *rbh, u32 hash, void *p)
+{
+ struct rbh_bucket *bucket;
+ unsigned int i;
+ int err = -ENOENT;
+
+ rcu_read_lock();
+ bucket = lock_bucket(rbh, hash);
+ if (!bucket)
+ goto rcu_unlock;
+
+ for (i = 0; i < RBH_ENTRIES; i++) {
+ if (bucket->rbh_hashes[i] != hash)
+ continue;
+ if (rcu_dereference_protected(bucket->rbh_slots[i],
+ lockdep_is_held(&bucket->rbh_lock)) != p)
+ continue;
+ bucket->rbh_hashes[i] = ~hash;
+ /* Do not modify the slot */
+ err = 0;
+ break;
+ }
+
+ spin_unlock(&bucket->rbh_lock);
+rcu_unlock:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(rbh_remove);
+
+/**
+ * rbh_reserve - Reserve a slot in a rosebush for later use.
+ * @rbh: The rosebush.
+ * @hash: The hash value that will be used.
+ *
+ * Some callers need to take another lock before inserting an object
+ * into the rosebush. This function reserves space for them to do that.
+ * A subsequent call to rbh_use() will not allocate memory. If you find
+ * that you do not need the reserved space any more, call rbh_remove(),
+ * passing NULL as the pointer.
+ *
+ * Return: 0 on success, -ENOMEM on failure.
+ */
+int rbh_reserve(struct rbh *rbh, u32 hash)
+{
+ return __rbh_insert(rbh, hash, NULL);
+}
+EXPORT_SYMBOL(rbh_reserve);
+
+/**
+ * rbh_use - Use a reserved slot in a rosebush.
+ * @rbh: The rosebush.
+ * @hash: The hash value for this pointer.
+ * @p: The pointer to add.
+ *
+ * Return: 0 on success, -EINVAL if @p is NULL,
+ * -ENOENT if no reserved slot could be found.
+ */
+int rbh_use(struct rbh *rbh, u32 hash, void *p)
+{
+ struct rbh_bucket *bucket;
+ unsigned int i;
+ int err = -ENOENT;
+
+ rcu_read_lock();
+ bucket = lock_bucket(rbh, hash);
+ if (!bucket)
+ goto rcu_unlock;
+
+ for (i = 0; i < RBH_ENTRIES; i++) {
+ if (bucket->rbh_hashes[i] != hash)
+ continue;
+ if (bucket->rbh_slots[i] != NULL)
+ continue;
+ rcu_assign_pointer(bucket->rbh_slots[i], p);
+ err = 0;
+ break;
+ }
+
+ spin_unlock(&bucket->rbh_lock);
+rcu_unlock:
+ rcu_read_unlock();
+ return err;
+}
+EXPORT_SYMBOL(rbh_use);
+
+/**
+ * rbh_next - Find the next entry matching this hash
+ * @rbhi: The rosebush iterator.
+ *
+ * Return: NULL if there are no more matching hash values, otherwise
+ * the next pointer.
+ */
+void *rbh_next(struct rbh_iter *rbhi)
+{
+ struct table_mask state;
+ struct rbh_bucket *bucket = rbhi->bucket;
+ void *p;
+
+ if (!bucket) {
+ bucket = get_bucket(rbhi->rbh, rbhi->hash, &state);
+ if (!bucket)
+ return NULL;
+ rbhi->bucket = bucket;
+ rbhi->index = UINT_MAX;
+ }
+
+ while (++rbhi->index < RBH_ENTRIES) {
+ if (READ_ONCE(bucket->rbh_hashes[rbhi->index]) != rbhi->hash)
+ continue;
+ p = rcu_dereference(bucket->rbh_slots[rbhi->index]);
+ if (p)
+ return p;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(rbh_next);
+
+/**
+ * rbh_destroy - Destroy a rosebush
+ * @rbh: The rosebush to destroy
+ *
+ * The caller must ensure that no other threads will simultaneously
+ * attempt to use the rosebush.
+ */
+void rbh_destroy(struct rbh *rbh)
+{
+ unsigned long tagged = rbh->rbh_table;
+ struct rbh_table *table = (struct rbh_table *)(tagged & (tagged + 1));
+ u32 mask = tagged - (unsigned long)table;
+ u32 i, j, k;
+
+ for (i = 0; i <= mask; i++) {
+ struct rbh_bucket __rcu *bucket;
+
+ bucket = table->buckets[i];
+ if (!bucket)
+ continue;
+ kfree((void __force *)bucket);
+ for (j = (mask + 1) / 2; j > 0; j /= 2) {
+ if (table->buckets[i ^ j] == bucket)
+ continue;
+ j *= 2;
+ break;
+ }
+ k = i;
+ do {
+ table->buckets[k] = NULL;
+ k += j;
+ } while (k < mask && table->buckets[k] == bucket);
+ }
+
+ kfree(table);
+ rbh->rbh_table = 0;
+}
+
+#ifndef __KERNEL__
+static void dump_bucket(struct rbh_bucket *bucket, unsigned bnr)
+{
+ unsigned i;
+
+ printk("bucket:%p index:%d rbh:%p\n", bucket, bnr, bucket->rbh);
+ for (i = 0; i < RBH_ENTRIES; i++) {
+ printk("hash:%x entry:%p (%x)\n", bucket->rbh_hashes[i],
+ bucket->rbh_slots[i], bucket->rbh_hashes[i] * 2 + 1);
+ }
+}
+
+static void rbh_dump(struct rbh *rbh)
+{
+ unsigned long tagged = READ_ONCE(rbh->rbh_table);
+ struct rbh_table *table;
+ u32 mask, i;
+
+ table = (struct rbh_table *)(tagged & (tagged + 1));
+ mask = tagged - (unsigned long)table;
+
+ printk("rosebush %p has %d buckets in table %p\n", rbh, mask + 1, table);
+
+ if (mask == 1) {
+ dump_bucket((struct rbh_bucket *)table, 0);
+ dump_bucket((struct rbh_bucket *)table + 1, 1);
+ } else for (i = 0; i <= mask; i++)
+ dump_bucket(table->buckets[i], i);
+}
+#endif
+
+/*
+ * TODO:
+ * * convert the dcache
+ * * 2 byte hashes in the bucket. Once the table has 2^17 buckets, we can
+ * use 10 bytes per entry instead of 12 (24 entries/bucket instead of 20)
+ * * 1 byte hashes in the bucket. Once the table has 2^25 buckets, we can
+ * use 9 bytes per entry instead of 10 (26 entries/bucket instead of 24)
+ */
--
2.43.0
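
A minimal usage sketch, separate from the patch itself, based on the
documentation and kernel-doc above. DEFINE_ROSEBUSH(), RBH_ITER(),
rbh_insert(), rbh_next() and rbh_remove() are from the patch; struct item,
item_add(), item_find(), item_del() and the choice of hash_long() from
<linux/hash.h> are illustrative assumptions, the only real requirement being
that the caller supplies its own reasonably well-mixed 32-bit hash:

	#include <linux/hash.h>
	#include <linux/rcupdate.h>
	#include <linux/rosebush.h>

	struct item {
		unsigned long key;
		/* ... payload ... */
	};

	static DEFINE_ROSEBUSH(item_table);

	static int item_add(struct item *item)
	{
		return rbh_insert(&item_table, hash_long(item->key, 32), item);
	}

	static struct item *item_find(unsigned long key)
	{
		RBH_ITER(iter, &item_table, hash_long(key, 32));
		struct item *item;

		rcu_read_lock();
		while ((item = rbh_next(&iter)) != NULL) {
			/* Hashes may collide; always check the key itself */
			if (item->key == key)
				break;
		}
		rcu_read_unlock();

		return item;
	}

	static int item_del(struct item *item)
	{
		return rbh_remove(&item_table, hash_long(item->key, 32), item);
	}

As the RCU Iteration section notes, rbh_next() may also return entries whose
hash does not match, so the lookup above re-checks the key before using the
object; and, as with any RCU-protected lookup, the returned pointer is only
guaranteed to stay valid while the RCU read lock is held unless the caller
takes its own reference.
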
* Re: [PATCH v2 2/5] rosebush: Add new data structure
2024-06-25 21:17 ` [PATCH v2 2/5] rosebush: Add new data structure Matthew Wilcox (Oracle)
@ 2024-06-25 23:36 ` Dr. David Alan Gilbert
2024-06-26 13:16 ` Matthew Wilcox
2024-06-26 13:07 ` Matthew Wilcox
` (2 subsequent siblings)
3 siblings, 1 reply; 17+ messages in thread
From: Dr. David Alan Gilbert @ 2024-06-25 23:36 UTC (permalink / raw)
To: Matthew Wilcox (Oracle); +Cc: linux-kernel, linux-fsdevel, maple-tree
* Matthew Wilcox (Oracle) (willy@infradead.org) wrote:
> Rosebush is a resizing hash table. See
> Documentation/core-api/rosebush.rst for details.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
> Documentation/core-api/index.rst | 1 +
> Documentation/core-api/rosebush.rst | 121 +++++
> MAINTAINERS | 8 +
> include/linux/rosebush.h | 62 +++
> lib/Makefile | 2 +-
> lib/rosebush.c | 654 ++++++++++++++++++++++++++++
> 6 files changed, 847 insertions(+), 1 deletion(-)
> create mode 100644 Documentation/core-api/rosebush.rst
> create mode 100644 include/linux/rosebush.h
> create mode 100644 lib/rosebush.c
>
> diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
> index f147854700e4..8984a7d6ad73 100644
> --- a/Documentation/core-api/index.rst
> +++ b/Documentation/core-api/index.rst
> @@ -36,6 +36,7 @@ Library functionality that is used throughout the kernel.
> kobject
> kref
> assoc_array
> + rosebush
> xarray
> maple_tree
> idr
> diff --git a/Documentation/core-api/rosebush.rst b/Documentation/core-api/rosebush.rst
> new file mode 100644
> index 000000000000..f0c0bc690e48
> --- /dev/null
> +++ b/Documentation/core-api/rosebush.rst
> @@ -0,0 +1,121 @@
> +.. SPDX-License-Identifier: GPL-2.0+
> +
> +========
> +Rosebush
> +========
> +
> +:Author: Matthew Wilcox
> +
> +Overview
> +========
> +
> +Rosebush is a hashtable, different from the rhashtable. It is scalable
> +(one spinlock per bucket), resizing in two dimensions (number and size
> +of buckets),
Is that old - I thought the cover letter said v2 had fixed size buckets?
Dave
> and concurrent (can be iterated under the RCU read lock).
> +It is designed to minimise dependent cache misses, which can stall a
> +modern CPU for thousands of instructions.
> +
> +Objects stored in a rosebush do not have an embedded linked list.
> +They can therefore be placed into the same rosebush multiple times and
> +be placed in multiple rosebushes. It is also possible to store pointers
> +which have special meaning like ERR_PTR(). It is not possible to store
> +a NULL pointer in a rosebush, as this is the return value that indicates
> +the iteration has finished.
> +
> +The user of the rosebush is responsible for calculating their own hash.
> +A high quality hash is desirable to keep the scalable properties of
> +the rosebush, but a hash with poor distribution in the lower bits will
> +not lead to a catastrophic breakdown. It may lead to excessive memory
> +consumption and a lot of CPU time spent during lookup.
> +
> +Rosebush is not yet IRQ or BH safe. It can be iterated in interrupt
> +context, but not modified.
> +
> +RCU Iteration
> +-------------
> +
> +There is no rosebush_lookup() function. This is because the hash value
> +may not be unique. Instead, the user should iterate the rosebush,
> +which will return pointers which probably have matching hash values.
> +It is the user's responsibility to determine if the returned pointer is
> +one they are interested in.
> +
> +Rosebush iteration guarantees to return all pointers which have a
> +matching hash, were in the rosebush before the iteration started and
> +remain in the rosebush after iteration ends. It may return additional
> +pointers, including pointers which do not have a matching hash value,
> +but it guarantees not to skip any pointers, and it guarantees to only
> +return pointers which have (at some point) been stored in the rosebush.
> +
> +If the rosebush is modified while the iteration is in progress, newly
> +added entries may or may not be returned and removed entries may or may
> +not be returned. Causality is not honoured; e.g. if Entry A is known
> +to be inserted before Entry B, it is possible for an iteration to return
> +Entry B and not Entry A.
> +
> +Functions and structures
> +========================
> +
> +.. kernel-doc:: include/linux/rosebush.h
> +.. kernel-doc:: lib/rosebush.c
> +
> +Internals
> +=========
> +
> +The rosebush is organised into a top level table which contains pointers
> +to buckets. Each bucket contains a spinlock (for modifications to the
> +bucket), the number of entries in the bucket and a contention counter.
> +
> +The top level table is a power of two in size. The bottom M bits of the
> +hash are used to index into this table. The bucket contains hash values
> +followed by their corresponding pointers. We also track a contention
> +count, which lets us know if this spinlock is overloaded and we should
> +split the bucket to improve scalability.
> +
> +A bucket may be shared between multiple table entries. For simplicity,
> +we require that all buckets are shared between a power-of-two number
> +of slots. For example, a table with 8 entries might have entries that
> +point to buckets A, B, A, B, A, C, A, D. If we were to then split bucket
> +A, we would replace the first pair of pointers with pointers to bucket
> +E and the second pair with pointers to bucket F. This is akin to the
> +buddy page allocator.
> +
> +When we decide that the table needs to be resized, we allocate a new
> +table, and duplicate the current contents of the table into each half
> +of the new table. At this point, all buckets in the table are shared.
> +We then split the bucket that we're trying to insert into.
> +
> +IRQ / BH safety
> +---------------
> +
> +If we decide to make the rosebush modifiable in IRQ context, we need
> +to take the locks in an irq-safe way; we need to figure out how to
> +allocate the top level table without vmalloc(), and we need to manage
> +without kvfree_rcu_mightsleep(). These all have solutions, but those
> +solutions have a cost that isn't worth paying until we have users.
> +
> +Some of those problems go away if we limit our support to removal in IRQ
> +context and only allow insertions in process context (as we do not need
> +to reallocate the table or bucket when removing an item).
> +
> +Small rosebushes
> +----------------
> +
> +As an optimisation, if the rosebush has no entries, the buckets pointer
> +is NULL. If the rosebush has only a few entries, there are only two
> +buckets (allocated as a single allocation) and the table pointer points
> +directly to the first one instead of pointing to a table.
> +
> +Shrinking the rosebush
> +----------------------
> +
> +We do not currently attempt either to join buckets or to shrink the table
> +if sufficiently many buckets are shared. If this proves to be a desirable
> +course of action, we can add support for it, with sufficient hysteresis
> +that adding/removing a single entry will not cause bouncing.
> +
> +Rosebush statistics
> +-------------------
> +
> +It would probably be wise to be able to gather statistics about bucket
> +occupancy rates, but this work has not yet been done.
> diff --git a/MAINTAINERS b/MAINTAINERS
> index a39c237edb95..d4a8e99919a4 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -19467,6 +19467,14 @@ F: include/net/rose.h
> F: include/uapi/linux/rose.h
> F: net/rose/
>
> +ROSEBUSH DATA STRUCTURE
> +M: Matthew Wilcox <willy@infradead.org>
> +L: maple-tree@lists.infradead.org
> +S: Supported
> +F: Documentation/core-api/rosebush.rst
> +F: include/linux/rosebush.h
> +F: lib/*rosebush.c
> +
> ROTATION DRIVER FOR ALLWINNER A83T
> M: Jernej Skrabec <jernej.skrabec@gmail.com>
> L: linux-media@vger.kernel.org
> diff --git a/include/linux/rosebush.h b/include/linux/rosebush.h
> new file mode 100644
> index 000000000000..57f4dfa3f93d
> --- /dev/null
> +++ b/include/linux/rosebush.h
> @@ -0,0 +1,62 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/* See lib/rosebush.c */
> +
> +#ifndef _LINUX_ROSEBUSH_H
> +#define _LINUX_ROSEBUSH_H
> +
> +#include <linux/spinlock.h>
> +
> +/*
> + * Embed this struct in your struct, don't allocate it separately.
> + * None of this is for customers to use; treat it as opaque.
> + * In particular, taking the rbh_resize_lock will prevent only rbh_table
> + * from being reallocated; buckets can still be grown and split without
> + * the lock. But you will get incomprehensible lockdep warnings!
> + */
> +struct rbh {
> + spinlock_t rbh_resize_lock;
> + unsigned long rbh_table; /* A tagged pointer */
> +};
> +
> +#define DEFINE_ROSEBUSH(name) struct rbh name = \
> + { .rbh_resize_lock = __SPIN_LOCK_UNLOCKED(name.lock), }
> +
> +int rbh_insert(struct rbh *rbh, u32 hash, void *p);
> +int rbh_reserve(struct rbh *rbh, u32 hash);
> +int rbh_use(struct rbh *rbh, u32 hash, void *p);
> +int rbh_remove(struct rbh *rbh, u32 hash, void *p);
> +void rbh_destroy(struct rbh *rbh);
> +
> +/**
> + * rbh_release - Release a reserved slot in a rosebush.
> + * @rbh: The rosebush.
> + * @hash: The hash value that was reserved.
> + *
> + * If you decide that you do not need to use a reserved slot, call this
> + * function to release the slot.
> + *
> + * Return: 0 on success, -ENOENT if no reserved slot was found.
> + */
> +
> +static inline int rbh_release(struct rbh *rbh, u32 hash)
> +{
> + return rbh_remove(rbh, hash, NULL);
> +}
> +
> +struct rbh_iter {
> + struct rbh *rbh;
> + struct rbh_bucket *bucket;
> + u32 hash;
> + unsigned int index;
> +};
> +
> +#define RBH_ITER(name, _rbh, _hash) \
> + struct rbh_iter name = { .rbh = _rbh, .hash = _hash, }
> +
> +void *rbh_next(struct rbh_iter *rbhi);
> +
> +void rbh_iter_remove(struct rbh_iter *rbhi, void *p);
> +void rbh_iter_lock(struct rbh_iter *rbhi);
> +void rbh_iter_unlock(struct rbh_iter *rbhi);
> +
> +#endif
> diff --git a/lib/Makefile b/lib/Makefile
> index 3b1769045651..723e6c90b58d 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -28,7 +28,7 @@ CFLAGS_string.o += -fno-stack-protector
> endif
>
> lib-y := ctype.o string.o vsprintf.o cmdline.o \
> - rbtree.o radix-tree.o timerqueue.o xarray.o \
> + rosebush.o rbtree.o radix-tree.o timerqueue.o xarray.o \
> maple_tree.o idr.o extable.o irq_regs.o argv_split.o \
> flex_proportions.o ratelimit.o \
> is_single_threaded.o plist.o decompress.o kobject_uevent.o \
> diff --git a/lib/rosebush.c b/lib/rosebush.c
> new file mode 100644
> index 000000000000..47106a04d11d
> --- /dev/null
> +++ b/lib/rosebush.c
> @@ -0,0 +1,654 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * Rosebush, a resizing bucket hash table
> + * Copyright (c) 2024 Oracle Corporation
> + * Author: Matthew Wilcox <willy@infradead.org>
> + */
> +
> +#include <linux/export.h>
> +#include <linux/rcupdate.h>
> +#include <linux/rosebush.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/stddef.h>
> +#include <linux/vmalloc.h>
> +
> +#include <asm/barrier.h>
> +
> +/*
> + * The lock is held whenever we are modifying the contents of the bucket.
> + * The contention counter tracks whether we need to split the bucket due
> + * to contention on the spinlock.
> + * The bucket is 256 bytes in size (20 * 12 = 240, plus parent, lock, etc)
> + */
> +#ifdef CONFIG_64BIT
> +#define RBH_ENTRIES 20
> +#else
> +#define RBH_ENTRIES 30
> +#endif
> +
> +struct rbh_bucket {
> + u32 rbh_hashes[RBH_ENTRIES];
> + void __rcu *rbh_slots[RBH_ENTRIES];
> + const struct rbh *rbh;
> + spinlock_t rbh_lock;
> + u32 rbh_contention;
> +};
> +
> +struct rbh_table {
> + DECLARE_FLEX_ARRAY(struct rbh_bucket __rcu *, buckets);
> +};
> +
> +struct rbh_initial_table {
> + struct rbh_bucket buckets[2];
> +};
> +
> +/*
> + * As far as lockdep is concerned, all buckets in the same rosebush use
> + * the same lock. We use the classes to distinguish the rbh resize lock
> + * from the bucket locks. The only time we take two bucket locks is
> + * when we already hold the resize lock, so there is no need to define
> + * an order between bucket locks.
> + */
> +#ifdef CONFIG_DEBUG_SPINLOCK
> +#define bucket_lock_init(rbh, bucket) \
> + __raw_spin_lock_init(spinlock_check(&(bucket)->rbh_lock), \
> + "rbh", (rbh)->rbh_resize_lock.dep_map.key, LD_WAIT_SPIN)
> +#else
> +#define bucket_lock_init(rbh, bucket) \
> + spin_lock_init(&(bucket)->rbh_lock)
> +#endif
> +
> +#define rbh_resize_lock(rbh) spin_lock_nested(&(rbh)->rbh_resize_lock, 2)
> +#define rbh_resize_unlock(rbh) spin_unlock(&(rbh)->rbh_resize_lock)
> +
> +struct table_mask {
> + struct rbh_table *table;
> + u32 mask;
> +};
> +
> +/*
> + * A very complicated way of spelling &rbh->bucket[hash].
> + *
> + * The first complication is that we encode the number of buckets
> + * in the pointer so that we can get both in an atomic load.
> + *
> + * The second complication is that small hash tables don't have a top
> + * level table; instead the buckets pointer points to a pair of buckets.
> + */
> +static struct rbh_bucket *get_bucket(struct rbh *rbh, u32 hash,
> + struct table_mask *state)
> +{
> + unsigned long tagged;
> + struct rbh_initial_table *initial_table;
> + unsigned int bnr;
> +
> + /* rcu_dereference(), but not a pointer */
> + tagged = READ_ONCE(rbh->rbh_table);
> + if (!tagged)
> + return NULL;
> +
> + /* The lowest bits indicates how many buckets the table holds */
> + state->table = (struct rbh_table *)(tagged & (tagged + 1));
> + state->mask = tagged - (unsigned long)state->table;
> + bnr = hash & state->mask;
> + if (state->mask != 1)
> + return rcu_dereference(state->table->buckets[bnr]);
> +
> + initial_table = (struct rbh_initial_table *)state->table;
> + return &initial_table->buckets[bnr];
> +}
> +
> +static struct rbh_bucket *lock_bucket(struct rbh *rbh, u32 hash)
> + __acquires(&bucket->rbh_lock)
> +{
> + struct rbh_bucket *bucket, *new_bucket;
> + struct table_mask state;
> +
> + bucket = get_bucket(rbh, hash, &state);
> + if (!bucket)
> + return bucket;
> +again:
> + spin_lock(&bucket->rbh_lock);
> + new_bucket = get_bucket(rbh, hash, &state);
> + if (bucket == new_bucket)
> + return bucket;
> + spin_unlock(&bucket->rbh_lock);
> + bucket = new_bucket;
> + goto again;
> +}
> +
> +static bool rbh_first(struct rbh *rbh, u32 hash)
> +{
> + struct rbh_initial_table *table;
> + int i;
> +
> +//printk("%s: table size %zd\n", __func__, sizeof(*table));
> + table = kmalloc(sizeof(*table), GFP_KERNEL);
> + if (!table)
> + return false;
> +
> + rbh_resize_lock(rbh);
> + if (rbh->rbh_table) {
> + rbh_resize_unlock(rbh);
> + /* Somebody else resized it for us */
> + kfree(table);
> + return true;
> + }
> +
> + bucket_lock_init(rbh, &table->buckets[0]);
> + table->buckets[0].rbh = rbh;
> + table->buckets[0].rbh_contention = 0;
> + bucket_lock_init(rbh, &table->buckets[1]);
> + table->buckets[1].rbh = rbh;
> + table->buckets[1].rbh_contention = 0;
> + for (i = 0; i < RBH_ENTRIES; i++) {
> + table->buckets[0].rbh_hashes[i] = ~0;
> + table->buckets[1].rbh_hashes[i] = 0;
> + }
> + /* rcu_assign_pointer() but not a pointer */
> + smp_store_release(&rbh->rbh_table, (unsigned long)table | 1);
> + rbh_resize_unlock(rbh);
> +
> +//printk("%s: new table = %px\n", __func__, table);
> + return true;
> +}
> +
> +static void copy_initial_buckets(const struct rbh *rbh,
> + struct rbh_table *table, struct rbh_initial_table *init_table)
> + __acquires(&init_table->buckets[0].rbh_lock)
> + __acquires(&init_table->buckets[1].rbh_lock)
> +{
> + struct rbh_bucket *bucket;
> +
> + bucket = (void __force *)table->buckets[0];
> + spin_lock(&init_table->buckets[0].rbh_lock);
> + memcpy(bucket, &init_table->buckets[0], sizeof(init_table->buckets[0]));
> + bucket_lock_init(rbh, bucket);
> +
> + bucket = (void __force *)table->buckets[1];
> + spin_lock_nested(&init_table->buckets[1].rbh_lock, 1);
> + memcpy(bucket, &init_table->buckets[1], sizeof(init_table->buckets[1]));
> + bucket_lock_init(rbh, bucket);
> +}
> +
> +/*
> + * When we grow the table, we duplicate the bucket pointers so this
> + * thread doesn't pay the entire cost of growing the table.
> + */
> +static int rbh_grow_table(struct rbh *rbh, u32 hash, struct table_mask *state)
> + __releases(RCU)
> + __acquires(RCU)
> +{
> + struct rbh_table *table;
> + struct rbh_initial_table *init_table;
> + u32 mask = state->mask;
> + unsigned long tagged = (unsigned long)state->table | mask;
> + size_t size;
> +
> + rcu_read_unlock();
> +
> + size = (mask + 1) * 2 * sizeof(void *);
> + if (size > 4 * PAGE_SIZE)
> + /* XXX: NUMA_NO_NODE doesn't necessarily interleave */
> + table = __vmalloc_node(size, size, GFP_KERNEL, NUMA_NO_NODE,
> + &table);
> + else
> + table = kvmalloc(size, GFP_KERNEL);
> + if (!table) {
> + rcu_read_lock();
> + /* Maybe somebody resized it for us */
> + if (READ_ONCE(rbh->rbh_table) != tagged)
> + return 0;
> + return -ENOMEM;
> + }
> + BUG_ON((unsigned long)table & (size - 1));
> +
> + if (mask == 1) {
> + /* Don't need to bother with RCU until we publish the table */
> + table->buckets[0] = (void __rcu *)kmalloc(sizeof(struct rbh_bucket), GFP_KERNEL);
> + if (!table->buckets[0])
> + goto free_all;
> + table->buckets[1] = (void __rcu *)kmalloc(sizeof(struct rbh_bucket), GFP_KERNEL);
> + if (!table->buckets[1])
> + goto free_all;
> + }
> +
> + rbh_resize_lock(rbh);
> + if (rbh->rbh_table != tagged) {
> + rbh_resize_unlock(rbh);
> + /* Somebody else resized it for us */
> + kvfree(table);
> + rcu_read_lock();
> + return 0;
> + }
> +
> +//printk("%s: replacing table %p with table %p mask %d\n", __func__, state->table, table, mask);
> + if (mask == 1) {
> + init_table = (void *)state->table;
> + copy_initial_buckets(rbh, table, init_table);
> + } else {
> + memcpy(&table->buckets, &state->table->buckets,
> + (mask + 1) * sizeof(void *));
> + }
> + memcpy(&table->buckets[mask + 1], &table->buckets[0],
> + (mask + 1) * sizeof(void *));
> +
> + tagged = ((unsigned long)table) | (mask << 1) | 1;
> + /* rcu_assign_pointer() but not a pointer */
> + smp_store_release(&rbh->rbh_table, tagged);
> + rbh_resize_unlock(rbh);
> + if (mask == 1) {
> + spin_unlock(&init_table->buckets[0].rbh_lock);
> + spin_unlock(&init_table->buckets[1].rbh_lock);
> + }
> + kvfree_rcu_mightsleep(state->table);
> +
> + rcu_read_lock();
> + return 0;
> +free_all:
> + kfree((void __force *)table->buckets[0]);
> + kvfree(table);
> + rcu_read_lock();
> + return -ENOMEM;
> +}
> +
> +static void bucket_copy(const struct rbh *rbh, struct rbh_bucket *buckets[2],
> + const struct rbh_bucket *old_bucket, u32 hash, u32 mask)
> +{
> + unsigned int i;
> + unsigned int js[2] = {0, 0};
> +
> + bucket_lock_init(rbh, buckets[0]);
> + buckets[0]->rbh_contention = 0;
> + buckets[0]->rbh = rbh;
> + bucket_lock_init(rbh, buckets[1]);
> + buckets[1]->rbh_contention = 0;
> + buckets[1]->rbh = rbh;
> + for (i = 0; i < RBH_ENTRIES; i++) {
> + unsigned int nr = !!(old_bucket->rbh_hashes[i] & mask);
> + buckets[nr]->rbh_hashes[js[nr]] = old_bucket->rbh_hashes[i];
> + buckets[nr]->rbh_slots[js[nr]++] = old_bucket->rbh_slots[i];
> + }
> +//printk("%s: bucket:%p copied %d entries from %p hash:%x mask:%x\n", __func__, buckets[0], js[0], old_bucket, hash, mask);
> +//printk("%s: bucket:%p copied %d entries from %p hash:%x mask:%x\n", __func__, buckets[1], js[1], old_bucket, hash, mask);
> +
> + /* Fill the new buckets with deleted entries */
> + while (js[0] < RBH_ENTRIES)
> + buckets[0]->rbh_hashes[js[0]++] = ~hash;
> + while (js[1] < RBH_ENTRIES)
> + buckets[1]->rbh_hashes[js[1]++] = ~hash;
> +}
> +
> +#define rbh_dereference_protected(p, rbh) \
> + rcu_dereference_protected(p, lockdep_is_held(&(rbh)->rbh_resize_lock))
> +
> +static int rbh_split_bucket(struct rbh *rbh, struct rbh_bucket *bucket,
> + u32 hash, struct table_mask *state)
> +{
> + unsigned long tagged;
> + u32 bit, mask;
> + struct rbh_table *table;
> + struct rbh_bucket *buckets[2];
> + int err = 0;
> +
> + if (state->mask == 1) {
> + err = rbh_grow_table(rbh, hash, state);
> + } else {
> + u32 buddy = (hash & state->mask) ^ ((state->mask + 1) / 2);
> + if (rcu_dereference(state->table->buckets[buddy]) != bucket)
> + err = rbh_grow_table(rbh, hash, state);
> + }
> + if (err < 0)
> + return err;
> +
> + rcu_read_unlock();
> +
> + /* XXX: use slab */
> + buckets[0] = kmalloc(sizeof(*bucket), GFP_KERNEL);
> + if (!buckets[0])
> + return -ENOMEM;
> + buckets[1] = kmalloc(sizeof(*bucket), GFP_KERNEL);
> + if (!buckets[1]) {
> + kfree(buckets[0]);
> + return -ENOMEM;
> + }
> +
> +//printk("%s: adding buckets %p %p for hash %d\n", __func__, buckets[0], buckets[1], hash);
> + rbh_resize_lock(rbh);
> + tagged = rbh->rbh_table;
> + table = (struct rbh_table *)(tagged & (tagged + 1));
> + mask = tagged - (unsigned long)table;
> + hash &= mask;
> + if (rbh_dereference_protected(table->buckets[hash], rbh) != bucket)
> + goto free;
> +
> + /* Figure out which entries we need to take to the new bucket */
> + bit = mask + 1;
> + while (bit > 1) {
> + bit /= 2;
> +//printk("hash:%d buddy:%d\n", hash, hash ^ bit);
> + if (rbh_dereference_protected(table->buckets[hash ^ bit], rbh)
> + != bucket)
> + break;
> + }
> + bit *= 2;
> + if (bit == mask + 1)
> + goto free;
> +
> + spin_lock(&bucket->rbh_lock);
> + bucket_copy(rbh, buckets, bucket, hash, bit);
> +
> +//printk("hash:%d mask:%d bit:%d\n", hash, mask, bit);
> + hash &= (bit - 1);
> + do {
> +//printk("assigning bucket %p to index %d\n", buckets[0], hash);
> + rcu_assign_pointer(table->buckets[hash], buckets[0]);
> + hash += bit;
> +//printk("assigning bucket %p to index %d\n", buckets[1], hash);
> + rcu_assign_pointer(table->buckets[hash], buckets[1]);
> + hash += bit;
> + } while (hash < mask);
> +//printk("owner:%px\n", bucket->rbh_lock.owner)
> + spin_unlock(&bucket->rbh_lock);
> + rbh_resize_unlock(rbh);
> + kvfree_rcu_mightsleep(bucket);
> +
> + return 0;
> +free:
> + rbh_resize_unlock(rbh);
> +//printk("%s: freeing bucket %p\n", __func__, bucket);
> + kfree(buckets[0]);
> + kfree(buckets[1]);
> +
> + return 0;
> +}
> +
> +static int __rbh_insert(struct rbh *rbh, u32 hash, void *p)
> +{
> + struct table_mask state;
> + struct rbh_bucket *bucket, *new_bucket;
> + unsigned int i;
> + int err;
> +
> +restart:
> + rcu_read_lock();
> + bucket = get_bucket(rbh, hash, &state);
> + if (unlikely(!bucket)) {
> + rcu_read_unlock();
> + if (!rbh_first(rbh, hash))
> + return -ENOMEM;
> + goto restart;
> + }
> +
> +again:
> + if (spin_trylock(&bucket->rbh_lock)) {
> + if (bucket->rbh_contention)
> + bucket->rbh_contention--;
> + } else {
> + spin_lock(&bucket->rbh_lock);
> + /* Numbers chosen ad-hoc */
> + bucket->rbh_contention += 10;
> + if (unlikely(bucket->rbh_contention > 5000)) {
> + spin_unlock(&bucket->rbh_lock);
> + /* OK if this fails; it's only contention */
> + rbh_split_bucket(rbh, bucket, hash, &state);
> +
> + bucket = get_bucket(rbh, hash, &state);
> + spin_lock(&bucket->rbh_lock);
> + }
> + }
> +
> + new_bucket = get_bucket(rbh, hash, &state);
> + if (bucket != new_bucket) {
> + spin_unlock(&bucket->rbh_lock);
> + bucket = new_bucket;
> + goto again;
> + }
> +
> + /* Deleted elements differ in their bottom bit */
> + for (i = 0; i < RBH_ENTRIES; i++) {
> + u32 bhash = bucket->rbh_hashes[i];
> +
> + if ((bhash & 1) == (hash & 1))
> + continue;
> + rcu_assign_pointer(bucket->rbh_slots[i], p);
> + /* This array is read under RCU */
> + WRITE_ONCE(bucket->rbh_hashes[i], hash);
> +
> + spin_unlock(&bucket->rbh_lock);
> + rcu_read_unlock();
> + return 0;
> + }
> +
> + /* No space in this bucket */
> + spin_unlock(&bucket->rbh_lock);
> +
> + err = rbh_split_bucket(rbh, bucket, hash, &state);
> + rcu_read_unlock();
> + if (err)
> + return err;
> + goto restart;
> +}
> +
> +/**
> + * rbh_insert - Add a pointer to a rosebush.
> + * @rbh: The rosebush.
> + * @hash: The hash value for this pointer.
> + * @p: The pointer to add.
> + *
> + * Return: 0 on success, -ENOMEM if memory allocation fails,
> + * -EINVAL if @p is NULL.
> + */
> +int rbh_insert(struct rbh *rbh, u32 hash, void *p)
> +{
> + if (p == NULL)
> + return -EINVAL;
> + return __rbh_insert(rbh, hash, p);
> +}
> +EXPORT_SYMBOL(rbh_insert);
> +
> +/**
> + * rbh_remove - Remove a pointer from a rosebush.
> + * @rbh: The rosebush.
> + * @hash: The hash value for this pointer.
> + * @p: The pointer to remove.
> + *
> + * Return: 0 on success, -ENOENT if this pointer could not be found.
> + */
> +int rbh_remove(struct rbh *rbh, u32 hash, void *p)
> +{
> + struct rbh_bucket *bucket;
> + unsigned int i;
> + int err = -ENOENT;
> +
> + rcu_read_lock();
> + bucket = lock_bucket(rbh, hash);
> + if (!bucket)
> + goto rcu_unlock;
> +
> + for (i = 0; i < RBH_ENTRIES; i++) {
> + if (bucket->rbh_hashes[i] != hash)
> + continue;
> + if (rcu_dereference_protected(bucket->rbh_slots[i],
> + lockdep_is_held(&bucket->rbh_lock)) != p)
> + continue;
> + bucket->rbh_hashes[i] = ~hash;
> + /* Do not modify the slot */
> + err = 0;
> + break;
> + }
> +
> + spin_unlock(&bucket->rbh_lock);
> +rcu_unlock:
> + rcu_read_unlock();
> + return err;
> +}
> +EXPORT_SYMBOL(rbh_remove);
> +
> +/**
> + * rbh_reserve - Reserve a slot in a rosebush for later use.
> + * @rbh: The rosebush.
> + * @hash: The hash value that will be used.
> + *
> + * Some callers need to take another lock before inserting an object
> + * into the rosebush. This function reserves space for them to do that.
> + * A subsequent call to rbh_use() will not allocate memory. If you find
> + * that you do not need the reserved space any more, call rbh_remove(),
> + * passing NULL as the pointer.
> + *
> + * Return: 0 on success, -ENOMEM on failure.
> + */
> +int rbh_reserve(struct rbh *rbh, u32 hash)
> +{
> + return __rbh_insert(rbh, hash, NULL);
> +}
> +EXPORT_SYMBOL(rbh_reserve);
> +
> +/**
> + * rbh_use - Use a reserved slot in a rosebush.
> + * @rbh: The rosebush.
> + * @hash: The hash value for this pointer.
> + * @p: The pointer to add.
> + *
> + * Return: 0 on success, -EINVAL if @p is NULL,
> + * -ENOENT if no reserved slot could be found.
> + */
> +int rbh_use(struct rbh *rbh, u32 hash, void *p)
> +{
> + struct rbh_bucket *bucket;
> + unsigned int i;
> + int err = -ENOENT;
> +
> + rcu_read_lock();
> + bucket = lock_bucket(rbh, hash);
> + if (!bucket)
> + goto rcu_unlock;
> +
> + for (i = 0; i < RBH_ENTRIES; i++) {
> + if (bucket->rbh_hashes[i] != hash)
> + continue;
> + if (bucket->rbh_slots[i] != NULL)
> + continue;
> + rcu_assign_pointer(bucket->rbh_slots[i], p);
> + err = 0;
> + break;
> + }
> +
> + spin_unlock(&bucket->rbh_lock);
> +rcu_unlock:
> + rcu_read_unlock();
> + return err;
> +}
> +EXPORT_SYMBOL(rbh_use);
> +
> +/**
> + * rbh_next - Find the next entry matching this hash
> + * @rbhi: The rosebush iterator.
> + *
> + * Return: NULL if there are no more matching hash values, otherwise
> + * the next pointer.
> + */
> +void *rbh_next(struct rbh_iter *rbhi)
> +{
> + struct table_mask state;
> + struct rbh_bucket *bucket = rbhi->bucket;
> + void *p;
> +
> + if (!bucket) {
> + bucket = get_bucket(rbhi->rbh, rbhi->hash, &state);
> + if (!bucket)
> + return NULL;
> + rbhi->bucket = bucket;
> + rbhi->index = UINT_MAX;
> + }
> +
> + while (++rbhi->index < RBH_ENTRIES) {
> + if (READ_ONCE(bucket->rbh_hashes[rbhi->index]) != rbhi->hash)
> + continue;
> + p = rcu_dereference(bucket->rbh_slots[rbhi->index]);
> + if (p)
> + return p;
> + }
> +
> + return NULL;
> +}
> +EXPORT_SYMBOL(rbh_next);
> +
> +/**
> + * rbh_destroy - Destroy a rosebush
> + * @rbh: The rosebush to destroy
> + *
> + * The caller must ensure that no other threads will simultaneously
> + * attempt to use the rosebush.
> + */
> +void rbh_destroy(struct rbh *rbh)
> +{
> + unsigned long tagged = rbh->rbh_table;
> + struct rbh_table *table = (struct rbh_table *)(tagged & (tagged + 1));
> + u32 mask = tagged - (unsigned long)table;
> + u32 i, j, k;
> +
> + for (i = 0; i <= mask; i++) {
> + struct rbh_bucket __rcu *bucket;
> +
> + bucket = table->buckets[i];
> + if (!bucket)
> + continue;
> + kfree((void __force *)bucket);
> + for (j = (mask + 1) / 2; j > 0; j /= 2) {
> + if (table->buckets[i ^ j] == bucket)
> + continue;
> + j *= 2;
> + break;
> + }
> + k = i;
> + do {
> + table->buckets[k] = NULL;
> + k += j;
> + } while (k < mask && table->buckets[k] == bucket);
> + }
> +
> + kfree(table);
> + rbh->rbh_table = 0;
> +}
> +
> +#ifndef __KERNEL__
> +static void dump_bucket(struct rbh_bucket *bucket, unsigned bnr)
> +{
> + unsigned i;
> +
> + printk("bucket:%p index:%d rbh:%p\n", bucket, bnr, bucket->rbh);
> + for (i = 0; i < RBH_ENTRIES; i++) {
> + printk("hash:%x entry:%p (%x)\n", bucket->rbh_hashes[i],
> + bucket->rbh_slots[i], bucket->rbh_hashes[i] * 2 + 1);
> + }
> +}
> +
> +static void rbh_dump(struct rbh *rbh)
> +{
> + unsigned long tagged = READ_ONCE(rbh->rbh_table);
> + struct rbh_table *table;
> + u32 mask, i;
> +
> + table = (struct rbh_table *)(tagged & (tagged + 1));
> + mask = tagged - (unsigned long)table;
> +
> + printk("rosebush %p has %d buckets in table %p\n", rbh, mask + 1, table);
> +
> + if (mask == 1) {
> + dump_bucket((struct rbh_bucket *)table, 0);
> + dump_bucket((struct rbh_bucket *)table + 1, 1);
> + } else for (i = 0; i <= mask; i++)
> + dump_bucket(table->buckets[i], i);
> +}
> +#endif
> +
> +/*
> + * TODO:
> + * * convert the dcache
> + * * 2 byte hashes in the bucket. Once the table has 2^17 buckets, we can
> + * use 10 bytes per entry instead of 12 (24 entries/bucket instead of 20)
> + * * 1 byte hashes in the bucket. Once the table has 2^25 buckets, we can
> + * use 9 bytes per entry instead of 10 (26 entries/bucket instead of 24)
> + */
> --
> 2.43.0
>
>
--
-----Open up your eyes, open up your mind, open up your code -------
/ Dr. David Alan Gilbert | Running GNU/Linux | Happy \
\ dave @ treblig.org | | In Hex /
\ _________________________|_____ http://www.treblig.org |_______/
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH v2 2/5] rosebush: Add new data structure
2024-06-25 23:36 ` Dr. David Alan Gilbert
@ 2024-06-26 13:16 ` Matthew Wilcox
0 siblings, 0 replies; 17+ messages in thread
From: Matthew Wilcox @ 2024-06-26 13:16 UTC (permalink / raw)
To: Dr. David Alan Gilbert; +Cc: linux-kernel, linux-fsdevel, maple-tree
On Tue, Jun 25, 2024 at 11:36:06PM +0000, Dr. David Alan Gilbert wrote:
> > +Overview
> > +========
> > +
> > +Rosebush is a hashtable, different from the rhashtable. It is scalable
> > +(one spinlock per bucket), resizing in two dimensions (number and size
> > +of buckets),
>
> Is that old - I thought the cover letter said v2 had fixed size buckets?
Thanks.
Rosebush is a hashtable, different from the rhashtable. It is scalable
-(one spinlock per bucket), resizing in two dimensions (number and size
-of buckets), and concurrent (can be iterated under the RCU read lock).
-It is designed to minimise dependent cache misses, which can stall a
-modern CPU for thousands of instructions.
+(one spinlock per bucket), resizing (number of buckets), and concurrent
+(can be iterated under the RCU read lock). It is designed to minimise
+dependent cache misses, which can stall a modern CPU for thousands
+of instructions.
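(For readers skimming the thread, "can be iterated under the RCU read lock"
means usage roughly like the following. This is only a sketch built on the
RBH_ITER()/rbh_next() API from patch 2; my_rbh, key and keys_equal() are
made-up placeholders.)

	static DEFINE_ROSEBUSH(my_rbh);

	void *my_lookup(u32 hash, const void *key)
	{
		RBH_ITER(iter, &my_rbh, hash);
		void *p;

		rcu_read_lock();
		while ((p = rbh_next(&iter)) != NULL) {
			/* Every entry returned already has the requested hash;
			 * the caller still compares the full key. */
			if (keys_equal(p, key))
				break;
		}
		rcu_read_unlock();

		return p;
	}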
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH v2 2/5] rosebush: Add new data structure
2024-06-25 21:17 ` [PATCH v2 2/5] rosebush: Add new data structure Matthew Wilcox (Oracle)
2024-06-25 23:36 ` Dr. David Alan Gilbert
@ 2024-06-26 13:07 ` Matthew Wilcox
2024-06-29 19:45 ` Markus Elfring
2024-07-02 1:04 ` [PATCH v2 " Bagas Sanjaya
3 siblings, 0 replies; 17+ messages in thread
From: Matthew Wilcox @ 2024-06-26 13:07 UTC (permalink / raw)
To: linux-kernel; +Cc: linux-fsdevel, maple-tree
On Tue, Jun 25, 2024 at 10:17:57PM +0100, Matthew Wilcox (Oracle) wrote:
> Rosebush is a resizing hash table. See
> Documentation/core-api/rosebush.rst for details.
I thought I had more debugging enabled than I actually did, and
there's some unbalanced RCU locking. I'll fold this fix in:
diff --git a/lib/rosebush.c b/lib/rosebush.c
index 47106a04d11d..ab2d314cecec 100644
--- a/lib/rosebush.c
+++ b/lib/rosebush.c
@@ -305,13 +305,14 @@ static int rbh_split_bucket(struct rbh *rbh, struct rbh_bucket *bucket,
rcu_read_unlock();
/* XXX: use slab */
+ err = -ENOMEM;
buckets[0] = kmalloc(sizeof(*bucket), GFP_KERNEL);
if (!buckets[0])
- return -ENOMEM;
+ goto nomem;
buckets[1] = kmalloc(sizeof(*bucket), GFP_KERNEL);
if (!buckets[1]) {
kfree(buckets[0]);
- return -ENOMEM;
+ goto nomem;
}
//printk("%s: adding buckets %p %p for hash %d\n", __func__, buckets[0], buckets[1], hash);
@@ -320,6 +321,8 @@ static int rbh_split_bucket(struct rbh *rbh, struct rbh_bucket *bucket,
table = (struct rbh_table *)(tagged & (tagged + 1));
mask = tagged - (unsigned long)table;
hash &= mask;
+
+ err = 0;
if (rbh_dereference_protected(table->buckets[hash], rbh) != bucket)
goto free;
@@ -354,14 +357,17 @@ static int rbh_split_bucket(struct rbh *rbh, struct rbh_bucket *bucket,
rbh_resize_unlock(rbh);
kvfree_rcu_mightsleep(bucket);
+ rcu_read_lock();
return 0;
free:
rbh_resize_unlock(rbh);
//printk("%s: freeing bucket %p\n", __func__, bucket);
- kfree(buckets[0]);
kfree(buckets[1]);
+nomem:
+ kfree(buckets[0]);
- return 0;
+ rcu_read_lock();
+ return err;
}
static int __rbh_insert(struct rbh *rbh, u32 hash, void *p)
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH v2 2/5] rosebush: Add new data structure
2024-06-25 21:17 ` [PATCH v2 2/5] rosebush: Add new data structure Matthew Wilcox (Oracle)
2024-06-25 23:36 ` Dr. David Alan Gilbert
2024-06-26 13:07 ` Matthew Wilcox
@ 2024-06-29 19:45 ` Markus Elfring
2024-07-01 1:32 ` Matthew Wilcox
2024-07-02 1:04 ` [PATCH v2 " Bagas Sanjaya
3 siblings, 1 reply; 17+ messages in thread
From: Markus Elfring @ 2024-06-29 19:45 UTC (permalink / raw)
To: Matthew Wilcox, LKML; +Cc: linux-fsdevel, maple-tree
…
> +++ b/lib/rosebush.c
> @@ -0,0 +1,654 @@
…
> +int rbh_remove(struct rbh *rbh, u32 hash, void *p)
> +{
…
> + rcu_read_lock();
> + bucket = lock_bucket(rbh, hash);
…
> +rcu_unlock:
> + rcu_read_unlock();
> + return err;
> +}
…
Under which circumstances would you be interested in applying a statement
like “guard(rcu)();”?
https://elixir.bootlin.com/linux/v6.10-rc5/source/include/linux/rcupdate.h#L1093
Regards,
Markus
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH v2 2/5] rosebush: Add new data structure
2024-06-29 19:45 ` Markus Elfring
@ 2024-07-01 1:32 ` Matthew Wilcox
2024-07-01 5:21 ` [v2 " Markus Elfring
0 siblings, 1 reply; 17+ messages in thread
From: Matthew Wilcox @ 2024-07-01 1:32 UTC (permalink / raw)
To: Markus Elfring; +Cc: LKML, linux-fsdevel, maple-tree
On Sat, Jun 29, 2024 at 09:45:46PM +0200, Markus Elfring wrote:
> Under which circumstances would you be interested in applying a statement
> like “guard(rcu)();”?
Under no circumstances.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [v2 2/5] rosebush: Add new data structure
2024-07-01 1:32 ` Matthew Wilcox
@ 2024-07-01 5:21 ` Markus Elfring
2024-07-01 14:18 ` Paul E. McKenney
0 siblings, 1 reply; 17+ messages in thread
From: Markus Elfring @ 2024-07-01 5:21 UTC (permalink / raw)
To: Matthew Wilcox, kernel-janitors, Boqun Feng, Johannes Berg,
Peter Zijlstra, Paul E. McKenney, Uladzislau Rezki
Cc: LKML, linux-fsdevel, maple-tree
>> Under which circumstances would you be interested in applying a statement
>> like “guard(rcu)();”?
>
> Under no circumstances.
I imagine that further contributors would like to discuss this kind of collateral
evolution, given the kernel's support for scope-based resource management.
https://elixir.bootlin.com/linux/v6.10-rc6/source/include/linux/rcupdate.h#L1093
See also the commit 80cd613a9ae091dbf52e27a409d58da988ffc8f3 ("rcu:
Mollify sparse with RCU guard") from 2024-04-15.
Regards,
Markus
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [v2 2/5] rosebush: Add new data structure
2024-07-01 5:21 ` [v2 " Markus Elfring
@ 2024-07-01 14:18 ` Paul E. McKenney
0 siblings, 0 replies; 17+ messages in thread
From: Paul E. McKenney @ 2024-07-01 14:18 UTC (permalink / raw)
To: Markus Elfring
Cc: Matthew Wilcox, kernel-janitors, Boqun Feng, Johannes Berg,
Peter Zijlstra, Uladzislau Rezki, LKML, linux-fsdevel, maple-tree
On Mon, Jul 01, 2024 at 07:21:18AM +0200, Markus Elfring wrote:
> >> Under which circumstances would you be interested in applying a statement
> >> like “guard(rcu)();”?
> >
> > Under no circumstances.
>
> I imagine that further contributors would like to discuss this kind of collateral
> evolution, given the kernel's support for scope-based resource management.
> https://elixir.bootlin.com/linux/v6.10-rc6/source/include/linux/rcupdate.h#L1093
>
> See also the commit 80cd613a9ae091dbf52e27a409d58da988ffc8f3 ("rcu:
> Mollify sparse with RCU guard") from 2024-04-15.
Although the guard(rcu)() statement is very helpful in some circumstances
and is seeing increasing use, it is not free of downsides in a number
of situations. For but one example, Matthew might expect that partially
overlapping critical sections will be needed, which would rule out use of
guards on one or the other of those two critical sections.
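To make that concrete: after the fix earlier in this thread, rbh_split_bucket()
is entered under rcu_read_lock(), drops it around the GFP_KERNEL allocations and
re-enters it before returning, so the read-side sections overlap the function
boundary rather than nesting inside one scope. A rough sketch of that shape
(an illustration only, not code from the series):

	static int split_like_helper(void)
	{
		void *new;

		rcu_read_unlock();		/* end the caller's read-side section */
		new = kmalloc(64, GFP_KERNEL);	/* may sleep, so RCU must not be held */
		rcu_read_lock();		/* hand a fresh section back to the caller */
		if (!new)
			return -ENOMEM;
		kfree(new);			/* a real helper would publish it instead */
		return 0;
	}

A guard(rcu)() is tied to the scope that declares it, so it cannot end one
critical section part-way through a function and begin another that outlives
the function in this way.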
Thanx, Paul
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH v2 2/5] rosebush: Add new data structure
2024-06-25 21:17 ` [PATCH v2 2/5] rosebush: Add new data structure Matthew Wilcox (Oracle)
` (2 preceding siblings ...)
2024-06-29 19:45 ` Markus Elfring
@ 2024-07-02 1:04 ` Bagas Sanjaya
3 siblings, 0 replies; 17+ messages in thread
From: Bagas Sanjaya @ 2024-07-02 1:04 UTC (permalink / raw)
To: Matthew Wilcox (Oracle), linux-kernel; +Cc: linux-fsdevel, maple-tree
On Tue, Jun 25, 2024 at 10:17:57PM +0100, Matthew Wilcox (Oracle) wrote:
> Rosebush is a resizing hash table. See
> Documentation/core-api/rosebush.rst for details.
What about "Document Rosebush hash table - overview, implementation details/internals, and API
docs"?
> +Overview
> +========
> +
> +Rosebush is a hashtable, different from the rhashtable. It is scalable
> +(one spinlock per bucket), resizing in two dimensions (number and size
> +of buckets), and concurrent (can be iterated under the RCU read lock).
> +It is designed to minimise dependent cache misses, which can stall a
> +modern CPU for thousands of instructions.
> +
> +Objects stored in a rosebush do not have an embedded linked list.
> +They can therefore be placed into the same rosebush multiple times and
> +be placed in multiple rosebushes. It is also possible to store pointers
> +which have special meaning like ERR_PTR(). It is not possible to store
> +a NULL pointer in a rosebush, as this is the return value that indicates
"..., however, as this ..."
> +the iteration has finished.
> +
> +The user of the rosebush is responsible for calculating their own hash.
> +A high quality hash is desirable to keep the scalable properties of
> +the rosebush, but a hash with poor distribution in the lower bits will
> +not lead to a catastrophic breakdown. It may lead to excessive memory
"..., but rather it may lead to ..."
What does catastrophic breakdown mean anyway?
> +consumption and a lot of CPU time spent during lookup.
> +
> +Rosebush is not yet IRQ or BH safe. It can be iterated in interrupt
"This means that ..."
> +context, but not modified.
> +
> <snipped>...
> +IRQ / BH safety
> +---------------
> +
> +If we decide to make the rosebush modifiable in IRQ context, we need
> +to take the locks in an irq-safe way; we need to figure out how to
> +allocate the top level table without vmalloc(), and we need to manage
> +without kvfree_rcu_mightsleep(). These all have solutions, but those
> +solutions have a cost that isn't worth paying until we have users.
Use cases?
Thanks.
--
An old man doll... just what I always wanted! - Clara
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH v2 3/5] rosebush: Add test suite
2024-06-25 21:17 [PATCH v2 0/5] Rosebush, a new hash table Matthew Wilcox (Oracle)
2024-06-25 21:17 ` [PATCH v2 1/5] tools: Add kernel stubs needed for rosebush Matthew Wilcox (Oracle)
2024-06-25 21:17 ` [PATCH v2 2/5] rosebush: Add new data structure Matthew Wilcox (Oracle)
@ 2024-06-25 21:17 ` Matthew Wilcox (Oracle)
2024-06-28 15:18 ` Pankaj Raghav (Samsung)
2024-06-29 5:13 ` Jeff Johnson
2024-06-25 21:17 ` [PATCH v2 4/5] tools: Add support for running rosebush tests in userspace Matthew Wilcox (Oracle)
2024-06-25 21:18 ` [PATCH v2 5/5] dcache: Convert to use rosebush Matthew Wilcox (Oracle)
4 siblings, 2 replies; 17+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-06-25 21:17 UTC (permalink / raw)
To: linux-kernel; +Cc: Matthew Wilcox (Oracle), linux-fsdevel, maple-tree
This is not a very sophisticated test suite yet, but it helped find
a few bugs and provides a framework for adding more tests as more
bugs are found.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
lib/Kconfig.debug | 3 +
lib/Makefile | 1 +
lib/test_rosebush.c | 140 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 144 insertions(+)
create mode 100644 lib/test_rosebush.c
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 59b6765d86b8..f3cfd79d8dbd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2447,6 +2447,9 @@ config TEST_RHASHTABLE
If unsure, say N.
+config TEST_ROSEBUSH
+ tristate "Test the Rosebush data structure"
+
config TEST_IDA
tristate "Perform selftest on IDA functions"
diff --git a/lib/Makefile b/lib/Makefile
index 723e6c90b58d..de4edefc2c11 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -76,6 +76,7 @@ obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o
obj-$(CONFIG_TEST_MIN_HEAP) += test_min_heap.o
obj-$(CONFIG_TEST_LKM) += test_module.o
obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o
+obj-$(CONFIG_TEST_ROSEBUSH) += test_rosebush.o
obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
obj-$(CONFIG_TEST_SORT) += test_sort.o
obj-$(CONFIG_TEST_USER_COPY) += test_user_copy.o
diff --git a/lib/test_rosebush.c b/lib/test_rosebush.c
new file mode 100644
index 000000000000..59c342e7a5b3
--- /dev/null
+++ b/lib/test_rosebush.c
@@ -0,0 +1,140 @@
+#include <linux/rosebush.h>
+#include <kunit/test.h>
+
+static void iter_rbh(struct kunit *test, struct rbh *rbh, u32 hash, void *p)
+{
+ RBH_ITER(iter, rbh, hash);
+ void *q;
+
+ rcu_read_lock();
+ q = rbh_next(&iter);
+ KUNIT_EXPECT_PTR_EQ_MSG(test, p, q,
+ "rbh_next hash:%u returned %px, expected %px", hash, q, p);
+ q = rbh_next(&iter);
+ KUNIT_EXPECT_PTR_EQ_MSG(test, NULL, q,
+ "rbh_next hash:%u returned %px, expected NULL", hash, q);
+ rcu_read_unlock();
+}
+
+static void check_empty_rbh(struct kunit *test, struct rbh *rbh)
+{
+ iter_rbh(test, rbh, 0, NULL);
+ iter_rbh(test, rbh, 1, NULL);
+ iter_rbh(test, rbh, 17, NULL);
+ iter_rbh(test, rbh, 42, NULL);
+}
+
+static void test_insert(struct kunit *test, struct rbh *rbh, u32 hash)
+{
+ void *p = (void *)((hash << 1) | 1UL);
+ int err;
+
+ err = rbh_insert(rbh, hash, p);
+ KUNIT_EXPECT_EQ(test, err, 0);
+
+ iter_rbh(test, rbh, hash, p);
+}
+
+static void test_reserve(struct kunit *test, struct rbh *rbh, u32 hash)
+{
+ int err;
+
+ err = rbh_reserve(rbh, hash);
+ KUNIT_EXPECT_EQ(test, err, 0);
+
+ iter_rbh(test, rbh, hash, NULL);
+}
+
+static void test_use(struct kunit *test, struct rbh *rbh, u32 hash)
+{
+ void *p = (void *)((hash << 1) | 1UL);
+ int err;
+
+ err = rbh_use(rbh, hash, p);
+ KUNIT_EXPECT_EQ(test, err, 0);
+
+ iter_rbh(test, rbh, hash, p);
+}
+
+static void test_remove(struct kunit *test, struct rbh *rbh, u32 hash)
+{
+ void *p = (void *)((hash << 1) | 1UL);
+ int err;
+
+ err = rbh_remove(rbh, hash, p);
+ KUNIT_EXPECT_EQ(test, err, 0);
+
+ iter_rbh(test, rbh, hash, NULL);
+}
+
+static DEFINE_ROSEBUSH(rosebush);
+
+/*
+ * Conduct a number of tests on a rosebush that has never been used.
+ * They should all return NULL or an errno. We're looking for crashes
+ * here.
+ */
+static void empty(struct kunit *test)
+{
+ int err;
+
+ check_empty_rbh(test, &rosebush);
+ err = rbh_remove(&rosebush, 0, test);
+ KUNIT_EXPECT_EQ(test, err, -ENOENT);
+ err = rbh_use(&rosebush, 0, test);
+ KUNIT_EXPECT_EQ(test, err, -ENOENT);
+ KUNIT_EXPECT_EQ(test, rosebush.rbh_table, 0);
+}
+
+static void first(struct kunit *test)
+{
+ int err;
+
+ test_insert(test, &rosebush, 5);
+ check_empty_rbh(test, &rosebush);
+ test_remove(test, &rosebush, 5);
+ check_empty_rbh(test, &rosebush);
+
+ err = rbh_remove(&rosebush, 5, NULL);
+ KUNIT_EXPECT_EQ(test, err, -ENOENT);
+ test_reserve(test, &rosebush, 5);
+ err = rbh_remove(&rosebush, 5, test);
+ KUNIT_EXPECT_EQ(test, err, -ENOENT);
+ err = rbh_remove(&rosebush, 5, NULL);
+ KUNIT_EXPECT_EQ(test, err, 0);
+ err = rbh_remove(&rosebush, 5, NULL);
+ KUNIT_EXPECT_EQ(test, err, -ENOENT);
+
+ test_reserve(test, &rosebush, 5);
+ test_use(test, &rosebush, 5);
+ err = rbh_remove(&rosebush, 5, NULL);
+ KUNIT_EXPECT_EQ(test, err, -ENOENT);
+ test_remove(test, &rosebush, 5);
+}
+
+static void grow(struct kunit *test)
+{
+ int i;
+
+ for (i = 3; i < 3333; i += 2)
+ test_insert(test, &rosebush, i);
+
+ rbh_destroy(&rosebush);
+}
+
+static struct kunit_case rosebush_cases[] __refdata = {
+ KUNIT_CASE(empty),
+ KUNIT_CASE(first),
+ KUNIT_CASE(grow),
+ {}
+};
+
+static struct kunit_suite rosebush_suite = {
+ .name = "rosebush",
+ .test_cases = rosebush_cases,
+};
+
+kunit_test_suite(rosebush_suite);
+
+MODULE_AUTHOR("Matthew Wilcox (Oracle) <willy@infradead.org>");
+MODULE_LICENSE("GPL");
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH v2 3/5] rosebush: Add test suite
2024-06-25 21:17 ` [PATCH v2 3/5] rosebush: Add test suite Matthew Wilcox (Oracle)
@ 2024-06-28 15:18 ` Pankaj Raghav (Samsung)
2024-06-29 5:13 ` Jeff Johnson
1 sibling, 0 replies; 17+ messages in thread
From: Pankaj Raghav (Samsung) @ 2024-06-28 15:18 UTC (permalink / raw)
To: Matthew Wilcox (Oracle); +Cc: linux-kernel, linux-fsdevel, maple-tree
On Tue, Jun 25, 2024 at 10:17:58PM +0100, Matthew Wilcox (Oracle) wrote:
> This is not a very sophisticated test suite yet, but it helped find
> a few bugs and provides a framework for adding more tests as more
> bugs are found.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
> lib/Kconfig.debug | 3 +
> lib/Makefile | 1 +
> lib/test_rosebush.c | 140 ++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 144 insertions(+)
> create mode 100644 lib/test_rosebush.c
>
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 59b6765d86b8..f3cfd79d8dbd 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -2447,6 +2447,9 @@ config TEST_RHASHTABLE
>
> If unsure, say N.
>
> +config TEST_ROSEBUSH
> + tristate "Test the Rosebush data structure"
> +
This needs a `depends on KUNIT`.
And compiling this test as a module results in rbh_destroy not being found. I
think you missed an EXPORT_SYMBOL of rbh_destroy in the previous patch.
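Something like this is what I have in mind (untested, just to show the shape),
plus an EXPORT_SYMBOL(rbh_destroy) next to the function in lib/rosebush.c:

config TEST_ROSEBUSH
	tristate "Test the Rosebush data structure"
	depends on KUNIT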
> config TEST_IDA
> tristate "Perform selftest on IDA functions"
>
> +
> +static void check_empty_rbh(struct kunit *test, struct rbh *rbh)
> +{
> + iter_rbh(test, rbh, 0, NULL);
> + iter_rbh(test, rbh, 1, NULL);
> + iter_rbh(test, rbh, 17, NULL);
> + iter_rbh(test, rbh, 42, NULL);
> +}
Do these hashes hold any particular significance, given that you test them so
often after an insert?
--
Pankaj
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH v2 3/5] rosebush: Add test suite
2024-06-25 21:17 ` [PATCH v2 3/5] rosebush: Add test suite Matthew Wilcox (Oracle)
2024-06-28 15:18 ` Pankaj Raghav (Samsung)
@ 2024-06-29 5:13 ` Jeff Johnson
1 sibling, 0 replies; 17+ messages in thread
From: Jeff Johnson @ 2024-06-29 5:13 UTC (permalink / raw)
To: Matthew Wilcox (Oracle), linux-kernel; +Cc: linux-fsdevel, maple-tree
On 6/25/24 14:17, Matthew Wilcox (Oracle) wrote:
> This is not a very sophisticated test suite yet, but it helped find
> a few bugs and provides a framework for adding more tests as more
> bugs are found.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
...
> +static struct kunit_suite rosebush_suite = {
> + .name = "rosebush",
> + .test_cases = rosebush_cases,
> +};
> +
> +kunit_test_suite(rosebush_suite);
> +
> +MODULE_AUTHOR("Matthew Wilcox (Oracle) <willy@infradead.org>");
> +MODULE_LICENSE("GPL");
make W=1 will warn if there isn't also a MODULE_DESCRIPTION()
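For example (the wording is only a suggestion):

MODULE_DESCRIPTION("KUnit test of the Rosebush hash table");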
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH v2 4/5] tools: Add support for running rosebush tests in userspace
2024-06-25 21:17 [PATCH v2 0/5] Rosebush, a new hash table Matthew Wilcox (Oracle)
` (2 preceding siblings ...)
2024-06-25 21:17 ` [PATCH v2 3/5] rosebush: Add test suite Matthew Wilcox (Oracle)
@ 2024-06-25 21:17 ` Matthew Wilcox (Oracle)
2024-06-29 5:15 ` Jeff Johnson
2024-06-25 21:18 ` [PATCH v2 5/5] dcache: Convert to use rosebush Matthew Wilcox (Oracle)
4 siblings, 1 reply; 17+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-06-25 21:17 UTC (permalink / raw)
To: linux-kernel; +Cc: Matthew Wilcox (Oracle), linux-fsdevel, maple-tree
Enable make -C tools/testing/radix-tree. Much easier to debug than
an in-kernel module.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
tools/include/linux/rosebush.h | 1 +
tools/testing/radix-tree/.gitignore | 1 +
tools/testing/radix-tree/Makefile | 6 ++++-
tools/testing/radix-tree/kunit/test.h | 20 +++++++++++++++
tools/testing/radix-tree/rosebush.c | 36 +++++++++++++++++++++++++++
5 files changed, 63 insertions(+), 1 deletion(-)
create mode 100644 tools/include/linux/rosebush.h
create mode 100644 tools/testing/radix-tree/kunit/test.h
create mode 100644 tools/testing/radix-tree/rosebush.c
diff --git a/tools/include/linux/rosebush.h b/tools/include/linux/rosebush.h
new file mode 100644
index 000000000000..3f12f4288250
--- /dev/null
+++ b/tools/include/linux/rosebush.h
@@ -0,0 +1 @@
+#include "../../../include/linux/rosebush.h"
diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore
index 49bccb90c35b..fb154f26bdab 100644
--- a/tools/testing/radix-tree/.gitignore
+++ b/tools/testing/radix-tree/.gitignore
@@ -9,3 +9,4 @@ radix-tree.c
xarray
maple
ma_xa_benchmark
+rosebush
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index 7527f738b4a1..982ff4b7fdeb 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -4,7 +4,7 @@ CFLAGS += -I. -I../../include -I../../../lib -g -Og -Wall \
-D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined
LDFLAGS += -fsanitize=address -fsanitize=undefined
LDLIBS+= -lpthread -lurcu
-TARGETS = main idr-test multiorder xarray maple
+TARGETS = main idr-test multiorder xarray maple rosebush
CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o \
slab.o maple.o
OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \
@@ -36,6 +36,8 @@ xarray: $(CORE_OFILES)
maple: $(CORE_OFILES)
+rosebush: $(CORE_OFILES)
+
multiorder: multiorder.o $(CORE_OFILES)
clean:
@@ -62,6 +64,8 @@ xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c
maple.o: ../../../lib/maple_tree.c ../../../lib/test_maple_tree.c
+rosebush.o: ../../../lib/rosebush.c ../../../lib/test_rosebush.c
+
generated/map-shift.h:
@if ! grep -qws $(SHIFT) generated/map-shift.h; then \
echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \
diff --git a/tools/testing/radix-tree/kunit/test.h b/tools/testing/radix-tree/kunit/test.h
new file mode 100644
index 000000000000..0805e3695762
--- /dev/null
+++ b/tools/testing/radix-tree/kunit/test.h
@@ -0,0 +1,20 @@
+struct kunit {
+};
+
+struct kunit_case {
+ void (*run_case)(struct kunit *test);
+};
+
+struct kunit_suite {
+ char *name;
+ struct kunit_case *test_cases;
+};
+
+#define KUNIT_CASE(test_name) { .run_case = test_name, }
+#define kunit_test_suite(x)
+
+#define KUNIT_EXPECT_EQ(test, left, right) \
+ KUNIT_EXPECT_PTR_EQ_MSG(test, left, right, NULL)
+#define KUNIT_EXPECT_PTR_EQ_MSG(test, left, right, fmt, ...) \
+ assert(left == right)
+
diff --git a/tools/testing/radix-tree/rosebush.c b/tools/testing/radix-tree/rosebush.c
new file mode 100644
index 000000000000..51703737833e
--- /dev/null
+++ b/tools/testing/radix-tree/rosebush.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * rosebush.c: Userspace testing for rosebush test-suite
+ * Copyright (c) 2024 Oracle Corporation
+ * Author: Matthew Wilcox <willy@infradead.org>
+ */
+
+#include "test.h"
+#include <stdlib.h>
+#include <time.h>
+
+#define module_init(x)
+#define module_exit(x)
+#define MODULE_AUTHOR(x)
+#define MODULE_LICENSE(x)
+#define dump_stack() assert(0)
+
+#include "../../../lib/rosebush.c"
+#include "../../../lib/test_rosebush.c"
+
+int __weak main(void)
+{
+ struct kunit test;
+ int i;
+
+ assert(rosebush_suite.test_cases == rosebush_cases);
+
+ for (i = 0; i < ARRAY_SIZE(rosebush_cases); i++) {
+ if (!rosebush_cases[i].run_case)
+ continue;
+ printf("i = %d %p\n", i, rosebush_cases[i].run_case);
+ rosebush_cases[i].run_case(&test);
+ }
+
+ return 0;
+}
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH v2 4/5] tools: Add support for running rosebush tests in userspace
2024-06-25 21:17 ` [PATCH v2 4/5] tools: Add support for running rosebush tests in userspace Matthew Wilcox (Oracle)
@ 2024-06-29 5:15 ` Jeff Johnson
0 siblings, 0 replies; 17+ messages in thread
From: Jeff Johnson @ 2024-06-29 5:15 UTC (permalink / raw)
To: Matthew Wilcox (Oracle), linux-kernel; +Cc: linux-fsdevel, maple-tree
On 6/25/24 14:17, Matthew Wilcox (Oracle) wrote:
> Enable make -C tools/testing/radix-tree. Much easier to debug than
> an in-kernel module.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
> tools/include/linux/rosebush.h | 1 +
> tools/testing/radix-tree/.gitignore | 1 +
> tools/testing/radix-tree/Makefile | 6 ++++-
> tools/testing/radix-tree/kunit/test.h | 20 +++++++++++++++
> tools/testing/radix-tree/rosebush.c | 36 +++++++++++++++++++++++++++
> 5 files changed, 63 insertions(+), 1 deletion(-)
> create mode 100644 tools/include/linux/rosebush.h
> create mode 100644 tools/testing/radix-tree/kunit/test.h
> create mode 100644 tools/testing/radix-tree/rosebush.c
...
> +#define MODULE_AUTHOR(x)
> +#define MODULE_LICENSE(x)
don't forget to #define MODULE_DESCRIPTION() here if/when you add it to
the module
^ permalink raw reply [flat|nested] 17+ messages in thread
* [PATCH v2 5/5] dcache: Convert to use rosebush
2024-06-25 21:17 [PATCH v2 0/5] Rosebush, a new hash table Matthew Wilcox (Oracle)
` (3 preceding siblings ...)
2024-06-25 21:17 ` [PATCH v2 4/5] tools: Add support for running rosebush tests in userspace Matthew Wilcox (Oracle)
@ 2024-06-25 21:18 ` Matthew Wilcox (Oracle)
4 siblings, 0 replies; 17+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-06-25 21:18 UTC (permalink / raw)
To: linux-kernel; +Cc: Matthew Wilcox (Oracle), linux-fsdevel, maple-tree
Use the rosebush instead of a custom hashtable. This is a deliberately
awful patch because I wouldn't want anyone applying it yet. We need to
figure out how to implement d_unhashed() properly, and what to do about
IS_ROOT() dentries.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
fs/dcache.c | 152 +++++++++++++++++++---------------------------------
1 file changed, 54 insertions(+), 98 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 407095188f83..5c2ba8d5f8ff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -26,6 +26,7 @@
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
+#include <linux/rosebush.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
@@ -87,23 +88,7 @@ EXPORT_SYMBOL(slash_name);
const struct qstr dotdot_name = QSTR_INIT("..", 2);
EXPORT_SYMBOL(dotdot_name);
-/*
- * This is the single most critical data structure when it comes
- * to the dcache: the hashtable for lookups. Somebody should try
- * to make this good - I've just made it work.
- *
- * This hash-function tries to avoid losing too many bits of hash
- * information, yet avoid using a prime hash-size or similar.
- */
-
-static unsigned int d_hash_shift __ro_after_init;
-
-static struct hlist_bl_head *dentry_hashtable __ro_after_init;
-
-static inline struct hlist_bl_head *d_hash(unsigned int hash)
-{
- return dentry_hashtable + (hash >> d_hash_shift);
-}
+static DEFINE_ROSEBUSH(all_dentries);
#define IN_LOOKUP_SHIFT 10
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];
@@ -486,20 +471,22 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
static void ___d_drop(struct dentry *dentry)
{
- struct hlist_bl_head *b;
/*
* Hashed dentries are normally on the dentry hashtable,
* with the exception of those newly allocated by
* d_obtain_root, which are always IS_ROOT:
*/
- if (unlikely(IS_ROOT(dentry)))
- b = &dentry->d_sb->s_roots;
- else
- b = d_hash(dentry->d_name.hash);
+ if (unlikely(IS_ROOT(dentry))) {
+ struct hlist_bl_head *b = &dentry->d_sb->s_roots;
+ hlist_bl_lock(b);
+ __hlist_bl_del(&dentry->d_hash);
+ hlist_bl_unlock(b);
+ } else {
+ dentry->d_hash.pprev = NULL;
+ }
- hlist_bl_lock(b);
- __hlist_bl_del(&dentry->d_hash);
- hlist_bl_unlock(b);
+//printk("removing dentry %p at %x\n", dentry, dentry->d_name.hash);
+ rbh_remove(&all_dentries, dentry->d_name.hash, dentry);
}
void __d_drop(struct dentry *dentry)
@@ -2104,15 +2091,14 @@ static noinline struct dentry *__d_lookup_rcu_op_compare(
unsigned *seqp)
{
u64 hashlen = name->hash_len;
- struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
- struct hlist_bl_node *node;
+ RBH_ITER(iter, &all_dentries, hashlen_hash(hashlen));
struct dentry *dentry;
- hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+ while ((dentry = rbh_next(&iter)) != NULL) {
int tlen;
const char *tname;
unsigned seq;
-
+//printk("%s: got dentry %p with hash %x\n", __func__, dentry, dentry->d_name.hash);
seqretry:
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
@@ -2131,9 +2117,9 @@ static noinline struct dentry *__d_lookup_rcu_op_compare(
if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0)
continue;
*seqp = seq;
- return dentry;
+ break;
}
- return NULL;
+ return dentry;
}
/**
@@ -2171,8 +2157,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
{
u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
- struct hlist_bl_node *node;
+ RBH_ITER(iter, &all_dentries, hashlen_hash(hashlen));
struct dentry *dentry;
/*
@@ -2182,6 +2167,8 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
* Keep the two functions in sync.
*/
+//printk("%s: Looking up %s with hash %x\n", __func__, name->name, name->hash);
+
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
return __d_lookup_rcu_op_compare(parent, name, seqp);
@@ -2198,9 +2185,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
*
* See Documentation/filesystems/path-lookup.txt for more details.
*/
- hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+ while ((dentry = rbh_next(&iter)) != NULL) {
unsigned seq;
+//printk("%s: got dentry %p with hash %x\n", __func__, dentry, dentry->d_name.hash);
/*
* The dentry sequence count protects us from concurrent
* renames, and thus protects parent and name fields.
@@ -2228,9 +2216,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
continue;
*seqp = seq;
- return dentry;
+ break;
}
- return NULL;
+//printk("%s: found %p\n", __func__, dentry);
+ return dentry;
}
/**
@@ -2276,10 +2265,7 @@ EXPORT_SYMBOL(d_lookup);
*/
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
- unsigned int hash = name->hash;
- struct hlist_bl_head *b = d_hash(hash);
- struct hlist_bl_node *node;
- struct dentry *found = NULL;
+ RBH_ITER(iter, &all_dentries, name->hash);
struct dentry *dentry;
/*
@@ -2289,6 +2275,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
* Keep the two functions in sync.
*/
+//printk("%s: Looking up %s with hash %x\n", __func__, name->name, name->hash);
+
/*
* The hash list is protected using RCU.
*
@@ -2303,12 +2291,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
* See Documentation/filesystems/path-lookup.txt for more details.
*/
rcu_read_lock();
-
- hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
-
- if (dentry->d_name.hash != hash)
- continue;
-
+ while ((dentry = rbh_next(&iter)) != NULL) {
+//printk("%s: got dentry %p with hash %x\n", __func__, dentry, dentry->d_name.hash);
spin_lock(&dentry->d_lock);
if (dentry->d_parent != parent)
goto next;
@@ -2319,15 +2303,15 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
goto next;
dentry->d_lockref.count++;
- found = dentry;
spin_unlock(&dentry->d_lock);
break;
next:
spin_unlock(&dentry->d_lock);
- }
- rcu_read_unlock();
+ }
+ rcu_read_unlock();
- return found;
+//printk("%s: found %p\n", __func__, dentry);
+ return dentry;
}
/**
@@ -2397,11 +2381,10 @@ EXPORT_SYMBOL(d_delete);
static void __d_rehash(struct dentry *entry)
{
- struct hlist_bl_head *b = d_hash(entry->d_name.hash);
-
- hlist_bl_lock(b);
- hlist_bl_add_head_rcu(&entry->d_hash, b);
- hlist_bl_unlock(b);
+//printk("%s: using dentry %p at %x\n", __func__, entry, entry->d_name.hash);
+
+ entry->d_hash.pprev = (void *)&all_dentries;
+ rbh_use(&all_dentries, entry->d_name.hash, entry);
}
/**
@@ -2413,6 +2396,8 @@ static void __d_rehash(struct dentry *entry)
void d_rehash(struct dentry * entry)
{
+//printk("%s: reserving dentry %p at %x\n", __func__, entry, entry->d_name.hash);
+ rbh_reserve(&all_dentries, entry->d_name.hash);
spin_lock(&entry->d_lock);
__d_rehash(entry);
spin_unlock(&entry->d_lock);
@@ -2601,6 +2586,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
wait_queue_head_t *d_wait;
struct inode *dir = NULL;
unsigned n;
+
spin_lock(&dentry->d_lock);
if (unlikely(d_in_lookup(dentry))) {
dir = dentry->d_parent->d_inode;
@@ -2625,20 +2611,21 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
/**
* d_add - add dentry to hash queues
- * @entry: dentry to add
+ * @dentry: dentry to add
* @inode: The inode to attach to this dentry
*
* This adds the entry to the hash queues and initializes @inode.
* The entry was actually filled in earlier during d_alloc().
*/
-
-void d_add(struct dentry *entry, struct inode *inode)
+void d_add(struct dentry *dentry, struct inode *inode)
{
+//printk("%s: reserving dentry %p at %x\n", __func__, dentry, dentry->d_name.hash);
+ rbh_reserve(&all_dentries, dentry->d_name.hash);
if (inode) {
- security_d_instantiate(entry, inode);
+ security_d_instantiate(dentry, inode);
spin_lock(&inode->i_lock);
}
- __d_add(entry, inode);
+ __d_add(dentry, inode);
}
EXPORT_SYMBOL(d_add);
@@ -2658,6 +2645,8 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
struct dentry *alias;
unsigned int hash = entry->d_name.hash;
+//printk("%s: reserving dentry %p at %x\n", __func__, entry, hash);
+ rbh_reserve(&all_dentries, hash);
spin_lock(&inode->i_lock);
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
/*
@@ -2675,6 +2664,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
if (!d_unhashed(alias)) {
spin_unlock(&alias->d_lock);
alias = NULL;
+ rbh_release(&all_dentries, hash);
} else {
dget_dlock(alias);
__d_rehash(alias);
@@ -2684,6 +2674,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
return alias;
}
spin_unlock(&inode->i_lock);
+ rbh_release(&all_dentries, hash);
return NULL;
}
EXPORT_SYMBOL(d_exact_alias);
@@ -2967,6 +2958,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
BUG_ON(!d_unhashed(dentry));
+//printk("%s: reserving dentry %p at %x\n", __func__, dentry, dentry->d_name.hash);
+ rbh_reserve(&all_dentries, dentry->d_name.hash);
if (!inode)
goto out;
@@ -3002,6 +2995,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
write_sequnlock(&rename_lock);
}
iput(inode);
+ rbh_release(&all_dentries, dentry->d_name.hash);
return new;
}
}
@@ -3110,27 +3104,6 @@ static int __init set_dhash_entries(char *str)
}
__setup("dhash_entries=", set_dhash_entries);
-static void __init dcache_init_early(void)
-{
- /* If hashes are distributed across NUMA nodes, defer
- * hash allocation until vmalloc space is available.
- */
- if (hashdist)
- return;
-
- dentry_hashtable =
- alloc_large_system_hash("Dentry cache",
- sizeof(struct hlist_bl_head),
- dhash_entries,
- 13,
- HASH_EARLY | HASH_ZERO,
- &d_hash_shift,
- NULL,
- 0,
- 0);
- d_hash_shift = 32 - d_hash_shift;
-}
-
static void __init dcache_init(void)
{
/*
@@ -3141,22 +3114,6 @@ static void __init dcache_init(void)
dentry_cache = KMEM_CACHE_USERCOPY(dentry,
SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
d_iname);
-
- /* Hash may have been set up in dcache_init_early */
- if (!hashdist)
- return;
-
- dentry_hashtable =
- alloc_large_system_hash("Dentry cache",
- sizeof(struct hlist_bl_head),
- dhash_entries,
- 13,
- HASH_ZERO,
- &d_hash_shift,
- NULL,
- 0,
- 0);
- d_hash_shift = 32 - d_hash_shift;
}
/* SLAB cache for __getname() consumers */
@@ -3170,7 +3127,6 @@ void __init vfs_caches_init_early(void)
for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);
- dcache_init_early();
inode_init_early();
}
--
2.43.0
^ permalink raw reply related [flat|nested] 17+ messages in thread