From: Chris Mason <chris.mason@fusionio.com>
To: Linux FS Devel <linux-fsdevel@vger.kernel.org>,
David Woodhouse <David.Woodhouse@intel.com>,
"dchinner@redhat.com" <dchinner@redhat.com>,
<bo.li.liu@oracle.com>
Subject: [PATCH RFC 1/2] core skiplist code
Date: Thu, 2 May 2013 22:06:25 -0400
Message-ID: <20130503020625.GA4271@shiny.masoncoding.com>
In-Reply-To: <20130503020211.3599.72404@localhost.localdomain>

diff --git a/include/linux/skiplist.h b/include/linux/skiplist.h
new file mode 100644
index 0000000..9b695fe
--- /dev/null
+++ b/include/linux/skiplist.h
@@ -0,0 +1,179 @@
+/*
+ * (C) 2011 Liu Bo <liubo2009@cn.fujitsu.com>
+ * (C) 2013 Fusion-io
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _SKIPLIST_H
+#define _SKIPLIST_H
+
+#include <linux/spinlock.h>
+#include <linux/stacktrace.h>
+
+/*
+ * This includes a basic skiplist implementation and builds a more
+ * cache friendly variant on top meant to index ranges of keys.
+ *
+ * our random level generator uses P = 0.5, so as a rough metric the
+ * index performs well for lists with up to 2^MAXLEVEL leaves. Since
+ * each leaf holds an array of SKIP_KEYS_PER_NODE keys, that works out
+ * to roughly 2^MAXLEVEL * SKIP_KEYS_PER_NODE keys in total.
+ */
+#define SKIP_MAXLEVEL 32 /* skiplist_get_new_level requires <= 32 */
+
+struct sl_node_ptr {
+ struct sl_node *prev;
+ struct sl_node *next;
+};
+
+/*
+ * sl_node must be last in the leaf data struct. Allocate enough
+ * ram for a given level using the sl_ptrs_size, sl_node_size or
+ * sl_leaf_size helpers.
+ */
+struct sl_node {
+ int level;
+ int dead;
+ spinlock_t lock;
+ struct sl_node_ptr ptrs[];
+};
+
+/*
+ * The head list structure. The head node has no data,
+ * but it does have the full array of pointers for all the levels.
+ */
+struct sl_list {
+ /* in the head pointer, we use head->prev to point to
+ * the highest item in the list. But, the last item does
+ * not point back to the head. The head->prev items
+ * are protected the by node lock on the last item
+ */
+ struct sl_node *head;
+ spinlock_t lock;
+ unsigned int level;
+};
+
+/*
+ * If you are indexing extents, embed an sl_slot into your structure and
+ * use sl_slot_entry to pull out your real struct; there is a usage sketch
+ * below the struct. The key and size must not change if you're using rcu.
+ */
+struct sl_slot {
+ /*
+ * when rcu is on, we use this key to verify the pointer we pull
+ * out of the array. It must not change once the object is
+ * inserted
+ */
+ unsigned long key;
+
+ /*
+ * the range searching functions follow pointers into this slot
+ * struct and use this size field to find out how big the
+ * range is.
+ */
+ unsigned long size;
+
+ /*
+ * there is no reference count here, that's the job of whatever
+ * struct you embed this into. Good luck.
+ */
+};
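+
+/*
+ * A minimal usage sketch (illustrative only; "my_extent" is a made up
+ * example struct, not part of this patch). Reference counting is still
+ * the caller's job, as noted above:
+ *
+ *    struct my_extent {
+ *        atomic_t refs;
+ *        struct sl_slot slot;
+ *    };
+ *
+ *    struct sl_slot *found = skiplist_lookup(list, key, size);
+ *    struct my_extent *ex = NULL;
+ *
+ *    if (found)
+ *        ex = sl_slot_entry(found, struct my_extent, slot);
+ */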
+
+/*
+ * Larger values here make us faster when single threaded. Lower values
+ * increase cache misses but give more chances for concurrency.
+ */
+#define SKIP_KEYS_PER_NODE 32
+
+/*
+ * For indexing extents, this is a leaf in our skip list tree.
+ * Each leaf has a number of pointers and the max field
+ * is used to figure out the key space covered.
+ */
+struct sl_leaf {
+ /* number of valid keys/ptrs in this leaf */
+ int nr;
+
+ /*
+ * max value of the range covered by this leaf. This
+ * includes the size field of the very last extent,
+ * so max = keys[last_index] + ptrs[last_index]->size
+ */
+ unsigned long max;
+
+ /*
+ * sorted, all the keys
+ */
+ unsigned long keys[SKIP_KEYS_PER_NODE];
+
+ /*
+ * data pointers corresponding to the keys
+ */
+ struct sl_slot *ptrs[SKIP_KEYS_PER_NODE];
+
+ /* for freeing our objects after the grace period */
+ struct rcu_head rcu_head;
+
+ /* this needs to be at the end. The size changes based on the level */
+ struct sl_node node;
+};
+
+/*
+ * for a given level, how much memory we need for an array of
+ * all the pointers
+ */
+static inline int sl_ptrs_size(int level)
+{
+ return sizeof(struct sl_node_ptr) * (level + 1);
+}
+
+/*
+ * for a given level, how much memory we need for the
+ * array of pointers and the sl_node struct
+ */
+static inline int sl_node_size(int level)
+{
+ return sizeof(struct sl_node) + sl_ptrs_size(level);
+}
+
+static inline int sl_leaf_size(int level)
+{
+ return sizeof(struct sl_leaf) + sl_ptrs_size(level);
+}
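+
+/*
+ * Sizing sketch (illustrative only): a level 2 leaf needs the sl_leaf
+ * fields plus three sl_node_ptr pairs, one per level 0-2, so a plain
+ * allocation would look roughly like:
+ *
+ *    struct sl_leaf *leaf = kmalloc(sl_leaf_size(2), GFP_KERNEL);
+ *
+ * lib/skiplist.c sizes one kmem_cache per level with sl_leaf_size()
+ * instead, but the layout math is the same.
+ */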
+
+#define sl_entry(ptr) container_of((ptr), struct sl_leaf, node)
+#define sl_slot_entry(ptr, type, member) container_of(ptr, type, member)
+
+static inline int sl_empty(const struct sl_node *head)
+{
+ return head->ptrs[0].next == NULL;
+}
+
+int skiplist_preload(struct sl_list *list, gfp_t gfp_mask);
+int skiplist_get_new_level(struct sl_list *list, int max_level);
+int skiplist_insert(struct sl_list *list, struct sl_slot *slot,
+ int preload_token);
+int sl_init_list(struct sl_list *list, gfp_t mask);
+struct sl_slot *skiplist_lookup(struct sl_list *list, unsigned long key, unsigned long size);
+struct sl_slot *skiplist_lookup_rcu(struct sl_list *list, unsigned long key, unsigned long size);
+struct sl_slot *skiplist_delete(struct sl_list *list, unsigned long key, unsigned long size);
+int skiplist_insert_hole(struct sl_list *list, unsigned long hint,
+ unsigned long limit,
+ unsigned long size, unsigned long align,
+ struct sl_slot *slot,
+ gfp_t gfp_mask);
+void sl_lock_node(struct sl_node *n);
+void sl_unlock_node(struct sl_node *n);
+void sl_free_leaf(struct sl_leaf *leaf);
+unsigned long sl_highest_key(struct sl_list *list);
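+
+/*
+ * Typical insert sequence (a hedged sketch, error handling trimmed;
+ * see lib/skiplist_test.c for a complete example):
+ *
+ *    token = skiplist_preload(list, GFP_KERNEL);
+ *    if (token < 0)
+ *        return token;
+ *    slot->key = key;
+ *    slot->size = size;
+ *    ret = skiplist_insert(list, slot, token);
+ *    preempt_enable();
+ *
+ * skiplist_preload() returns with preemption disabled, so the caller
+ * must re-enable preemption once the insert is done.
+ */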
+#endif /* _SKIPLIST_H */
diff --git a/init/main.c b/init/main.c
index 63534a1..fccbaa4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -90,6 +90,7 @@ extern void fork_init(unsigned long);
extern void mca_init(void);
extern void sbus_init(void);
extern void radix_tree_init(void);
+extern void skiplist_init(void);
#ifndef CONFIG_DEBUG_RODATA
static inline void mark_rodata_ro(void) { }
#endif
@@ -548,6 +549,7 @@ asmlinkage void __init start_kernel(void)
perf_event_init();
rcu_init();
radix_tree_init();
+
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
@@ -566,6 +568,10 @@ asmlinkage void __init start_kernel(void)
kmem_cache_init_late();
+#ifdef CONFIG_SKIPLIST
+ skiplist_init();
+#endif
+
/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
diff --git a/lib/Kconfig b/lib/Kconfig
index 3958dc4..c571287 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -393,6 +393,20 @@ config SIGNATURE
Digital signature verification. Currently only RSA is supported.
Implementation is done using GnuPG MPI library
+config SKIPLIST
+ bool "Concurrent Skiplists"
+ default n
+ help
+ Concurrent skiplist indexing trees. Just say N
+
+config SKIPLIST_TEST
+ tristate "Skiplist testing module"
+ default n
+ select SKIPLIST
+ help
+ Testing module for the skiplist implementation
+
+#
#
# libfdt files, only selected if needed.
#
diff --git a/lib/Makefile b/lib/Makefile
index d7946ff..5c63f08 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -108,6 +108,9 @@ obj-$(CONFIG_NLATTR) += nlattr.o
obj-$(CONFIG_LRU_CACHE) += lru_cache.o
+obj-$(CONFIG_SKIPLIST) += skiplist.o
+obj-$(CONFIG_SKIPLIST_TEST) += skiplist_test.o
+
obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o
obj-$(CONFIG_GENERIC_CSUM) += checksum.o
diff --git a/lib/skiplist.c b/lib/skiplist.c
new file mode 100644
index 0000000..7692792
--- /dev/null
+++ b/lib/skiplist.c
@@ -0,0 +1,1877 @@
+/*
+ * (C) 2011 Liu Bo <liubo2009@cn.fujitsu.com>
+ * (C) 2013 Fusion-io
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/radix-tree.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <linux/rcupdate.h>
+#include <linux/random.h>
+#include <linux/skiplist.h>
+#include <linux/lockdep.h>
+#include <linux/stacktrace.h>
+#include <linux/sched.h>
+
+
+static struct kmem_cache *slab_caches[SKIP_MAXLEVEL];
+
+/*
+ * we have preemption off anyway for insertion, make the cursor
+ * per-cpu so we don't need to allocate one
+ */
+struct skip_preload {
+ /* one of these per CPU for tracking insertion */
+ struct sl_node *cursor[SKIP_MAXLEVEL + 1];
+
+ /*
+ * the preload is filled in based on the highest possible level
+ * of the list you're preloading. So we basically end up with
+ * one preloaded node for each max size.
+ */
+ struct sl_leaf *preload[SKIP_MAXLEVEL + 1];
+};
+
+static DEFINE_PER_CPU(struct skip_preload, skip_preloads) = { {NULL,}, };
+
+static void sl_init_node(struct sl_node *node, int level)
+{
+ spin_lock_init(&node->lock);
+
+ node->ptrs[level].prev = NULL;
+ node->level = level;
+ node->dead = 0;
+}
+
+/*
+ * the rcu based searches need to block reuse until a given search round
+ * is done. So, we use call_rcu for freeing the leaf structure.
+ */
+static void sl_free_rcu(struct rcu_head *head)
+{
+ struct sl_leaf *leaf = container_of(head, struct sl_leaf, rcu_head);
+ kmem_cache_free(slab_caches[leaf->node.level], leaf);
+}
+
+void sl_free_leaf(struct sl_leaf *leaf)
+{
+ call_rcu(&leaf->rcu_head, sl_free_rcu);
+}
+
+/*
+ * helper functions to wade through dead nodes pending deletion
+ * and return live ones.
+ */
+static struct sl_node *find_live_prev(struct sl_list *list,
+ struct sl_node *node, int level)
+{
+ /* head->prev points to the max, this makes sure we don't loop */
+ if (node == list->head)
+ return NULL;
+
+ while (node) {
+ node = rcu_dereference(node->ptrs[level].prev);
+ /*
+ * the head is never dead, so we'll never walk past
+ * it down in this loop
+ */
+ if (!node->dead)
+ break;
+ }
+
+ return node;
+}
+
+static struct sl_node *find_live_next(struct sl_list *list,
+ struct sl_node *node, int level)
+{
+ while (node) {
+ node = rcu_dereference(node->ptrs[level].next);
+ if (!node || !node->dead)
+ break;
+ }
+ return node;
+}
+
+/*
+ * having trouble teaching lockdep about the skiplist
+ * locking. The problem is that we're allowed to
+ * hold multiple locks on the same level as long as we
+ * go from left to right.
+ */
+void sl_lock_node(struct sl_node *n)
+{
+ spin_lock(&n->lock);
+}
+
+void sl_unlock_node(struct sl_node *n)
+{
+ if (n)
+ spin_unlock(&n->lock);
+}
+
+/*
+ * the cursors are used by the insertion code to remember the leaves we passed
+ * on the way down to our insertion point. Any new nodes are linked in
+ * after the nodes in our cursor.
+ *
+ * Nodes may appear in the cursor more than once, but if so they are
+ * always consecutive. We don't have A, B, C, B, D, only
+ * A, B, B, B, C, D. When locking and unlocking things, we
+ * have to make sure we leave any node inside the cursor properly locked.
+ *
+ * Right now, everything in the cursor must be locked.
+ */
+int found_in_cursor(struct sl_node **cursor, int max_level, struct sl_node *p)
+{
+ int i;
+
+ for (i = 0; i <= max_level; i++) {
+ if (cursor[i] == p)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * add p into cursor at a specific level. If p is replacing another
+ * pointer, that pointer is unlocked, unless it is also at a
+ * higher level in the cursor.
+ *
+ * p is locked unless it was already in the cursor.
+ */
+static void add_to_cursor(struct sl_node **cursor, int level,
+ struct sl_node *p)
+{
+ struct sl_node *old;
+ struct sl_node *higher;
+
+ old = cursor[level];
+ cursor[level] = p;
+
+ if (old == p)
+ return;
+
+ if (level == SKIP_MAXLEVEL) {
+ sl_lock_node(p);
+ sl_unlock_node(old);
+ return;
+ }
+ higher = cursor[level + 1];
+
+ if (higher != p)
+ sl_lock_node(p);
+ if (higher != old)
+ sl_unlock_node(old);
+}
+
+/*
+ * same as add_to_cursor, but p must already be locked.
+ */
+static void add_locked_to_cursor(struct sl_node **cursor, int level,
+ struct sl_node *p)
+{
+ struct sl_node *old;
+
+ old = cursor[level];
+ cursor[level] = p;
+
+ if (old == p)
+ return;
+
+ if (level == SKIP_MAXLEVEL) {
+ sl_unlock_node(old);
+ return;
+ }
+
+ if (cursor[level + 1] != old)
+ sl_unlock_node(old);
+}
+
+/*
+ * unlock any nodes in the cursor below max_level
+ */
+static void free_cursor_locks(struct sl_node **cursor, int max_level)
+{
+ struct sl_node *p;
+ int i;
+
+ for (i = max_level; i >= 0; i--) {
+ p = cursor[i];
+ cursor[i] = NULL;
+ if (i == 0 || cursor[i - 1] != p)
+ sl_unlock_node(p);
+ }
+}
+
+/*
+ * helper function to link a single level during an insert.
+ * prev must be locked, and it is the node we are linking after.
+ *
+ * This will find a live next pointer, lock it, and link it
+ * with our new node
+ */
+static void sl_link_one_level(struct sl_list *list,
+ struct sl_node *prev,
+ struct sl_node *node, int level)
+{
+ struct sl_node *next;
+ struct sl_node *test;
+
+ assert_spin_locked(&prev->lock);
+ BUG_ON(prev->dead);
+
+again:
+ next = find_live_next(list, prev, level);
+ if (next) {
+ sl_lock_node(next);
+ test = find_live_next(list, prev, level);
+ if (test != next || next->dead) {
+ sl_unlock_node(next);
+ goto again;
+ }
+ /*
+ * make sure our next and prev really point to each
+ * other now that we have next locked.
+ */
+ if (find_live_prev(list, next, level) != prev) {
+ sl_unlock_node(next);
+ goto again;
+ }
+ }
+
+ rcu_assign_pointer(node->ptrs[level].next, next);
+ rcu_assign_pointer(node->ptrs[level].prev, prev);
+ rcu_assign_pointer(prev->ptrs[level].next, node);
+
+ /*
+ * if next is null, we're the last node on this level.
+ * The head->prev pointer is used to cache this fact
+ */
+ if (next)
+ rcu_assign_pointer(next->ptrs[level].prev, node);
+ else
+ rcu_assign_pointer(list->head->ptrs[level].prev, node);
+
+ sl_unlock_node(next);
+}
+
+/*
+ * link a node at a given level. The cursor needs pointers to all the
+ * nodes that are just behind us in the list.
+ *
+ * Link from the bottom up so that our searches from the top down don't
+ * find bogus pointers
+ */
+static void sl_link_node(struct sl_list *list, struct sl_node *node,
+ struct sl_node **cursor, int level)
+{
+ int i;
+
+ for (i = 0; i <= level; i++)
+ sl_link_one_level(list, cursor[i], node, i);
+}
+
+/*
+ * just like sl_link_node, but use 'after' for starting point of the link.
+ * Any pointers not provided from 'after' come from the cursor.
+ * 'after' must be locked.
+ */
+static void sl_link_after_node(struct sl_list *list, struct sl_node *node,
+ struct sl_node *after,
+ struct sl_node **cursor, int level)
+{
+ int i;
+
+ /* first use all the pointers from 'after' */
+ for (i = 0; i <= after->level && i <= level; i++)
+ sl_link_one_level(list, after, node, i);
+
+ /* then use the cursor for anything left */
+ for (; i <= level; i++)
+ sl_link_one_level(list, cursor[i], node, i);
+
+}
+
+/*
+ * helper function to pull out the next live leaf at a given level.
+ * It is not locked
+ */
+struct sl_leaf *sl_next_leaf(struct sl_list *list, struct sl_node *p, int l)
+{
+ struct sl_node *next;
+ if (!p)
+ return NULL;
+
+ next = find_live_next(list, p, l);
+ if (next)
+ return sl_entry(next);
+ return NULL;
+}
+
+/*
+ * return the highest value for a given leaf. This is cached
+ * in leaf->max so that we don't have to wander into
+ * the slot pointers. The max is equal to the key + size of the
+ * last slot.
+ */
+static unsigned long sl_max_key(struct sl_leaf *leaf)
+{
+ smp_rmb();
+ return leaf->max;
+}
+
+/*
+ * return the lowest key for a given leaf. This comes out
+ * of the node key array and not the slots
+ */
+static unsigned long sl_min_key(struct sl_leaf *leaf)
+{
+ smp_rmb();
+ return leaf->keys[0];
+}
+
+struct sl_leaf *sl_first_leaf(struct sl_list *list)
+{
+ struct sl_leaf *leaf;
+ struct sl_node *p;
+
+ p = list->head->ptrs[0].next;
+ if (!p)
+ return NULL;
+ leaf = sl_entry(p);
+
+ return leaf;
+}
+
+struct sl_leaf *sl_last_leaf(struct sl_list *list)
+{
+ struct sl_leaf *leaf;
+ struct sl_node *p;
+
+ p = list->head->ptrs[0].prev;
+ if (!p)
+ return NULL;
+ leaf = sl_entry(p);
+
+ return leaf;
+}
+
+/*
+ * search inside the key array of a given leaf. The leaf must be
+ * locked because we're using a binary search. This returns
+ * zero if we found a slot with the key in it, and sets
+ * 'slot' to the number of the slot pointer.
+ *
+ * 1 is returned if the key was not found, and we set slot to
+ * the location where the insert needs to be performed.
+ *
+ */
+int leaf_slot_locked(struct sl_leaf *leaf, unsigned long key,
+ unsigned long size, int *slot)
+{
+ int low = 0;
+ int high = leaf->nr - 1;
+ int mid;
+ unsigned long k1;
+ struct sl_slot *found;
+
+ /*
+ * case1:
+ * [ key ... size ]
+ * [found .. found size ]
+ *
+ * case2:
+ * [key ... size ]
+ * [found .. found size ]
+ *
+ * case3:
+ * [key ... size ]
+ * [ found .. found size ]
+ *
+ * case4:
+ * [key ...size ]
+ * [ found ... found size ]
+ *
+ * case5:
+ * [key ...size ]
+ * [ found ... found size ]
+ */
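+
+ /*
+ * Worked example of the ranges above (numbers are illustrative):
+ * a search for key=100, size=50 covers [100, 150). A slot at
+ * key=90 with size 20 covers [90, 110) and is a hit via the
+ * case1/case2/case5 test after the loop, while a slot at key=150
+ * is not, because the binary search rejects k1 >= key + size.
+ */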
+
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ k1 = leaf->keys[mid];
+ if (k1 < key) {
+ low = mid + 1;
+ } else if (k1 >= key + size) {
+ high = mid - 1;
+ } else {
+ *slot = mid;
+ return 0;
+ }
+ }
+
+ /*
+ * nothing found, at this point we're in the slot this key would
+ * normally be inserted at. Check the previous slot to see if
+ * it overlaps our [key, key + size) range
+ */
+ if (low > 0) {
+ k1 = leaf->keys[low - 1];
+ found = leaf->ptrs[low - 1];
+
+ /* case1, case2, case5 */
+ if (k1 < key + size && k1 + found->size > key) {
+ *slot = low - 1;
+ return 0;
+ }
+
+ /* case3, case4 */
+ if (k1 < key + size && k1 >= key) {
+ *slot = low - 1;
+ return 0;
+ }
+ }
+ *slot = low;
+ return 1;
+}
+
+/*
+ * sequential search for lockless rcu. The insert/deletion routines
+ * try to order their operations to make this safe. See leaf_slot_locked
+ * for a list of extent range cases we're trying to cover.
+ */
+static int leaf_slot(struct sl_leaf *leaf, unsigned long key,
+ unsigned long size, int *slot)
+{
+ int i;
+ int cur;
+ int last;
+ unsigned long this_key;
+ struct sl_slot *found;
+
+again:
+ cur = 0;
+ last = leaf->nr;
+
+ /* find the first slot greater than our key */
+ for (i = 0; i < last; i++) {
+ smp_rmb();
+ this_key = leaf->keys[i];
+ if (this_key >= key + size)
+ break;
+ cur = i;
+ }
+ if (leaf->keys[cur] < key + size) {
+ /*
+ * if we're in the middle of an insert, pointer may
+ * be null. This little loop will wait for the insertion
+ * to finish.
+ */
+ while (1) {
+ found = rcu_dereference(leaf->ptrs[cur]);
+ if (found)
+ break;
+ cpu_relax();
+ }
+
+ /* insert is juggling our slots, try again */
+ if (found->key != leaf->keys[cur])
+ goto again;
+
+ /* case1, case2, case5 */
+ if (found->key < key + size && found->key + found->size > key) {
+ *slot = cur;
+ return 0;
+ }
+
+ /* case3, case4 */
+ if (found->key < key + size && found->key >= key) {
+ *slot = cur;
+ return 0;
+ }
+
+ *slot = cur + 1;
+ return 1;
+ }
+ *slot = cur;
+ return 1;
+}
+
+/*
+ * this does the dirty work of splitting and/or shifting a leaf
+ * to get a new slot inside. The leaf must be locked. slot
+ * tells us where in the leaf we should insert, and the cursor
+ * should have all the pointers we need to fully link any new nodes
+ * we have to create. leaf and everything in the cursor must be locked.
+ */
+static noinline int add_key_to_leaf(struct sl_list *list, struct sl_leaf *leaf,
+ struct sl_slot *slot_ptr, unsigned long key,
+ int slot, struct sl_node **cursor, int preload_token)
+{
+ struct sl_leaf *split;
+ struct skip_preload *skp;
+ int level;
+
+ /* no splitting required, just shift our way in */
+ if (leaf->nr < SKIP_KEYS_PER_NODE)
+ goto insert;
+
+ skp = &__get_cpu_var(skip_preloads);
+ split = skp->preload[preload_token];
+
+ /*
+ * we need to insert a new leaf, but we try not to insert a new leaf at
+ * the same height as our previous one, it's just a waste of high level
+ * searching. If the new node is the same level or lower than the
+ * existing one, try to use a level 0 leaf instead.
+ *
+ * The preallocation code tries to keep a level 0 leaf preallocated,
+ * lets see if we can grab one.
+ */
+ if (leaf->node.level > 0 && split->node.level <= leaf->node.level) {
+ if (skp->preload[0]) {
+ preload_token = 0;
+ split = skp->preload[0];
+ }
+ }
+ skp->preload[preload_token] = NULL;
+
+ level = split->node.level;
+
+ /*
+ * bump our list->level to whatever we've found. Nobody allocating
+ * a new node is going to set it higher than list->level + 1
+ */
+ if (level > list->level)
+ list->level = level;
+
+ /*
+ * this locking is really only required for the small window where
+ * we are linking the node and someone might be deleting one of the
+ * nodes we are linking with. The leaf passed in was already
+ * locked.
+ */
+ sl_lock_node(&split->node);
+
+ if (slot == leaf->nr) {
+ /*
+ * our new slot just goes at the front of the new leaf, don't
+ * bother shifting things in from the previous leaf.
+ */
+ slot = 0;
+ split->nr = 1;
+ split->max = key + slot_ptr->size;
+ split->keys[0] = key;
+ split->ptrs[0] = slot_ptr;
+ smp_wmb();
+ sl_link_after_node(list, &split->node, &leaf->node,
+ cursor, level);
+ sl_unlock_node(&split->node);
+ return 0;
+ } else {
+ int nr = SKIP_KEYS_PER_NODE / 2;
+ int mid = SKIP_KEYS_PER_NODE - nr;
+ int src_i = mid;
+ int dst_i = 0;
+ int orig_nr = leaf->nr;
+
+ /* split the previous leaf in half and copy items over */
+ split->nr = nr;
+ split->max = leaf->max;
+
+ while (src_i < slot) {
+ split->keys[dst_i] = leaf->keys[src_i];
+ split->ptrs[dst_i++] = leaf->ptrs[src_i++];
+ }
+
+ if (slot >= mid) {
+ split->keys[dst_i] = key;
+ split->ptrs[dst_i++] = slot_ptr;
+ split->nr++;
+ }
+
+ while (src_i < orig_nr) {
+ split->keys[dst_i] = leaf->keys[src_i];
+ split->ptrs[dst_i++] = leaf->ptrs[src_i++];
+ }
+
+ sl_link_after_node(list, &split->node, &leaf->node,
+ cursor, level);
+ nr = SKIP_KEYS_PER_NODE - nr;
+
+ /*
+ * now that we have all the items copied and our new
+ * leaf inserted, update the nr in this leaf. Anyone
+ * searching in rculand will find the fully updated leaf.
+ */
+ leaf->max = leaf->keys[nr - 1] + leaf->ptrs[nr - 1]->size;
+ smp_wmb();
+ leaf->nr = nr;
+ sl_unlock_node(&split->node);
+
+ /*
+ * if the slot was in the split leaf, we're done,
+ * otherwise we need to move down into the
+ * code below that shifts the items and
+ * inserts the new key
+ */
+ if (slot >= mid)
+ return 0;
+ }
+insert:
+ if (slot < leaf->nr) {
+ int i;
+
+ /*
+ * put something sane into the new last slot so rcu
+ * searchers won't get confused
+ */
+ leaf->keys[leaf->nr] = 0;
+ leaf->ptrs[leaf->nr] = NULL;
+ smp_wmb();
+
+ /* then bump the nr */
+ leaf->nr++;
+
+ /*
+ * now step through each pointer after our
+ * destination and bubble it forward. memcpy
+ * would be faster but rcu searchers will be
+ * able to validate pointers as they go with
+ * this method.
+ */
+ for (i = leaf->nr - 1; i > slot; i--) {
+ leaf->keys[i] = leaf->keys[i - 1];
+ leaf->ptrs[i] = leaf->ptrs[i - 1];
+ /*
+ * make sure the key/pointer pair is
+ * fully visible in the new home before
+ * we move forward
+ */
+ smp_wmb();
+ }
+
+ /* finally stuff in our key */
+ leaf->keys[slot] = key;
+ leaf->ptrs[slot] = slot_ptr;
+ smp_wmb();
+ } else {
+ /*
+ * just extending the leaf, toss
+ * our key in and update things
+ */
+ leaf->max = key + slot_ptr->size;
+ leaf->keys[slot] = key;
+ leaf->ptrs[slot] = slot_ptr;
+
+ smp_wmb();
+ leaf->nr++;
+ }
+ return 0;
+}
+
+/*
+ * when we're extending a leaf with a new key, make sure the
+ * range of the [key,size] doesn't extend into the next
+ * leaf.
+ */
+static int check_overlap(struct sl_list *list, struct sl_leaf *leaf,
+ unsigned long key, unsigned long size)
+{
+ struct sl_node *p;
+ struct sl_leaf *next;
+ int ret = 0;
+
+ p = leaf->node.ptrs[0].next;
+ if (!p)
+ return 0;
+
+ sl_lock_node(p);
+ next = sl_entry(p);
+ if (key + size > sl_min_key(next))
+ ret = 1;
+ sl_unlock_node(p);
+
+ return ret;
+}
+
+/*
+ * helper function for insert. This will either return -EEXIST for an
+ * existing key or insert a new slot into the list. leaf must be locked,
+ * and everything in the cursor must be locked.
+ */
+static noinline int find_or_add_key(struct sl_list *list, unsigned long key,
+ unsigned long size, struct sl_leaf *leaf,
+ struct sl_slot *slot_ptr,
+ struct sl_node **cursor, int preload_token)
+{
+ int ret;
+ int slot;
+
+ if (check_overlap(list, leaf, key, size)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ if (key < leaf->max) {
+ ret = leaf_slot_locked(leaf, key, size, &slot);
+ if (ret == 0) {
+ ret = -EEXIST;
+ goto out;
+ }
+ } else {
+ slot = leaf->nr;
+ }
+
+ add_key_to_leaf(list, leaf, slot_ptr, key, slot, cursor, preload_token);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * pull a new leaf out of the prealloc area, and insert the slot/key into it
+ */
+static struct sl_leaf *alloc_leaf(struct sl_slot *slot_ptr, unsigned long key,
+ int preload_token)
+{
+ struct sl_leaf *leaf;
+ struct skip_preload *skp;
+ int level;
+
+ skp = &__get_cpu_var(skip_preloads);
+ leaf = skp->preload[preload_token];
+ skp->preload[preload_token] = NULL;
+ level = leaf->node.level;
+
+ leaf->keys[0] = key;
+ leaf->ptrs[0] = slot_ptr;
+ leaf->nr = 1;
+ leaf->max = key + slot_ptr->size;
+ return leaf;
+}
+
+/*
+ * helper to grab the cursor from the prealloc area.
+ * You must already have preempt off. The whole
+ * cursor is zero'd out, so don't call this if you're
+ * currently using the cursor.
+ */
+static struct sl_node **get_cursor(void)
+{
+ struct skip_preload *skp;
+ skp = &__get_cpu_var(skip_preloads);
+ memset(skp->cursor, 0, sizeof(skp->cursor[0]) * (SKIP_MAXLEVEL + 1));
+ return skp->cursor;
+}
+
+/*
+ * this returns with preempt disabled and the preallocation
+ * area set up for a new insert. To get there, it may or
+ * may not allocate a new leaf for the next insert.
+ *
+ * If allocations are done, this will also try to preallocate a level 0
+ * leaf, which allows us to optimize insertion by not placing two
+ * adjacent nodes together with the same level.
+ *
+ * This returns < 0 on errors. If everything works, it returns a preload
+ * token which you should use when fetching your preallocated items.
+ *
+ * The token allows us to preallocate based on the current
+ * highest level of the list. For a list of level N, we won't allocate
+ * higher than N + 1.
+ */
+int skiplist_preload(struct sl_list *list, gfp_t gfp_mask)
+{
+ struct skip_preload *skp;
+ struct sl_leaf *leaf;
+ struct sl_leaf *leaf0 = NULL;
+ int alloc_leaf0 = 1;
+ int level;
+ int max_level = min_t(int, list->level + 1, SKIP_MAXLEVEL - 1);
+ int token = max_level;
+
+ preempt_disable();
+ skp = &__get_cpu_var(skip_preloads);
+ if (max_level && !skp->preload[0])
+ alloc_leaf0 = 1;
+
+ if (skp->preload[max_level])
+ return token;
+
+ preempt_enable();
+ level = skiplist_get_new_level(list, max_level);
+ leaf = kmem_cache_alloc(slab_caches[level], gfp_mask);
+ if (leaf == NULL)
+ return -ENOMEM;
+
+ if (alloc_leaf0)
+ leaf0 = kmem_cache_alloc(slab_caches[0], gfp_mask);
+
+ preempt_disable();
+ skp = &__get_cpu_var(skip_preloads);
+
+ if (leaf0) {
+ if (skp->preload[0] == NULL) {
+ sl_init_node(&leaf0->node, 0);
+ skp->preload[0] = leaf0;
+ } else {
+ kmem_cache_free(slab_caches[0], leaf0);
+ }
+ }
+
+ if (skp->preload[max_level]) {
+ kmem_cache_free(slab_caches[level], leaf);
+ return token;
+ }
+ sl_init_node(&leaf->node, level);
+ skp->preload[max_level] = leaf;
+
+ return token;
+}
+EXPORT_SYMBOL(skiplist_preload);
+
+/*
+ * use the kernel prandom call to pick a new random level. This
+ * uses P = .50. If you bump SKIP_MAXLEVEL past 32, this function
+ * needs updating, since prandom_u32() only supplies 32 random bits.
+ */
+int skiplist_get_new_level(struct sl_list *list, int max_level)
+{
+ int level = 0;
+ unsigned long randseed;
+
+ randseed = prandom_u32();
+
+ while (randseed && (randseed & 1)) {
+ randseed >>= 1;
+ level++;
+ if (level == max_level)
+ break;
+ }
+ return (level >= SKIP_MAXLEVEL ? SKIP_MAXLEVEL - 1: level);
+}
+EXPORT_SYMBOL(skiplist_get_new_level);
+
+/*
+ * just return the level of the leaf we're going to use
+ * for the next insert
+ */
+static int pending_insert_level(int preload_token)
+{
+ struct skip_preload *skp;
+ skp = &__get_cpu_var(skip_preloads);
+ return skp->preload[preload_token]->node.level;
+}
+
+/*
+ * after a lockless search, this makes sure a given key is still
+ * inside the min/max of a leaf. If not, you have to repeat the
+ * search and try again.
+ */
+static int verify_key_in_leaf(struct sl_leaf *leaf, unsigned long key,
+ unsigned long size)
+{
+ if (leaf->node.dead)
+ return 0;
+
+ if (key + size < sl_min_key(leaf) ||
+ key >= sl_max_key(leaf))
+ return 0;
+ return 1;
+}
+
+/*
+ * The insertion code tries to delay taking locks for as long as possible.
+ * Once we've found a good place to insert, we need to make sure the leaf
+ * we have picked is still a valid location.
+ *
+ * This checks the previous and next pointers to make sure everything is
+ * still correct for the insert. You should send an unlocked leaf, and
+ * it will return 1 with the leaf locked if everything worked.
+ *
+ * We return 0 if the insert cannot proceed, and the leaf is returned unlocked.
+ */
+static int verify_key_in_path(struct sl_list *list,
+ struct sl_node *node, unsigned long key,
+ unsigned long size, int level,
+ struct sl_node **cursor,
+ struct sl_node **locked)
+{
+ struct sl_leaf *prev = NULL;
+ struct sl_leaf *next;
+ struct sl_node *p;
+ struct sl_leaf *leaf = NULL;
+ struct sl_node *lock1;
+ struct sl_node *lock2;
+ struct sl_node *lock3;
+
+ BUG_ON(*locked);
+
+again:
+ lock1 = NULL;
+ lock2 = NULL;
+ lock3 = NULL;
+ if (node != list->head) {
+ p = node->ptrs[level].prev;
+ if (!found_in_cursor(cursor, SKIP_MAXLEVEL, p)) {
+ lock1 = p;
+ sl_lock_node(p);
+ }
+ sl_lock_node(node);
+ lock2 = node;
+
+ if (p->dead || node->dead)
+ goto out;
+
+ if (p != list->head)
+ prev = sl_entry(p);
+
+ /*
+ * once we have the locks, make sure everyone
+ * still points to each other
+ */
+ if (node->ptrs[level].prev != p ||
+ p->ptrs[level].next != node) {
+ sl_unlock_node(lock1);
+ sl_unlock_node(lock2);
+ goto again;
+ }
+
+ leaf = sl_entry(node);
+ } else {
+ sl_lock_node(node);
+ lock2 = node;
+ }
+
+ /*
+ * rule #1, the key must be greater than the max of the previous
+ * leaf
+ */
+ if (prev && key < sl_max_key(prev))
+ goto out;
+
+ /* we're done with prev, unlock it */
+ sl_unlock_node(lock1);
+ lock1 = NULL;
+
+ p = node->ptrs[level].next;
+ if (p)
+ next = sl_entry(p);
+ else
+ next = NULL;
+
+ if (next) {
+ sl_lock_node(&next->node);
+ lock3 = &next->node;
+ /*
+ * rule #2 the key must be smaller than the min key
+ * in the next node
+ */
+ if (node->ptrs[level].next != &next->node ||
+ next->node.ptrs[level].prev != node ||
+ next->node.dead || key >= sl_min_key(next)) {
+ /* next may be in the middle of a delete
+ * here. If so, we can't just goto
+ * again because the delete is waiting
+ * for our lock on node.
+ * FIXME, we can try harder to avoid
+ * the goto out here
+ */
+ goto out;
+ }
+ }
+ /*
+ * return with our leaf locked and sure that our leaf is the
+ * best place for this key
+ */
+ *locked = node;
+ sl_unlock_node(lock1);
+ sl_unlock_node(lock3);
+ return 1;
+
+out:
+ sl_unlock_node(lock1);
+ sl_unlock_node(lock2);
+ sl_unlock_node(lock3);
+ return 0;
+}
+
+
+/*
+ * Before calling this you must have stocked the preload area by
+ * calling skiplist_preload, and you must have kept preemption
+ * off. preload_token comes from skiplist_preload, pass in
+ * exactly what preload gave you.
+ *
+ * More details in the comments below.
+ */
+int skiplist_insert(struct sl_list *list, struct sl_slot *slot,
+ int preload_token)
+{
+ struct sl_node **cursor;
+ struct sl_node *p;
+ struct sl_node *ins_locked = NULL;
+ struct sl_leaf *leaf;
+ unsigned long key = slot->key;
+ unsigned long size = slot->size;
+ unsigned long min_key;
+ unsigned long max_key;
+ int level;
+ int pending_level = pending_insert_level(preload_token);
+ int ret;
+
+ rcu_read_lock();
+ ret = -EEXIST;
+ cursor = get_cursor();
+
+ /*
+ * notes on pending_level, locking and general flow:
+ *
+ * pending_level is the level of the node we might insert
+ * if we can't find free space in the tree. It's important
+ * because it tells us where our cursor is going to start
+ * recording nodes, and also the first node we have to lock
+ * to keep other inserts from messing with the nodes in our cursor.
+ *
+ * The most common answer is pending_level == 0, which is great
+ * because that means we won't have to take a lock until the
+ * very last level.
+ *
+ * if we're really lucky, we're doing a level 0 insert for a key past
+ * our current max. We can just jump down to the insertion
+ * code
+ */
+ leaf = sl_last_leaf(list);
+ if (leaf && sl_min_key(leaf) <= key &&
+ (pending_level == 0 || leaf->nr < SKIP_KEYS_PER_NODE)) {
+ p = &leaf->node;
+
+ /* lock and recheck */
+ sl_lock_node(p);
+
+ if (!p->dead && sl_min_key(leaf) <= key &&
+ leaf == sl_last_leaf(list) &&
+ leaf->node.ptrs[0].next == NULL &&
+ (pending_level == 0 || leaf->nr < SKIP_KEYS_PER_NODE)) {
+ ins_locked = p;
+ level = 0;
+ goto find_or_add;
+ } else {
+ sl_unlock_node(p);
+ }
+ }
+
+again:
+ /*
+ * the goto again code below will bump pending level
+ * so we start locking higher and higher in the tree as
+ * contention increases. Make sure to limit
+ * it to SKIP_MAXLEVEL
+ */
+ pending_level = min(pending_level, SKIP_MAXLEVEL);
+ p = list->head;
+ level = list->level;
+
+ /*
+ * once we hit pending_level, we have to start filling
+ * in the cursor and locking nodes
+ */
+ if (level <= pending_level) {
+ if (level != pending_level)
+ add_to_cursor(cursor, pending_level, p);
+ add_to_cursor(cursor, level, p);
+ }
+
+ /*
+ * skip over any NULL levels in the list
+ */
+ while (p->ptrs[level].next == NULL && level > 0) {
+ level--;
+ if (level <= pending_level) {
+ add_to_cursor(cursor, level, p);
+ }
+ }
+
+ do {
+ while (1) {
+ leaf = sl_next_leaf(list, p, level);
+ if (!leaf) {
+ /*
+ * if we're at level 0 and p points to
+ * the head, the list is just empty. If
+ * we're not at level 0 yet, keep walking
+ * down.
+ */
+ if (p == list->head || level != 0)
+ break;
+
+ /*
+ * p was the last leaf on the bottom level,
+ * We're here because 'key' was bigger than the
+ * max key in p. find_or_add will append into
+ * the last leaf.
+ */
+ goto find_or_add;
+ }
+
+ /*
+ * once we get down to the pending level, we have to
+ * start locking. Lock the node and verify it really
+ * is exactly the node we expected to find. It may
+ * get used in the cursor.
+ */
+ if (level <= pending_level) {
+ sl_lock_node(&leaf->node);
+ if (leaf->node.dead ||
+ find_live_next(list, p, level) != &leaf->node ||
+ find_live_prev(list, &leaf->node, level) != p) {
+ sl_unlock_node(&leaf->node);
+ if (!found_in_cursor(cursor,
+ pending_level, p))
+ sl_unlock_node(p);
+ free_cursor_locks(cursor, pending_level);
+ goto again;
+ }
+ }
+ min_key = sl_min_key(leaf);
+ max_key = sl_max_key(leaf);
+
+ /*
+ * strictly speaking this test is covered again below.
+ * But usually we have to walk forward through the
+ * pointers, so this is the most common condition. Try
+ * it first.
+ */
+ if (key >= max_key)
+ goto next;
+
+ if (key < min_key) {
+ /*
+ * when we aren't locking, we have to make sure
+ * new nodes haven't appeared between p and us.
+ */
+ if (level > pending_level &&
+ (find_live_prev(list, &leaf->node, level) != p ||
+ min_key != sl_min_key(leaf))) {
+ goto again;
+ }
+
+ /*
+ * our key is smaller than the smallest key in
+ * leaf. If we're not in level 0 yet, we don't
+ * want to cross over into the leaf
+ */
+ if (level != 0) {
+ if (level <= pending_level)
+ sl_unlock_node(&leaf->node);
+ break;
+ }
+
+ /*
+ * we are in level 0, just stuff our slot into
+ * the front of this leaf. We could also stuff
+ * our slot into p. FIXME, we should pick the
+ * leaf with the smallest number of items.
+ */
+ if (level <= pending_level &&
+ !found_in_cursor(cursor, pending_level, p)) {
+ sl_unlock_node(p);
+ }
+
+ p = &leaf->node;
+ goto find_or_add;
+ }
+
+ if (key < sl_max_key(leaf)) {
+ /*
+ * our key is >= the min and < the max. This
+ * leaf is the one true home for our key. The
+ * level doesn't matter, we could walk the
+ * whole tree and this would still be the best
+ * location.
+ *
+ * So, stop now and do the insert. Our cursor
+ * might not be fully formed down to level0,
+ * but that's ok because every pointer from
+ * our current level down to zero is going to
+ * be this one leaf. find_or_add deals with
+ * all of that.
+ *
+ * If we haven't already locked this leaf,
+ * do so now and make sure it still is
+ * the right location for our key.
+ */
+ if (level > pending_level) {
+ sl_lock_node(&leaf->node);
+ if (key < sl_min_key(leaf) ||
+ key >= sl_max_key(leaf)) {
+ sl_unlock_node(&leaf->node);
+ pending_level = level;
+ goto again;
+ }
+ /*
+ * remember that we've locked this
+ * leaf for the goto find_or_add
+ */
+ ins_locked = &leaf->node;
+ }
+
+ /* unless p is in our cursor, we're done with it */
+ if (level <= pending_level &&
+ !found_in_cursor(cursor, pending_level, p)) {
+ sl_unlock_node(p);
+ }
+
+ p = &leaf->node;
+ goto find_or_add;
+ }
+next:
+ /* walk our lock forward */
+ if (level <= pending_level &&
+ !found_in_cursor(cursor, pending_level, p)) {
+ sl_unlock_node(p);
+ }
+ p = &leaf->node;
+ }
+
+ /*
+ * the while loop is done with this level. Put
+ * p into our cursor if we've started locking.
+ */
+ if (level <= pending_level)
+ add_locked_to_cursor(cursor, level, p);
+
+ level--;
+
+ /*
+ * pending_level is the line that tells us where we
+ * need to start locking. Each node
+ * we record in the cursor needs to be exactly right,
+ * so we verify the first node in the cursor here.
+ * At this point p isn't in the cursor yet but it
+ * will be (or a downstream pointer at the
+ * same level).
+ */
+ if (level == pending_level) {
+ struct sl_node *locked = NULL;
+ if (!verify_key_in_path(list, p,
+ key, size, level + 1,
+ cursor, &locked)) {
+ pending_level++;
+ goto again;
+ }
+ cursor[level] = locked;
+ }
+ } while (level >= 0);
+
+ /* we only get here if the list is completely empty. FIXME
+ * this can be folded into the find_or_add code below
+ */
+ if (!cursor[0]) {
+ add_to_cursor(cursor, 0, list->head);
+ if (list->head->ptrs[0].next != NULL) {
+ free_cursor_locks(cursor, pending_level);
+ goto again;
+ }
+
+ }
+ leaf = alloc_leaf(slot, key, preload_token);
+ level = leaf->node.level;
+
+ if (level > list->level) {
+ list->level++;
+ cursor[list->level] = list->head;
+ }
+
+ sl_link_node(list, &leaf->node, cursor, level);
+ ret = 0;
+ free_cursor_locks(cursor, list->level);
+ rcu_read_unlock();
+
+ return ret;
+
+find_or_add:
+
+ leaf = sl_entry(p);
+ if (!ins_locked) {
+ if (level > pending_level) {
+ /* our cursor is empty, lock this one node */
+ if (!verify_key_in_path(list, p, key, size, 0,
+ cursor, &ins_locked)) {
+ free_cursor_locks(cursor, pending_level);
+ pending_level++;
+ goto again;
+ }
+ } else if (!found_in_cursor(cursor, pending_level, p)) {
+ /* we have a good cursor, but we're linking after
+ * p. Make sure it gets unlocked below
+ */
+ ins_locked = p;
+ }
+ }
+
+ ret = find_or_add_key(list, key, size, leaf, slot, cursor, preload_token);
+ free_cursor_locks(cursor, pending_level);
+ sl_unlock_node(ins_locked);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(skiplist_insert);
+
+/*
+ * lookup has two stages. First we find the leaf that should have
+ * our key, and then we go through all the slots in that leaf and
+ * look for the key. This helper function is just the first stage
+ * and it must be called under rcu_read_lock(). You may be using the
+ * non-rcu final lookup variant, but this part must be rcu.
+ *
+ * We'll return either NULL if we find nothing, or the candidate leaf
+ * for you to search.
+ */
+static struct sl_leaf *skiplist_lookup_leaf(struct sl_list *list,
+ struct sl_node **last,
+ unsigned long key,
+ unsigned long size)
+{
+ struct sl_node *p;
+ struct sl_leaf *leaf;
+ int level;
+ struct sl_leaf *leaf_ret = NULL;
+ unsigned long max_key = 0;
+ unsigned long min_key = 0;
+
+again:
+ level = list->level;
+ p = list->head;
+ do {
+ while ((leaf = sl_next_leaf(list, p, level))) {
+ max_key = sl_max_key(leaf);
+ min_key = sl_min_key(leaf);
+
+ if (key >= max_key)
+ goto next;
+
+ if (key < min_key) {
+ smp_rmb();
+
+ /*
+ * we're about to stop walking. Make sure
+ * no new pointers have been inserted between
+ * p and our leaf
+ */
+ if (find_live_prev(list, &leaf->node, level) != p ||
+ sl_min_key(leaf) != min_key ||
+ p->dead ||
+ leaf->node.dead) {
+ goto again;
+ }
+ break;
+ }
+
+ if (key < max_key) {
+ leaf_ret = leaf;
+ goto done;
+ }
+next:
+ p = &leaf->node;
+ }
+ level--;
+ } while (level >= 0);
+
+done:
+ if (last)
+ *last = p;
+ return leaf_ret;
+}
+
+/*
+ * this lookup function expects RCU to protect the slots in the leaves
+ * as well as the skiplist indexing structures
+ *
+ * Note, you must call this with rcu_read_lock held, and you must verify
+ * the result yourself. If the key field of the returned slot doesn't
+ * match your key, repeat the lookup. Reference counting etc is also
+ * all your responsibility.
+ */
+struct sl_slot *skiplist_lookup_rcu(struct sl_list *list, unsigned long key,
+ unsigned long size)
+{
+ struct sl_leaf *leaf;
+ struct sl_slot *slot_ret = NULL;
+ int slot;
+ int ret;
+
+again:
+ leaf = skiplist_lookup_leaf(list, NULL, key, size);
+ if (leaf) {
+ ret = leaf_slot(leaf, key, size, &slot);
+ if (ret == 0)
+ slot_ret = rcu_dereference(leaf->ptrs[slot]);
+ else if (!verify_key_in_leaf(leaf, key, size))
+ goto again;
+ }
+ return slot_ret;
+
+}
+EXPORT_SYMBOL(skiplist_lookup_rcu);
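+
+/*
+ * Caller sketch for the rcu variant (illustrative only, reference
+ * counting omitted), following the verification rule described above:
+ *
+ *    rcu_read_lock();
+ * again:
+ *    slot = skiplist_lookup_rcu(list, key, size);
+ *    if (slot && slot->key != key)
+ *        goto again;
+ *    [ take a reference on the containing object here ]
+ *    rcu_read_unlock();
+ *
+ * The recheck is needed because an insert may still be shifting slots
+ * around inside the leaf while we dereference the pointer.
+ */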
+
+/*
+ * this lookup function only uses RCU to protect the skiplist indexing
+ * structs. The actual slots are protected by full locks.
+ */
+struct sl_slot *skiplist_lookup(struct sl_list *list, unsigned long key,
+ unsigned long size)
+{
+ struct sl_leaf *leaf;
+ struct sl_slot *slot_ret = NULL;
+ struct sl_node *p;
+ int slot;
+ int ret;
+
+again:
+ rcu_read_lock();
+ leaf = skiplist_lookup_leaf(list, &p, key, size);
+ if (leaf) {
+ sl_lock_node(&leaf->node);
+ if (!verify_key_in_leaf(leaf, key, size)) {
+ sl_unlock_node(&leaf->node);
+ rcu_read_unlock();
+ goto again;
+ }
+ ret = leaf_slot_locked(leaf, key, size, &slot);
+ if (ret == 0)
+ slot_ret = leaf->ptrs[slot];
+ sl_unlock_node(&leaf->node);
+ }
+ rcu_read_unlock();
+ return slot_ret;
+
+}
+EXPORT_SYMBOL(skiplist_lookup);
+
+/* helper for skiplist_insert_hole. the iommu requires alignment */
+static unsigned long align_start(unsigned long val, unsigned long align)
+{
+ return (val + align - 1) & ~(align - 1);
+}
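+
+/*
+ * e.g. align_start(0x12345, 0x1000) == 0x13000. align is assumed to
+ * be a power of two, as the mask arithmetic requires.
+ */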
+
+/*
+ * this is pretty ugly, but it is used to find a free spot in the
+ * tree for a new iommu allocation. We start from a given
+ * hint and try to find an aligned range of a given size.
+ *
+ * Send the slot pointer, and we'll update it with the location
+ * we found.
+ *
+ * This will return -EAGAIN if we found a good spot but someone
+ * raced in and allocated it before we could. This gives the
+ * caller the chance to update their hint.
+ *
+ * This will return -EEXIST if we couldn't find anything at all
+ *
+ * returns 0 if all went well, or some other negative error
+ * if things went badly.
+ */
+int skiplist_insert_hole(struct sl_list *list, unsigned long hint,
+ unsigned long limit,
+ unsigned long size, unsigned long align,
+ struct sl_slot *slot,
+ gfp_t gfp_mask)
+{
+ unsigned long last_end = 0;
+ struct sl_node *p;
+ struct sl_leaf *leaf;
+ int i;
+ int ret = -EEXIST;
+ int preload_token;
+ int pending_level;
+
+ preload_token = skiplist_preload(list, gfp_mask);
+ if (preload_token < 0) {
+ return preload_token;
+ }
+ pending_level = pending_insert_level(preload_token);
+
+ /* step one, lets find our hint */
+ rcu_read_lock();
+again:
+
+ last_end = max(last_end, hint);
+ last_end = align_start(last_end, align);
+ slot->key = align_start(hint, align);
+ slot->size = size;
+ leaf = skiplist_lookup_leaf(list, &p, hint, 1);
+ if (!p)
+ p = list->head;
+
+ if (leaf && !verify_key_in_leaf(leaf, hint, size)) {
+ goto again;
+ }
+
+again_lock:
+ sl_lock_node(p);
+ if (p->dead) {
+ sl_unlock_node(p);
+ goto again;
+ }
+
+ if (p != list->head) {
+ leaf = sl_entry(p);
+ /*
+ * the leaf we found was past the hint,
+ * go back one
+ */
+ if (sl_max_key(leaf) > hint) {
+ struct sl_node *locked = p;
+ p = p->ptrs[0].prev;
+ sl_unlock_node(locked);
+ goto again_lock;
+ }
+ last_end = align_start(sl_max_key(sl_entry(p)), align);
+ }
+
+ /*
+ * now walk at level 0 and find a hole. We could use lockless walks
+ * if we wanted to bang more on the insertion code, but this
+ * instead holds the lock on each node as we inspect it
+ *
+ * This is a little sloppy, insert will return -EEXIST if we get it
+ * wrong.
+ */
+ while (1) {
+ leaf = sl_next_leaf(list, p, 0);
+ if (!leaf)
+ break;
+
+ /* p and leaf are locked */
+ sl_lock_node(&leaf->node);
+ if (last_end > sl_max_key(leaf))
+ goto next;
+
+ for (i = 0; i < leaf->nr; i++) {
+ if (last_end > leaf->keys[i])
+ continue;
+ if (leaf->keys[i] - last_end >= size) {
+
+ if (last_end + size > limit) {
+ sl_unlock_node(&leaf->node);
+ goto out_rcu;
+ }
+
+ sl_unlock_node(p);
+ slot->key = last_end;
+ slot->size = size;
+ goto try_insert;
+ }
+ last_end = leaf->keys[i] + leaf->ptrs[i]->size;
+ last_end = align_start(last_end, align);
+ if (last_end + size > limit) {
+ sl_unlock_node(&leaf->node);
+ goto out_rcu;
+ }
+ }
+next:
+ sl_unlock_node(p);
+ p = &leaf->node;
+ }
+
+ if (last_end + size <= limit) {
+ sl_unlock_node(p);
+ slot->key = last_end;
+ slot->size = size;
+ goto try_insert;
+ }
+
+out_rcu:
+ /* we've failed */
+ sl_unlock_node(p);
+ rcu_read_unlock();
+ preempt_enable();
+
+ return ret;
+
+try_insert:
+ /*
+ * if the pending_level is zero or there is room in the
+ * leaf, we're ready to insert. This is true most of the
+ * time, and we won't have to drop our lock and give others
+ * the chance to race in and steal our spot.
+ */
+ if (leaf && (pending_level == 0 || leaf->nr < SKIP_KEYS_PER_NODE) &&
+ !leaf->node.dead && (slot->key >= sl_min_key(leaf) &&
+ slot->key + slot->size <= sl_max_key(leaf))) {
+ /* pass null for a cursor, it won't get used */
+ ret = find_or_add_key(list, slot->key, size, leaf, slot,
+ NULL, preload_token);
+ sl_unlock_node(&leaf->node);
+ rcu_read_unlock();
+ goto out;
+ }
+ /*
+ * no such luck, drop our lock and try the insert the
+ * old fashioned way
+ */
+ if (leaf)
+ sl_unlock_node(&leaf->node);
+
+ rcu_read_unlock();
+ ret = skiplist_insert(list, slot, preload_token);
+
+out:
+ /*
+ * if we get -EEXIST here, it just means we lost the race.
+ * return -EAGAIN to the caller so they can update the hint
+ */
+ if (ret == -EEXIST)
+ ret = -EAGAIN;
+
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(skiplist_insert_hole);
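+
+/*
+ * Caller sketch (illustrative only): -EAGAIN means a racing insert took
+ * the spot we found, so refresh the hint and retry; one possible way to
+ * refresh it is sl_highest_key():
+ *
+ *    do {
+ *        ret = skiplist_insert_hole(list, hint, limit, size,
+ *                                   align, slot, GFP_KERNEL);
+ *        if (ret == -EAGAIN)
+ *            hint = sl_highest_key(list);
+ *    } while (ret == -EAGAIN);
+ */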
+
+/*
+ * we erase one level at a time, from top to bottom.
+ * The basic idea is to find a live prev and next pair,
+ * and make them point to each other.
+ *
+ * For a given level, this takes locks on prev, node, next
+ * makes sure they all point to each other and then
+ * removes node from the middle.
+ *
+ * The node must already be marked dead and it must already
+ * be empty.
+ */
+static void erase_one_level(struct sl_list *list,
+ struct sl_node *node, int level)
+{
+ struct sl_node *prev;
+ struct sl_node *next;
+ struct sl_node *test_prev;
+ struct sl_node *test_next;
+
+again:
+ prev = find_live_prev(list, node, level);
+ sl_lock_node(prev);
+ sl_lock_node(node);
+
+ test_prev = find_live_prev(list, node, level);
+ if (test_prev != prev || prev->dead) {
+ sl_unlock_node(prev);
+ sl_unlock_node(node);
+ goto again;
+ }
+
+again_next:
+ next = find_live_next(list, prev, level);
+ if (next) {
+ sl_lock_node(next);
+ test_next = find_live_next(list, prev, level);
+ if (test_next != next || next->dead) {
+ sl_unlock_node(next);
+ goto again_next;
+ }
+ test_prev = find_live_prev(list, next, level);
+ test_next = find_live_next(list, prev, level);
+ if (test_prev != prev || test_next != next) {
+ sl_unlock_node(prev);
+ sl_unlock_node(node);
+ sl_unlock_node(next);
+ goto again;
+ }
+ } else {
+ test_next = find_live_next(list, prev, level);
+ if (test_next) {
+ sl_unlock_node(prev);
+ sl_unlock_node(node);
+ goto again;
+ }
+ }
+ rcu_assign_pointer(prev->ptrs[level].next, next);
+ if (next)
+ rcu_assign_pointer(next->ptrs[level].prev, prev);
+ else if (prev != list->head)
+ rcu_assign_pointer(list->head->ptrs[level].prev, prev);
+ else
+ rcu_assign_pointer(list->head->ptrs[level].prev, NULL);
+
+ sl_unlock_node(prev);
+ sl_unlock_node(node);
+ sl_unlock_node(next);
+}
+
+void sl_erase(struct sl_list *list, struct sl_leaf *leaf)
+{
+ int i;
+ int level = leaf->node.level;
+
+ for (i = level; i >= 0; i--)
+ erase_one_level(list, &leaf->node, i);
+}
+
+/*
+ * helper for skiplist_delete, this pushes pointers
+ * around to remove a single slot
+ */
+static void delete_slot(struct sl_leaf *leaf, int slot)
+{
+ if (slot != leaf->nr - 1) {
+ int i;
+ for (i = slot; i < leaf->nr - 1; i++) {
+ leaf->keys[i] = leaf->keys[i + 1];
+ leaf->ptrs[i] = leaf->ptrs[i + 1];
+ smp_wmb();
+ }
+ } else if (leaf->nr > 1) {
+ leaf->max = leaf->keys[leaf->nr - 2] +
+ leaf->ptrs[leaf->nr - 2]->size;
+ smp_wmb();
+ }
+ leaf->nr--;
+}
+
+/*
+ * find a given [key, size] in the skiplist and remove it.
+ * If we find anything, we return the slot pointer that
+ * was stored in the tree.
+ *
+ * deletion involves a mostly lockless lookup to
+ * find the right leaf. Then we take the lock and find the
+ * correct slot.
+ *
+ * The slot is removed from the leaf, and if the leaf
+ * is now empty, it is removed from the skiplist.
+ *
+ * FIXME -- merge mostly empty leaves.
+ */
+struct sl_slot *skiplist_delete(struct sl_list *list,
+ unsigned long key,
+ unsigned long size)
+{
+ struct sl_slot *slot_ret = NULL;
+ struct sl_leaf *leaf;
+ int slot;
+ int ret;
+
+ rcu_read_lock();
+again:
+ leaf = skiplist_lookup_leaf(list, NULL, key, size);
+ if (!leaf)
+ goto out;
+
+ sl_lock_node(&leaf->node);
+ if (!verify_key_in_leaf(leaf, key, size)) {
+ sl_unlock_node(&leaf->node);
+ goto again;
+ }
+
+ ret = leaf_slot_locked(leaf, key, size, &slot);
+ if (ret == 0) {
+ slot_ret = leaf->ptrs[slot];
+ } else {
+ sl_unlock_node(&leaf->node);
+ goto out;
+ }
+
+ delete_slot(leaf, slot);
+ if (leaf->nr == 0) {
+ /*
+ * sl_erase has to mess with the prev pointers, so
+ * we need to unlock it here
+ */
+ leaf->node.dead = 1;
+ sl_unlock_node(&leaf->node);
+ sl_erase(list, leaf);
+ sl_free_leaf(leaf);
+ } else {
+ sl_unlock_node(&leaf->node);
+ }
+out:
+ rcu_read_unlock();
+ return slot_ret;
+}
+EXPORT_SYMBOL(skiplist_delete);
+
+int sl_init_list(struct sl_list *list, gfp_t mask)
+{
+ int i;
+
+ list->head = kmalloc(sl_node_size(SKIP_MAXLEVEL), mask);
+ if (!list->head)
+ return -ENOMEM;
+ sl_init_node(list->head, SKIP_MAXLEVEL);
+ list->level = 0;
+ spin_lock_init(&list->lock);
+
+ for (i = 0; i < SKIP_MAXLEVEL; i++) {
+ list->head->ptrs[i].next = NULL;
+ list->head->ptrs[i].prev = NULL;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(sl_init_list);
+
+
+static int skiplist_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ struct skip_preload *skp;
+ struct sl_leaf *l;
+ int level;
+ int i;
+
+ /* Free per-cpu pool of preloaded nodes */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ skp = &per_cpu(skip_preloads, cpu);
+ for (i = 0; i < SKIP_MAXLEVEL + 1; i++) {
+ l = skp->preload[i];
+ if (!l)
+ continue;
+ level = l->node.level;
+ kmem_cache_free(slab_caches[level], l);
+ skp->preload[i] = NULL;
+ }
+ }
+ return NOTIFY_OK;
+}
+
+void __init skiplist_init(void)
+{
+ char buffer[16];
+ int i;
+
+ hotcpu_notifier(skiplist_callback, 0);
+ for (i = 0; i < SKIP_MAXLEVEL; i++) {
+ snprintf(buffer, 16, "skiplist-%d", i);
+ slab_caches[i] = kmem_cache_create(buffer,
+ sl_leaf_size(i), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_DESTROY_BY_RCU,
+ NULL);
+ }
+}
+
diff --git a/lib/skiplist_test.c b/lib/skiplist_test.c
new file mode 100644
index 0000000..414b853
--- /dev/null
+++ b/lib/skiplist_test.c
@@ -0,0 +1,688 @@
+/*
+ * (C) 2013 Fusion-io
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <linux/skiplist.h>
+#include <linux/kthread.h>
+#include <linux/rbtree.h>
+#include <linux/random.h>
+
+static int threads = 1;
+static int rounds = 100;
+static int items = 100;
+static int module_exiting;
+static struct completion startup = COMPLETION_INITIALIZER(startup);
+static DEFINE_MUTEX(fill_mutex);
+static int filled;
+
+static struct timespec *times;
+
+
+#define FILL_TIME_INDEX 0
+#define CHECK_TIME_INDEX 1
+#define DEL_TIME_INDEX 2
+#define FIRST_THREAD_INDEX 3
+
+#define SKIPLIST_RCU_BENCH 1
+#define SKIPLIST_BENCH 2
+#define RBTREE_BENCH 3
+
+static int benchmark = SKIPLIST_RCU_BENCH;
+
+module_param(threads, int, 0);
+module_param(rounds, int, 0);
+module_param(items, int, 0);
+module_param(benchmark, int, 0);
+
+MODULE_PARM_DESC(threads, "number of threads to run");
+MODULE_PARM_DESC(rounds, "how many random operations to run");
+MODULE_PARM_DESC(items, "number of items to fill the list with");
+MODULE_PARM_DESC(benchmark, "benchmark to run 1=skiplist-rcu 2=skiplist-locking 3=rbtree");
+MODULE_LICENSE("GPL");
+
+static atomic_t threads_running = ATOMIC_INIT(0);
+
+/*
+ * since the skiplist code is more concurrent, it is also more likely to
+ * have races into the same slot during our delete/insert bashing run.
+ * This counts the number of delete/insert pairs done so we can
+ * make sure the results are roughly accurate
+ */
+static atomic_t pops_done = ATOMIC_INIT(0);
+
+static struct kmem_cache *slot_cache;
+struct sl_list skiplist;
+
+spinlock_t rbtree_lock;
+struct rb_root rb_root = RB_ROOT;
+
+struct rbtree_item {
+ struct rb_node rb_node;
+ unsigned long key;
+ unsigned long size;
+};
+
+static int __insert_one_rbtree(struct rb_root *root, struct rbtree_item *ins)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rbtree_item *item;
+ unsigned long key = ins->key;
+ int ret = -EEXIST;
+
+ while (*p) {
+ parent = *p;
+ item = rb_entry(parent, struct rbtree_item, rb_node);
+
+ if (key < item->key)
+ p = &(*p)->rb_left;
+ else if (key >= item->key + item->size)
+ p = &(*p)->rb_right;
+ else
+ goto out;
+ }
+
+ rb_link_node(&ins->rb_node, parent, p);
+ rb_insert_color(&ins->rb_node, root);
+ ret = 0;
+out:
+
+ return ret;
+}
+
+static int insert_one_rbtree(struct rb_root *root, unsigned long key,
+ unsigned long size)
+{
+ int ret;
+ struct rbtree_item *ins;
+
+ ins = kmalloc(sizeof(*ins), GFP_KERNEL);
+ if (!ins)
+ return -ENOMEM;
+ ins->key = key;
+ ins->size = size;
+
+ spin_lock(&rbtree_lock);
+ ret = __insert_one_rbtree(root, ins);
+ spin_unlock(&rbtree_lock);
+
+ if (ret) {
+ printk(KERN_CRIT "err %d inserting rbtree key %lu\n", ret, key);
+ kfree(ins);
+ }
+ return ret;
+}
+
+static struct rbtree_item *__lookup_one_rbtree(struct rb_root *root, unsigned long key)
+{
+ struct rb_node *p = root->rb_node;
+ struct rbtree_item *item;
+ struct rbtree_item *ret;
+
+ while (p) {
+ item = rb_entry(p, struct rbtree_item, rb_node);
+
+ if (key < item->key)
+ p = p->rb_left;
+ else if (key >= item->key + item->size)
+ p = p->rb_right;
+ else {
+ ret = item;
+ goto out;
+ }
+ }
+
+ ret = NULL;
+out:
+ return ret;
+}
+
+static int lookup_one_rbtree(struct rb_root *root, unsigned long key)
+{
+ struct rbtree_item *item;
+ int ret;
+
+ spin_lock(&rbtree_lock);
+ item = __lookup_one_rbtree(root, key);
+ if (item)
+ ret = 0;
+ else
+ ret = -ENOENT;
+ spin_unlock(&rbtree_lock);
+
+ return ret;
+}
+
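+/*
+ * rbtree half of the delete/insert bashing pass: erase a batch of adjacent
+ * items under rbtree_lock and then reinsert them all.  This mirrors
+ * pop_one_skiplist() below so the two benchmarks do comparable work.
+ */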
+static int pop_one_rbtree(struct rb_root *root, unsigned long key)
+{
+ int ret = 0;
+ struct rbtree_item *item;
+ struct rbtree_item **victims;
+ int nr_victims = 128;
+ int found = 0;
+ int loops = 0;
+ int i;
+
+ nr_victims = min(nr_victims, items / 2);
+
+	victims = kzalloc(nr_victims * sizeof(victims[0]), GFP_KERNEL);
+	if (!victims)
+		return -ENOMEM;
+
+ /*
+ * this is intentionally deleting adjacent items to empty
+ * skiplist leaves. The goal is to find races between
+ * leaf deletion and the rest of the code
+ */
+ while (found < nr_victims && loops < 256) {
+ loops++;
+
+ spin_lock(&rbtree_lock);
+ item = __lookup_one_rbtree(root, key + loops * 4096);
+ if (item) {
+ victims[found] = item;
+ atomic_inc(&pops_done);
+ rb_erase(&item->rb_node, root);
+ found++;
+ }
+ spin_unlock(&rbtree_lock);
+ cond_resched();
+ }
+
+ for (i = 0; i < found; i++) {
+ item = victims[i];
+ spin_lock(&rbtree_lock);
+ ret = __insert_one_rbtree(root, item);
+ if (ret) {
+			printk(KERN_CRIT "pop_one unable to insert %lu\n", item->key);
+ kfree(item);
+ }
+ spin_unlock(&rbtree_lock);
+ cond_resched();
+ }
+ kfree(victims);
+ return ret;
+}
+
+static int run_initial_fill_rbtree(void)
+{
+ unsigned long i;
+ unsigned long key;
+ int ret;
+ int inserted = 0;
+
+ for (i = 0; i < items; i++) {
+ key = i * 4096;
+ ret = insert_one_rbtree(&rb_root, key, 4096);
+ if (ret)
+ return ret;
+ inserted++;
+ }
+ printk("rbtree inserted %d items\n", inserted);
+ return 0;
+}
+
+static void check_post_work_rbtree(void)
+{
+ unsigned long i;
+ unsigned long key;
+ int errors = 0;
+ int ret;
+
+ for (i = 0; i < items; i++) {
+ key = i * 4096;
+ ret = lookup_one_rbtree(&rb_root, key);
+ if (ret) {
+ printk("rbtree failed to find key %lu\n", key);
+ errors++;
+ }
+ cond_resched();
+ }
+ printk(KERN_CRIT "rbtree check found %d errors\n", errors);
+}
+
+static void delete_all_items_rbtree(void)
+{
+ unsigned long i;
+ unsigned long key;
+ int mid = items / 2;
+ int bounce;
+ struct rbtree_item *item;
+
+ for (i = 0; i < mid; i++) {
+ bounce = 0;
+ key = i * 4096;
+again:
+		spin_lock(&rbtree_lock);
+		item = __lookup_one_rbtree(&rb_root, key);
+		if (item)
+			rb_erase(&item->rb_node, &rb_root);
+		else
+			printk(KERN_CRIT "delete_all unable to find %lu\n", key);
+		spin_unlock(&rbtree_lock);
+		kfree(item);
+
+ if (!bounce) {
+ key = (items - 1 - i) * 4096;
+ bounce = 1;
+ goto again;
+ }
+ }
+}
+
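+/*
+ * allocate a slot from slot_cache and insert it into the skiplist.  The
+ * preload/insert/preempt_enable() pattern assumes skiplist_preload() works
+ * like radix_tree_preload() and returns with preemption disabled on success.
+ */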
+static int insert_one_skiplist(struct sl_list *skiplist, unsigned long key,
+ unsigned long size)
+{
+ int ret;
+ int preload_token;
+ struct sl_slot *slot;
+
+ slot = kmem_cache_alloc(slot_cache, GFP_KERNEL);
+ if (!slot)
+ return -ENOMEM;
+
+ slot->key = key;
+ slot->size = size;
+
+ preload_token = skiplist_preload(skiplist, GFP_KERNEL);
+ if (preload_token < 0) {
+ ret = preload_token;
+ goto out;
+ }
+
+ ret = skiplist_insert(skiplist, slot, preload_token);
+ preempt_enable();
+
+out:
+ if (ret)
+ kmem_cache_free(slot_cache, slot);
+
+ return ret;
+}
+
+static unsigned long tester_random(void)
+{
+ return prandom_u32();
+}
+
+static int run_initial_fill_skiplist(void)
+{
+ unsigned long i;
+ unsigned long key;
+ int ret;
+ int inserted = 0;
+
+ sl_init_list(&skiplist, GFP_KERNEL);
+
+ for (i = 0; i < items; i++) {
+ key = i * 4096;
+ ret = insert_one_skiplist(&skiplist, key, 4096);
+ if (ret)
+ return ret;
+ inserted++;
+ }
+ printk("skiplist inserted %d items\n", inserted);
+ return 0;
+}
+
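+/*
+ * look every expected key up again, but at an offset into its 4096 byte
+ * range (key + 64) so the range-search path gets exercised.  The rcu
+ * variant retries when a racing delete/insert hands back a mismatched slot.
+ */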
+static void check_post_work_skiplist(void)
+{
+ unsigned long i;
+ unsigned long key;
+ struct sl_slot *slot;
+ int errors = 0;
+
+ for (i = 0; i < items; i++) {
+ key = i * 4096;
+ if (benchmark == SKIPLIST_RCU_BENCH) {
+ rcu_read_lock();
+again:
+ slot = skiplist_lookup_rcu(&skiplist, key + 64, 512);
+			/*
+			 * a racing delete/insert can hand back a slot for a
+			 * different key; retry until the lookup settles
+			 */
+			if (slot && slot->key != key)
+				goto again;
+ rcu_read_unlock();
+ } else {
+ slot = skiplist_lookup(&skiplist, key + 64, 512);
+ }
+
+ if (!slot) {
+ printk("failed to find key %lu\n", key);
+ errors++;
+ } else if (slot->key != key) {
+ errors++;
+ printk("key mismatch wanted %lu found %lu\n", key, slot->key);
+ }
+ cond_resched();
+ }
+ printk(KERN_CRIT "skiplist check found %d errors\n", errors);
+}
+
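+/*
+ * single threaded walk of the level 0 list: the slots in each leaf should
+ * form one contiguous run of keys, and the total count should equal
+ * 'items'.  Run only after all worker threads have finished.
+ */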
+static void verify_post_work_skiplist(void)
+{
+ unsigned long i;
+ unsigned long key = 0;
+ struct sl_slot *slot;
+ struct sl_node *node = skiplist.head->ptrs[0].next;
+ struct sl_leaf *leaf;
+ int found = 0;
+
+ while (node) {
+ leaf = sl_entry(node);
+ for (i = 0; i < leaf->nr; i++) {
+ slot = leaf->ptrs[i];
+ if (slot->key != key) {
+ printk(KERN_CRIT "found bad key %lu wanted %lu\n", slot->key, key);
+ }
+ key += slot->size;
+ }
+ found += leaf->nr;
+ node = node->ptrs[0].next;
+ }
+ if (found != items) {
+ printk(KERN_CRIT "skiplist check found only %d items instead of %d\n", found, items);
+ } else {
+ printk(KERN_CRIT "skiplist verify passed\n");
+ }
+}
+
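+/*
+ * empty the skiplist by alternating deletes from the front and back of the
+ * key space, passing an offset key (key + 512) so the delete has to do a
+ * range match rather than an exact one.
+ */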
+static void delete_all_items_skiplist(void)
+{
+ unsigned long i;
+ unsigned long key;
+ struct sl_slot *slot;
+ int errors = 0;
+ int mid = items / 2;
+ int bounce;
+
+ for (i = 0; i < mid; i++) {
+ bounce = 0;
+ key = i * 4096;
+again:
+		slot = skiplist_delete(&skiplist, key + 512, 1);
+		if (!slot) {
+			printk("missing key %lu\n", key);
+		} else {
+			if (slot->key != key) {
+				errors++;
+				printk("key mismatch wanted %lu found %lu\n", key, slot->key);
+			}
+			/* slots came from slot_cache, so free them back to it */
+			kmem_cache_free(slot_cache, slot);
+		}
+ if (!bounce) {
+ key = (items - 1 - i) * 4096;
+ bounce = 1;
+ goto again;
+ }
+ }
+	printk(KERN_CRIT "skiplist deletion done, %d errors\n", errors);
+}
+
+static int lookup_one_skiplist(struct sl_list *skiplist, unsigned long key)
+{
+ int ret = 0;
+	struct sl_slot *slot = NULL;
+
+ if (benchmark == SKIPLIST_RCU_BENCH)
+ slot = skiplist_lookup_rcu(skiplist, key, 4096);
+ else if (benchmark == SKIPLIST_BENCH)
+ slot = skiplist_lookup(skiplist, key, 4096);
+ if (!slot)
+ ret = -ENOENT;
+ return ret;
+}
+
+static int pop_one_skiplist(struct sl_list *skiplist, unsigned long key)
+{
+ int ret = 0;
+ int preload_token;
+ struct sl_slot *slot;
+ struct sl_slot **victims;
+ int nr_victims = 128;
+ int found = 0;
+ int loops = 0;
+ int i;
+
+ nr_victims = min(nr_victims, items / 2);
+
+	victims = kzalloc(nr_victims * sizeof(victims[0]), GFP_KERNEL);
+	if (!victims)
+		return -ENOMEM;
+
+ /*
+ * this is intentionally deleting adjacent items to empty
+ * skiplist leaves. The goal is to find races between
+ * leaf deletion and the rest of the code
+ */
+ while (found < nr_victims && loops < 256) {
+ loops++;
+ slot = skiplist_delete(skiplist, key + loops * 4096, 1024);
+ if (!slot)
+ continue;
+
+ victims[found] = slot;
+ atomic_inc(&pops_done);
+ found++;
+ cond_resched();
+ }
+ for (i = 0; i < found; i++) {
+ preload_token = skiplist_preload(skiplist, GFP_KERNEL);
+ if (preload_token < 0) {
+ ret = preload_token;
+ goto out;
+ }
+
+ ret = skiplist_insert(skiplist, victims[i], preload_token);
+ if (ret) {
+ printk(KERN_CRIT "failed to insert key %lu ret %d\n", key, ret);
+ preempt_enable();
+ goto out;
+ }
+ ret = 0;
+ preempt_enable();
+ cond_resched();
+ }
+
+out:
+ kfree(victims);
+ return ret;
+}
+
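+/* tdiff = t1 - t0, clamped to zero if the clock appears to go backwards */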
+static void tvsub(struct timespec *tdiff, struct timespec *t1, struct timespec *t0)
+{
+ tdiff->tv_sec = t1->tv_sec - t0->tv_sec;
+ tdiff->tv_nsec = t1->tv_nsec - t0->tv_nsec;
+ if (tdiff->tv_nsec < 0 && tdiff->tv_sec > 0) {
+ tdiff->tv_sec--;
+ tdiff->tv_nsec += 1000000000ULL;
+ }
+
+ /* time shouldn't go backwards!!! */
+ if (tdiff->tv_nsec < 0 || t1->tv_sec < t0->tv_sec) {
+ tdiff->tv_sec = 0;
+ tdiff->tv_nsec = 0;
+ }
+}
+
+static void pretty_time(struct timespec *ts, unsigned long long *seconds, unsigned long long *ms)
+{
+ unsigned long long m;
+
+ *seconds = ts->tv_sec;
+
+ m = ts->tv_nsec / 1000000ULL;
+ *ms = m;
+}
+
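+/*
+ * main benchmark body, run by every worker thread.  The first thread to
+ * take fill_mutex populates the index, then all threads do 'rounds' random
+ * lookup/pop operations.  The last thread to finish runs the post-work
+ * checks, deletes everything and prints the timings.
+ */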
+static void runbench(int thread_index)
+{
+ int ret = 0;
+ unsigned long i;
+ unsigned long op;
+ unsigned long key;
+ struct timespec start;
+ struct timespec cur;
+ unsigned long long sec;
+ unsigned long long ms;
+ char *tag = "skiplist-rcu";
+
+ if (benchmark == SKIPLIST_BENCH)
+ tag = "skiplist-locking";
+ else if (benchmark == RBTREE_BENCH)
+ tag = "rbtree";
+
+ mutex_lock(&fill_mutex);
+
+ if (filled == 0) {
+ start = current_kernel_time();
+
+ printk(KERN_CRIT "Running %s benchmark\n", tag);
+
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ ret = run_initial_fill_skiplist();
+ else if (benchmark == RBTREE_BENCH)
+ ret = run_initial_fill_rbtree();
+
+ if (ret < 0) {
+ printk(KERN_CRIT "failed to setup initial tree ret %d\n", ret);
+ filled = ret;
+ } else {
+ filled = 1;
+ }
+ cur = current_kernel_time();
+ tvsub(times + FILL_TIME_INDEX, &cur, &start);
+ }
+
+ mutex_unlock(&fill_mutex);
+ if (filled < 0)
+ return;
+
+ start = current_kernel_time();
+
+ for (i = 0; i < rounds; i++) {
+ op = tester_random();
+ key = op % items;
+ key *= 4096;
+ if (op % 2 == 0) {
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ ret = lookup_one_skiplist(&skiplist, key);
+ else if (benchmark == RBTREE_BENCH)
+ ret = lookup_one_rbtree(&rb_root, key);
+ }
+ if (op % 3 == 0) {
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ ret = pop_one_skiplist(&skiplist, key);
+ else if (benchmark == RBTREE_BENCH)
+ ret = pop_one_rbtree(&rb_root, key);
+ }
+ cond_resched();
+ }
+
+ cur = current_kernel_time();
+ tvsub(times + FIRST_THREAD_INDEX + thread_index, &cur, &start);
+
+	if (!atomic_dec_and_test(&threads_running))
+		return;
+
+ start = current_kernel_time();
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ check_post_work_skiplist();
+ else if (benchmark == RBTREE_BENCH)
+ check_post_work_rbtree();
+
+ cur = current_kernel_time();
+
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ verify_post_work_skiplist();
+
+ tvsub(times + CHECK_TIME_INDEX, &cur, &start);
+
+ start = current_kernel_time();
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ delete_all_items_skiplist();
+ else if (benchmark == RBTREE_BENCH)
+ delete_all_items_rbtree();
+ cur = current_kernel_time();
+
+ tvsub(times + DEL_TIME_INDEX, &cur, &start);
+
+	pretty_time(&times[FILL_TIME_INDEX], &sec, &ms);
+	printk("%s fill time %llu s %llu ms\n", tag, sec, ms);
+	pretty_time(&times[CHECK_TIME_INDEX], &sec, &ms);
+	printk("%s check time %llu s %llu ms\n", tag, sec, ms);
+	pretty_time(&times[DEL_TIME_INDEX], &sec, &ms);
+	printk("%s del time %llu s %llu ms\n", tag, sec, ms);
+	for (i = 0; i < threads; i++) {
+		pretty_time(&times[FIRST_THREAD_INDEX + i], &sec, &ms);
+		printk("%s thread %lu time %llu s %llu ms\n", tag, i, sec, ms);
+ }
+
+ printk("worker thread pops done %d\n", atomic_read(&pops_done));
+
+ kfree(times);
+}
+
+static int skiptest_thread(void *index)
+{
+ unsigned long thread_index = (unsigned long)index;
+ complete(&startup);
+ runbench(thread_index);
+ complete(&startup);
+ return 0;
+}
+
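+/*
+ * module init: create the slot cache, allocate one timespec per timing
+ * bucket plus one per worker thread, then start the threads and wait for
+ * each of them to check in.
+ */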
+static int __init skiptest_init(void)
+{
+ unsigned long i;
+ init_completion(&startup);
+ slot_cache = kmem_cache_create("skiplist_slot", sizeof(struct sl_slot), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_DESTROY_BY_RCU, NULL);
+
+ if (!slot_cache)
+ return -ENOMEM;
+
+ spin_lock_init(&rbtree_lock);
+
+ printk("skiptest benchmark module (%d threads) (%d items) (%d rounds)\n", threads, items, rounds);
+
+	times = kmalloc(sizeof(times[0]) * (threads + FIRST_THREAD_INDEX), GFP_KERNEL);
+	if (!times) {
+		kmem_cache_destroy(slot_cache);
+		return -ENOMEM;
+	}
+
+ atomic_set(&threads_running, threads);
+ for (i = 0; i < threads; i++) {
+ kthread_run(skiptest_thread, (void *)i, "skiptest_thread");
+ }
+ for (i = 0; i < threads; i++)
+ wait_for_completion(&startup);
+ return 0;
+}
+
+static void __exit skiptest_exit(void)
+{
+	int i;
+
+	module_exiting = 1;
+
+ for (i = 0; i < threads; i++) {
+ wait_for_completion(&startup);
+ }
+
+ synchronize_rcu();
+ kmem_cache_destroy(slot_cache);
+ printk("all skiptest threads done\n");
+}
+
+module_init(skiptest_init);
+module_exit(skiptest_exit);
--
1.8.2