From: Chris Mason <chris.mason@fusionio.com>
To: Chris Mason <clmason@fusionio.com>,
Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linux FS Devel <linux-fsdevel@vger.kernel.org>,
David Woodhouse <David.Woodhouse@intel.com>,
"dchinner@redhat.com" <dchinner@redhat.com>,
"bo.li.liu@oracle.com" <bo.li.liu@oracle.com>,
"rp@svcs.cs.pdx.edu" <rp@svcs.cs.pdx.edu>,
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
Lai Jiangshan <laijs@cn.fujitsu.com>,
Stephen Hemminger <shemminger@vyatta.com>,
Alan Stern <stern@rowland.harvard.edu>
Subject: [PATCH RFC 1/2] Skiplist: rcu range index
Date: Sun, 16 Jun 2013 10:57:25 -0400 [thread overview]
Message-ID: <20130616145725.4914.62425@localhost.localdomain> (raw)
In-Reply-To: <20130616145612.4914.3009@localhost.localdomain>
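
This adds a basic concurrent skiplist plus a more cache friendly
variant built on top of it for indexing ranges of keys.  Callers embed
a struct sl_slot in their own objects and drive the API declared in
skiplist.h (the list itself is set up with sl_init_list()).  A rough
usage sketch follows; the struct my_extent wrapper and the variable
names are hypothetical, not part of this patch:

    struct my_extent {
        ...
        struct sl_slot slot;    /* key, size and refs live here */
    };

    /* insertion: stock the per-cpu preload, then insert */
    token = skiplist_preload(&list, GFP_KERNEL);
    if (token < 0)
        return token;
    extent->slot.key = start;
    extent->slot.size = len;
    ret = skiplist_insert(&list, &extent->slot, token, NULL);
    preempt_enable();

    /* locked lookup: returns the slot with a reference held */
    slot = skiplist_lookup(&list, start, len);
    if (slot)
        extent = sl_slot_entry(slot, struct my_extent, slot);
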
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
---
include/linux/skiplist.h | 235 ++++++
init/main.c | 5 +
lib/Kconfig | 14 +
lib/Makefile | 3 +
lib/skiplist.c | 2106 ++++++++++++++++++++++++++++++++++++++++++++++
lib/skiplist_test.c | 882 +++++++++++++++++++
6 files changed, 3245 insertions(+)
create mode 100644 include/linux/skiplist.h
create mode 100644 lib/skiplist.c
create mode 100644 lib/skiplist_test.c
diff --git a/include/linux/skiplist.h b/include/linux/skiplist.h
new file mode 100644
index 0000000..04cfe5e
--- /dev/null
+++ b/include/linux/skiplist.h
@@ -0,0 +1,235 @@
+/*
+ * (C) 2011 Liu Bo <liubo2009@cn.fujitsu.com>
+ * (C) 2013 Fusion-io
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _SKIPLIST_H
+#define _SKIPLIST_H
+
+#include <linux/spinlock.h>
+
+/*
+ * This includes a basic skiplist implementation and builds a more
+ * cache friendly variant on top meant to index ranges of keys.
+ *
+ * Our random level generator uses P = 0.5, so as a rough metric we get
+ * good performance for lists of up to 2^SKIP_MAXLEVEL leaves.  Since
+ * each leaf holds an array of keys, that works out to roughly
+ * 2^SKIP_MAXLEVEL * SKIP_KEYS_PER_NODE indexed keys.
+ */
+#define SKIP_MAXLEVEL 32 /* skiplist_get_new_level requires <= 32 */
+
+struct sl_node_ptr {
+ struct sl_node *prev;
+ struct sl_node *next;
+};
+
+/* possible values for sl_node->pending */
+#define SKIPLIST_LIVE 0 /* normal operations */
+#define SKIPLIST_PENDING_DEAD 1 /* active unlink in progress */
+#define SKIPLIST_PENDING_INSERT 2 /* active link in progress */
+
+/*
+ * sl_node must be last in the leaf data struct. Allocate enough
+ * ram for a given size using either the sl_ptrs_size or sl_node_size
+ * helpers.
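+ * (For example, something along the lines of
+ * kmalloc(sl_node_size(level), gfp) for a bare node or
+ * kmalloc(sl_leaf_size(level), gfp) for a full leaf.)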
+ */
+struct sl_node {
+ /* level tells us how big the ptrs array is. It cannot change */
+ unsigned long level;
+
+ /* pending is used to indicate a delete or link in progress */
+ unsigned long pending;
+
+ spinlock_t lock;
+
+ /*
+ * ptrs must be at the bottom because it is variably sized
+ * based on the level
+ */
+ struct sl_node_ptr ptrs[];
+};
+
+/*
+ * The head list structure. The head node has no data,
+ * but it does have the full array of pointers for all the levels.
+ */
+struct sl_list {
+ /*
+ * in the head pointer, we use head->prev to point to
+ * the highest item in the list. But, the last item does
+ * not point back to the head. The head->prev items
+ * are protected by the node lock on the last item
+ */
+ struct sl_node *head;
+ unsigned int level;
+};
+
+/*
+ * If you are indexing extents, embed sl_slots into your structure and use
+ * sl_slot_entry to pull out your real struct. The key and size must not
+ * change if you're using rcu.
+ */
+struct sl_slot {
+ /*
+ * when rcu is on, we use this key to verify the pointer we pull
+ * out of the array. It must not change once the object is
+ * inserted
+ */
+ unsigned long key;
+
+ /*
+ * the range searching functions follow pointers into this slot
+ * struct and use this size field to find out how big the
+ * range is.
+ */
+ unsigned long size;
+
+ /*
+ * a ref is taken when the slot is inserted and during
+ * non-rcu lookups. It's the caller's job to drop the
+ * ref after a deletion, and to handle any ref counting
+ * needed for rcu lookups.
+ */
+ atomic_t refs;
+};
+
+/*
+ * Larger values here make us faster when single threaded. Lower values
+ * increase cache misses but give more chances for concurrency.
+ */
+#define SKIP_KEYS_PER_NODE 32
+
+/*
+ * For indexing extents, this is a leaf in our skip list tree.
+ * Each leaf has a number of pointers and the max field
+ * is used to figure out the key space covered.
+ */
+struct sl_leaf {
+ /* number of valid keys/ptrs in this leaf */
+ int nr;
+
+ /* reference count for this leaf */
+ atomic_t refs;
+
+ /*
+ * max value of the range covered by this leaf. This
+ * includes the size field of the very last extent,
+ * so max = keys[last_index] + ptrs[last_index]->size
+ */
+ unsigned long max;
+
+ /*
+ * sorted, all the keys
+ */
+ unsigned long keys[SKIP_KEYS_PER_NODE];
+
+ /*
+ * data pointers corresponding to the keys
+ */
+ struct sl_slot *ptrs[SKIP_KEYS_PER_NODE];
+
+ /* for freeing our objects after the grace period */
+ struct rcu_head rcu_head;
+
+ /* this needs to be at the end. The size changes based on the level */
+ struct sl_node node;
+};
+
+/*
+ * for a given level, how much memory we need for an array of
+ * all the pointers
+ */
+static inline int sl_ptrs_size(int level)
+{
+ return sizeof(struct sl_node_ptr) * (level + 1);
+}
+
+/*
+ * for a given level, how much memory we need for the
+ * array of pointers and the sl_node struct
+ */
+static inline int sl_node_size(int level)
+{
+ return sizeof(struct sl_node) + sl_ptrs_size(level);
+}
+
+static inline int sl_leaf_size(int level)
+{
+ return sizeof(struct sl_leaf) + sl_ptrs_size(level);
+}
+
+#define sl_entry(ptr) container_of((ptr), struct sl_leaf, node)
+#define sl_slot_entry(ptr, type, member) container_of(ptr, type, member)
+
+static inline int sl_empty(const struct sl_node *head)
+{
+ return head->ptrs[0].next == NULL;
+}
+
+static inline int sl_node_dead(struct sl_node *node)
+{
+ return node->pending == SKIPLIST_PENDING_DEAD;
+}
+
+static inline int sl_node_inserting(struct sl_node *node)
+{
+ return node->pending == SKIPLIST_PENDING_INSERT;
+}
+
+int skiplist_preload(struct sl_list *list, gfp_t gfp_mask);
+int skiplist_get_new_level(struct sl_list *list, int max_level);
+int skiplist_insert(struct sl_list *list, struct sl_slot *slot,
+ int preload_token, struct sl_leaf **cache);
+int sl_init_list(struct sl_list *list, gfp_t mask);
+struct sl_slot *skiplist_lookup(struct sl_list *list, unsigned long key, unsigned long size);
+struct sl_slot *skiplist_lookup_rcu(struct sl_list *list, unsigned long key, unsigned long size);
+struct sl_slot *skiplist_delete(struct sl_list *list, unsigned long key, unsigned long size);
+int skiplist_insert_hole(struct sl_list *list, unsigned long hint,
+ unsigned long limit,
+ unsigned long size, unsigned long align,
+ struct sl_slot *slot,
+ gfp_t gfp_mask);
+void sl_lock_node(struct sl_node *n);
+void sl_unlock_node(struct sl_node *n);
+unsigned long sl_highest_key(struct sl_list *list);
+int skiplist_search_leaf(struct sl_leaf *leaf, unsigned long key,
+ unsigned long size, int *slot);
+struct sl_leaf *skiplist_lookup_leaf(struct sl_list *list,
+ unsigned long key,
+ unsigned long size);
+struct sl_leaf *skiplist_lookup_first_leaf(struct sl_list *list,
+ unsigned long key,
+ unsigned long size);
+struct sl_leaf *skiplist_lookup_leaf_rcu(struct sl_list *list,
+ unsigned long key,
+ unsigned long size);
+struct sl_leaf *skiplist_lookup_first_leaf_rcu(struct sl_list *list,
+ unsigned long key,
+ unsigned long size);
+void skiplist_unlock_leaf(struct sl_leaf *leaf);
+void skiplist_lock_leaf(struct sl_leaf *leaf);
+void skiplist_delete_leaf(struct sl_list *list, struct sl_leaf *leaf,
+ struct sl_leaf **cache_next);
+struct sl_leaf *skiplist_next_leaf(struct sl_list *list,
+ struct sl_leaf *leaf);
+struct sl_leaf *skiplist_next_leaf_rcu(struct sl_list *list,
+ struct sl_leaf *leaf);
+struct sl_leaf *skiplist_first_leaf(struct sl_list *list);
+struct sl_leaf *skiplist_first_leaf_rcu(struct sl_list *list);
+void skiplist_get_leaf(struct sl_leaf *leaf);
+int skiplist_get_leaf_not_zero(struct sl_leaf *leaf);
+void skiplist_put_leaf(struct sl_leaf *leaf);
+void skiplist_wait_pending_insert(struct sl_node *node);
+#endif /* _SKIPLIST_H */
diff --git a/init/main.c b/init/main.c
index 63534a1..ad9b631 100644
--- a/init/main.c
+++ b/init/main.c
@@ -90,6 +90,7 @@ extern void fork_init(unsigned long);
extern void mca_init(void);
extern void sbus_init(void);
extern void radix_tree_init(void);
+extern void skiplist_init(void);
#ifndef CONFIG_DEBUG_RODATA
static inline void mark_rodata_ro(void) { }
#endif
@@ -566,6 +567,10 @@ asmlinkage void __init start_kernel(void)
kmem_cache_init_late();
+#ifdef CONFIG_SKIPLIST
+ skiplist_init();
+#endif
+
/*
* HACK ALERT! This is early. We're enabling the console before
* we've done PCI setups etc, and console_init() must be aware of
diff --git a/lib/Kconfig b/lib/Kconfig
index fe01d41..94bb235 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -393,6 +393,20 @@ config SIGNATURE
Digital signature verification. Currently only RSA is supported.
Implementation is done using GnuPG MPI library
+config SKIPLIST
+ bool "Concurrent Skiplists"
+ default n
+ help
+ Concurrent skiplist indexing trees. Just say N
+
+config SKIPLIST_TEST
+ tristate "Skiplist testing module"
+ default n
+ select SKIPLIST
+ help
+ Testing module for the skiplist implementation
+
+#
#
# libfdt files, only selected if needed.
#
diff --git a/lib/Makefile b/lib/Makefile
index 6e2cc56..7c14a12 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -108,6 +108,9 @@ obj-$(CONFIG_NLATTR) += nlattr.o
obj-$(CONFIG_LRU_CACHE) += lru_cache.o
+obj-$(CONFIG_SKIPLIST) += skiplist.o
+obj-$(CONFIG_SKIPLIST_TEST) += skiplist_test.o
+
obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o
obj-$(CONFIG_GENERIC_CSUM) += checksum.o
diff --git a/lib/skiplist.c b/lib/skiplist.c
new file mode 100644
index 0000000..02cf04e
--- /dev/null
+++ b/lib/skiplist.c
@@ -0,0 +1,2106 @@
+/*
+ * (C) 2011 Liu Bo <liubo2009@cn.fujitsu.com>
+ * (C) 2013 Fusion-io
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+#include <linux/rcupdate.h>
+#include <linux/random.h>
+#include <linux/skiplist.h>
+#include <linux/lockdep.h>
+#include <linux/sched.h>
+
+static struct kmem_cache *slab_caches[SKIP_MAXLEVEL];
+
+/*
+ * before starting an insert, we preload based on the current
+ * height of the list. This holds the preload result
+ * in a per-cpu array
+ */
+struct skip_preload {
+ /*
+ * the preload is filled in based on the highest possible level
+ * of the list you're preloading. So we basically end up with
+ * one preloaded node for each max size.
+ */
+ struct sl_leaf *preload[SKIP_MAXLEVEL + 1];
+};
+
+static DEFINE_PER_CPU(struct skip_preload, skip_preloads) = { {NULL,}, };
+
+static void sl_init_node(struct sl_node *node, int level)
+{
+ int i;
+ spin_lock_init(&node->lock);
+
+ /*
+ * from a locking point of view, the skiplists are a dumb linked
+ * list where we take up to three locks in order from left to
+ * right. I haven't been able to teach lockdep how to do this
+ * yet
+ */
+ lockdep_set_novalidate_class(&node->lock);
+
+ for (i = 0; i <= level; i++) {
+ node->ptrs[i].prev = NULL;
+ node->ptrs[i].next = NULL;
+ }
+ node->level = level;
+ node->pending = SKIPLIST_LIVE;
+}
+
+static void sl_init_leaf(struct sl_leaf *leaf, int level)
+{
+ atomic_set(&leaf->refs, 1);
+ sl_init_node(&leaf->node, level);
+}
+
+/*
+ * the rcu based searches need to block reuse until a given search round
+ * is done. So, we use call_rcu for freeing the leaf structure.
+ */
+static void sl_free_rcu(struct rcu_head *head)
+{
+ struct sl_leaf *leaf = container_of(head, struct sl_leaf, rcu_head);
+ kmem_cache_free(slab_caches[leaf->node.level], leaf);
+}
+
+void skiplist_get_leaf(struct sl_leaf *leaf)
+{
+ atomic_inc(&leaf->refs);
+}
+EXPORT_SYMBOL(skiplist_get_leaf);
+
+int skiplist_get_leaf_not_zero(struct sl_leaf *leaf)
+{
+ return atomic_inc_not_zero(&leaf->refs);
+}
+EXPORT_SYMBOL(skiplist_get_leaf_not_zero);
+
+void skiplist_put_leaf(struct sl_leaf *leaf)
+{
+ BUG_ON(atomic_read(&leaf->refs) == 0);
+ if (atomic_dec_and_test(&leaf->refs))
+ call_rcu(&leaf->rcu_head, sl_free_rcu);
+}
+EXPORT_SYMBOL(skiplist_put_leaf);
+
+/*
+ * if a node is currently inserting, this spins until the
+ * insertion is complete. Calling this with another node
+ * locked usually leads to deadlocks because the linking
+ * needs to take neighboring locks.
+ */
+void skiplist_wait_pending_insert(struct sl_node *node)
+{
+ while (sl_node_inserting(node)) {
+ cpu_relax();
+ smp_rmb();
+ }
+}
+EXPORT_SYMBOL(skiplist_wait_pending_insert);
+
+/*
+ * helper function to pull out the next live leaf at a given level.
+ * no locks are required or taken
+ *
+ * Note that if this returns NULL, you may have to wait for a pending
+ * insertion on 'node' before you can trust the answer
+ */
+static inline struct sl_leaf *sl_next_leaf(struct sl_list *list,
+ struct sl_node *node, int l)
+{
+ do {
+ node = rcu_dereference(node->ptrs[l].next);
+ if (!node)
+ return NULL;
+ if (!sl_node_dead(node))
+ return sl_entry(node);
+ } while (1);
+}
+
+/*
+ * helper functions to wade through dead nodes pending deletion
+ * and return live ones. This does wait on pending insertions
+ * as it walks backwards.
+ */
+static struct sl_node *find_live_prev(struct sl_list *list,
+ struct sl_node *node, int level)
+{
+ struct sl_node *prev = NULL;
+ /* head->prev points to the max, this makes sure we don't loop */
+ if (node == list->head)
+ return NULL;
+
+ while (node) {
+ prev = rcu_dereference(node->ptrs[level].prev);
+ if (!prev) {
+ skiplist_wait_pending_insert(node);
+ prev = rcu_dereference(node->ptrs[level].prev);
+ }
+ node = prev;
+ /*
+ * the head is never dead, so we'll never walk past
+ * it down in this loop
+ */
+ if (!sl_node_dead(prev))
+ break;
+ }
+
+ return node;
+}
+
+/*
+ * walks forward to find a live next pointer. This does
+ * not wait on pending insertions because it would deadlock
+ * the callers
+ */
+static struct sl_node *find_live_next(struct sl_list *list,
+ struct sl_node *node, int level)
+{
+ while (1) {
+ node = rcu_dereference(node->ptrs[level].next);
+ if (!node)
+ return NULL;
+ if (!sl_node_dead(node))
+ return node;
+ }
+}
+
+/*
+ * return the highest value for a given leaf. This is cached
+ * in leaf->max so that we don't have to wander into
+ * the slot pointers. The max is equal to the key + size of the
+ * last slot.
+ */
+static unsigned long sl_max_key(struct sl_leaf *leaf)
+{
+ smp_rmb();
+ return leaf->max;
+}
+
+/*
+ * return the lowest key for a given leaf. This comes out
+ * of the node key array and not the slots
+ */
+static unsigned long sl_min_key(struct sl_leaf *leaf)
+{
+ smp_rmb();
+ return leaf->keys[0];
+}
+
+void sl_lock_node(struct sl_node *n)
+{
+ spin_lock(&n->lock);
+}
+
+void sl_unlock_node(struct sl_node *n)
+{
+ if (n)
+ spin_unlock(&n->lock);
+}
+
+void skiplist_lock_leaf(struct sl_leaf *leaf)
+{
+ sl_lock_node(&leaf->node);
+}
+EXPORT_SYMBOL(skiplist_lock_leaf);
+
+void skiplist_unlock_leaf(struct sl_leaf *leaf)
+{
+ if (leaf)
+ sl_unlock_node(&leaf->node);
+}
+EXPORT_SYMBOL(skiplist_unlock_leaf);
+
+void assert_locked_node(struct sl_node *n)
+{
+ assert_spin_locked(&n->lock);
+}
+
+/*
+ * helper function to link a single level during an insert.
+ * prev must be locked, and it is the node we are linking after.
+ *
+ * This will find a live next pointer, lock it, and link it
+ * with our new node
+ */
+static void sl_link_one_level(struct sl_list *list,
+ struct sl_node *prev,
+ struct sl_node *node, int level)
+{
+ struct sl_node *next;
+ struct sl_node *test;
+
+ assert_locked_node(prev);
+ BUG_ON(sl_node_dead(prev));
+
+again:
+ next = find_live_next(list, prev, level);
+ if (next) {
+ /*
+ * next may be pending insertion at this point
+ * but it was linked far enough up for prev to
+ * point to it. So we can safely just use it
+ */
+ sl_lock_node(next);
+ test = find_live_next(list, prev, level);
+ if (test != next || sl_node_dead(next)) {
+ sl_unlock_node(next);
+ goto again;
+ }
+ /*
+ * make sure our next and prev really point to each
+ * other now that we have next locked.
+ */
+ if (find_live_prev(list, next, level) != prev) {
+ sl_unlock_node(next);
+ goto again;
+ }
+ }
+
+ rcu_assign_pointer(node->ptrs[level].next, next);
+ rcu_assign_pointer(node->ptrs[level].prev, prev);
+ rcu_assign_pointer(prev->ptrs[level].next, node);
+
+ /*
+ * if next is null, we're the last node on this level.
+ * The head->prev pointer is used to cache this fact
+ */
+ if (next)
+ rcu_assign_pointer(next->ptrs[level].prev, node);
+ else
+ rcu_assign_pointer(list->head->ptrs[level].prev, node);
+
+ sl_unlock_node(next);
+}
+
+/*
+ * for everything above level 0 in a new leaf, we link from the bottom
+ * up. This walks the prev pointers in the new leaf to find higher
+ * leaves to link with. The basic idea is to go down one level
+ * and walk backwards until you find a leaf of the right level.
+ *
+ * It looks like much more work than walking down from the top, but
+ * most of the leaves are at the lower levels. Walking from the top
+ * is sure to waste time going past leaves we never use.
+ */
+static noinline struct sl_node *find_higher_prev(struct sl_list *list,
+ struct sl_leaf *ins, int level)
+{
+ struct sl_node *cur;
+ struct sl_node *prev;
+ struct sl_leaf *leaf;
+ int search_level = level - 1;
+
+ BUG_ON(level == 0);
+
+ /*
+ * step one, walk backward on the lower level until
+ * we find a leaf of the proper height.
+ */
+ cur = &ins->node;
+ while (1) {
+ prev = find_live_prev(list, cur, search_level);
+ skiplist_wait_pending_insert(prev);
+ if (prev->level >= level)
+ break;
+ cur = prev;
+ }
+
+ /*
+ * now we have a node at the right level, but while we
+ * walked someone might have inserted leaves between
+ * prev and the insertion point. Walk forward to make
+ * sure we have best leaf for linking.
+ */
+ while (1) {
+ leaf = sl_next_leaf(list, prev, level);
+ if (!leaf || sl_max_key(ins) <= sl_min_key(leaf))
+ return prev;
+
+ prev = &leaf->node;
+ skiplist_wait_pending_insert(prev);
+ }
+}
+
+/*
+ * this does all of the hard work to finish linking a pending leaf.
+ * 'level' tells us how high the caller has already linked, and the
+ * caller must have at least linked level 0.
+ *
+ * leaf must be unlocked, nothing is locked when we return.
+ */
+static noinline void sl_link_pending_leaf(struct sl_list *list,
+ struct sl_leaf *leaf, int level)
+{
+ struct sl_node *prev;
+ struct sl_node *search_prev;
+ struct sl_node *next;
+ struct sl_leaf *next_leaf;
+ struct sl_node *node = &leaf->node;
+ int last_level = node->level;
+
+ /* did the caller do enough already? */
+ if (level > node->level)
+ return;
+
+ while (level <= last_level) {
+ if (node->ptrs[level].prev)
+ prev = find_live_prev(list, node, level);
+ else
+ prev = find_higher_prev(list, leaf, level);
+
+ skiplist_wait_pending_insert(prev);
+ sl_lock_node(prev);
+
+ /* don't link with the dead */
+ if (sl_node_dead(prev)) {
+ sl_unlock_node(prev);
+ continue;
+ }
+
+ /* lock ourselves */
+ sl_lock_node(node);
+
+ /*
+ * if prev and next are already set for this level,
+ * we're done.
+ */
+ if (node->ptrs[level].prev) {
+ if (node->ptrs[level].next) {
+ level++;
+ goto unlock_node;
+ }
+ /*
+ * if our node already has a prev pointer
+ * make sure the prev we found does point to us
+ */
+ if (find_live_next(list, prev, level) != node)
+ goto unlock_node;
+ }
+again:
+ /*
+ * if our node has a next pointer, use it. Otherwise
+ * use the next from prev
+ */
+ if (node->ptrs[level].next)
+ next = find_live_next(list, node, level);
+ else
+ next = find_live_next(list, prev, level);
+
+ /*
+ * if we followed prev, someone might have raced in
+ * and linked this level of our node. Make sure we
+ * don't deadlock by trying to lock ourselves twice.
+ */
+ if (next == node)
+ goto again;
+
+ if (next) {
+ /*
+ * someone may have raced in and inserted a
+ * node between prev and node. Since we
+ * have node locked, we have to make sure not
+ * to break locking rules if next is actually
+ * before node in the list.
+ */
+ next_leaf = sl_entry(next);
+ if (sl_min_key(next_leaf) < sl_max_key(leaf))
+ goto unlock_node;
+
+ /*
+ * we can't wait on next's pending insert here;
+ * next may be waiting for us to clear our own
+ * pending flag, and we'd just deadlock.
+ */
+ sl_lock_node(next);
+
+ /*
+ * make sure we've still got the right spot to
+ * link things in
+ */
+ if (sl_node_dead(next))
+ goto unlock_all;
+
+ if (sl_min_key(next_leaf) < sl_max_key(leaf))
+ goto unlock_all;
+
+ /*
+ * finally make sure next really points back to
+ * either prev or our node
+ */
+ search_prev = find_live_prev(list, next, level);
+ if (search_prev != node && search_prev != prev)
+ goto unlock_all;
+ }
+ rcu_assign_pointer(node->ptrs[level].next, next);
+ rcu_assign_pointer(node->ptrs[level].prev, prev);
+ rcu_assign_pointer(prev->ptrs[level].next, node);
+
+ /*
+ * if next is null, we're the last node on this level.
+ * The head->prev pointer is used to cache this fact
+ */
+ if (next)
+ rcu_assign_pointer(next->ptrs[level].prev, node);
+ else
+ rcu_assign_pointer(list->head->ptrs[level].prev, node);
+
+ level++;
+unlock_all:
+ sl_unlock_node(next);
+unlock_node:
+ sl_unlock_node(node);
+ sl_unlock_node(prev);
+ }
+}
+
+/*
+ * when we split a leaf to do an insert, this links our new leaf with the
+ * one we split. We'll use any pointers we can from 'after'
+ */
+static void __sl_link_after_node(struct sl_list *list, struct sl_node *node,
+ struct sl_node *after, int level)
+{
+ int i;
+
+ /* first use all the pointers from 'after' */
+ for (i = 0; i <= after->level && i <= level; i++)
+ sl_link_one_level(list, after, node, i);
+}
+
+/*
+ * final stage of linking, this is where we actually clear pending
+ */
+static void __link_pending_insert(struct sl_list *list, struct sl_node *node, int start_level)
+{
+ sl_link_pending_leaf(list, sl_entry(node), start_level);
+ smp_wmb();
+ node->pending = 0;
+}
+
+static void sl_link_after_node(struct sl_list *list, struct sl_node *node,
+ struct sl_node *after, int level)
+{
+ __sl_link_after_node(list, node, after, level);
+
+ sl_unlock_node(node);
+ sl_unlock_node(after);
+
+ __link_pending_insert(list, node, after->level + 1);
+}
+
+/*
+ * returns the first leaf in the list, locked with an
+ * extra reference added
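+ *
+ * A rough, hypothetical walk over every leaf (names are illustrative
+ * only, not part of this patch):
+ *
+ *   leaf = skiplist_first_leaf(list);
+ *   while (leaf) {
+ *       struct sl_leaf *next;
+ *
+ *       ... use leaf->keys[] and leaf->ptrs[] under the lock ...
+ *
+ *       next = skiplist_next_leaf(list, leaf);
+ *       skiplist_unlock_leaf(leaf);
+ *       skiplist_put_leaf(leaf);
+ *       leaf = next;
+ *   }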
+ */
+struct sl_leaf *skiplist_first_leaf(struct sl_list *list)
+{
+ struct sl_leaf *leaf = NULL;
+ struct sl_node *p;
+
+ rcu_read_lock();
+ while (1) {
+ p = find_live_next(list, list->head, 0);
+ if (!p)
+ break;
+
+ sl_lock_node(p);
+ if (!sl_node_dead(p) &&
+ find_live_next(list, list->head, 0) == p) {
+ leaf = sl_entry(p);
+ skiplist_get_leaf(leaf);
+ goto out;
+ }
+ sl_unlock_node(p);
+ }
+out:
+ rcu_read_unlock();
+ return leaf;
+}
+EXPORT_SYMBOL(skiplist_first_leaf);
+
+/*
+ * returns the first leaf in the list. No locks are held and
+ * no references are added. Must be called under rcu_read_lock()
+ */
+struct sl_leaf *skiplist_first_leaf_rcu(struct sl_list *list)
+{
+ struct sl_node *p;
+
+ p = find_live_next(list, list->head, 0);
+ if (p)
+ return sl_entry(p);
+ return NULL;
+
+}
+EXPORT_SYMBOL(skiplist_first_leaf_rcu);
+
+/*
+ * sequential search for lockless rcu. The insert/deletion routines
+ * order their operations to make this rcu safe.
+ *
+ * If we find the key, we return zero and set 'slot' to the location.
+ *
+ * If we don't find anything, we return 1 and set 'slot' to the location
+ * where the insertion should take place
+ *
+ * case1:
+ *              [ key ... size ]
+ *   [found .. found size ]
+ *
+ * case2:
+ *   [key ... size ]
+ *              [found .. found size ]
+ *
+ * case3:
+ *   [key ... size ]
+ *   [ found .. found size ]
+ *
+ * case4:
+ *           [key ...size ]
+ *   [ found ... found size ]
+ *
+ * case5:
+ *   [key ...size ]
+ *        [ found ... found size ]
+ */
+int skiplist_search_leaf(struct sl_leaf *leaf, unsigned long key,
+ unsigned long size, int *slot)
+{
+ int i;
+ int cur;
+ int last;
+ unsigned long this_key;
+ struct sl_slot *found;
+
+again:
+ cur = 0;
+ last = leaf->nr;
+
+ /* find the first slot greater than our key */
+ for (i = 0; i < last; i++) {
+ smp_rmb();
+ this_key = leaf->keys[i];
+ if (this_key >= key + size)
+ break;
+ cur = i;
+ }
+ if (leaf->keys[cur] < key + size) {
+ /*
+ * if we're in the middle of an insert, pointer may
+ * be null. This little loop will wait for the insertion
+ * to finish.
+ */
+ while (1) {
+ found = rcu_dereference(leaf->ptrs[cur]);
+ if (found)
+ break;
+ cpu_relax();
+ }
+
+ /* insert is juggling our slots, try again */
+ if (found->key != leaf->keys[cur])
+ goto again;
+
+ /* case1, case2, case5 */
+ if (found->key < key + size && found->key + found->size > key) {
+ *slot = cur;
+ return 0;
+ }
+
+ /* case3, case4 */
+ if (found->key < key + size && found->key >= key) {
+ *slot = cur;
+ return 0;
+ }
+
+ *slot = cur + 1;
+ return 1;
+ }
+ *slot = cur;
+ return 1;
+}
+EXPORT_SYMBOL(skiplist_search_leaf);
+
+/*
+ * helper to put a cached leaf and zero out the
+ * pointer
+ */
+static void invalidate_cache(struct sl_leaf **cache)
+{
+ if (cache) {
+ skiplist_put_leaf(*cache);
+ *cache = NULL;
+ }
+}
+
+/*
+ * helper to grab a reference on a cached leaf.
+ */
+static void cache_leaf(struct sl_leaf *leaf, struct sl_leaf **cache)
+{
+ assert_locked_node(&leaf->node);
+ BUG_ON(sl_node_dead(&leaf->node));
+ if (cache && *cache != leaf) {
+ if (*cache)
+ skiplist_put_leaf(*cache);
+ skiplist_get_leaf(leaf);
+ *cache = leaf;
+ }
+}
+
+/*
+ * this does the dirty work of splitting and/or shifting a leaf
+ * to get a new slot inside. The leaf must be locked. 'slot' tells
+ * us where in the leaf the new key should go, and the per-cpu
+ * preload (indexed by preload_token) supplies any new leaf we have
+ * to create and link.
+ */
+static noinline int add_key_to_leaf(struct sl_list *list, struct sl_leaf *leaf,
+ struct sl_slot *slot_ptr, unsigned long key,
+ int slot, int preload_token, struct sl_leaf **cache)
+{
+ struct sl_leaf *split;
+ struct skip_preload *skp;
+ int level;
+ int finish_split_insert = 0;
+
+ /* no splitting required, just shift our way in */
+ if (leaf->nr < SKIP_KEYS_PER_NODE)
+ goto insert;
+
+ skp = &__get_cpu_var(skip_preloads);
+ split = skp->preload[preload_token];
+
+ /*
+ * we need to insert a new leaf, but we try not to insert a new leaf
+ * at the same height as the previous one; that just wastes high
+ * level searching. If the new node is the same level or lower than the
+ * existing one, try to use a level 0 leaf instead.
+ */
+ if (leaf->node.level > 0 && split->node.level <= leaf->node.level) {
+ if (skp->preload[0]) {
+ preload_token = 0;
+ split = skp->preload[0];
+ }
+ }
+ skp->preload[preload_token] = NULL;
+
+ level = split->node.level;
+
+ /*
+ * bump our list->level to whatever we've found. Nobody allocating
+ * a new node is going to set it higher than list->level + 1
+ */
+ if (level > list->level)
+ list->level = level;
+
+ /*
+ * this locking is really only required for the small window where
+ * we are linking the node and someone might be deleting one of the
+ * nodes we are linking with. The leaf passed in was already
+ * locked.
+ */
+ split->node.pending = SKIPLIST_PENDING_INSERT;
+ sl_lock_node(&split->node);
+
+ if (slot == leaf->nr) {
+ /*
+ * our new slot just goes at the front of the new leaf, don't
+ * bother shifting things in from the previous leaf.
+ */
+ slot = 0;
+ split->nr = 1;
+ split->max = key + slot_ptr->size;
+ split->keys[0] = key;
+ split->ptrs[0] = slot_ptr;
+ smp_wmb();
+ cache_leaf(split, cache);
+ sl_link_after_node(list, &split->node, &leaf->node, level);
+ return 0;
+ } else {
+ int nr = SKIP_KEYS_PER_NODE / 2;
+ int mid = SKIP_KEYS_PER_NODE - nr;
+ int src_i = mid;
+ int dst_i = 0;
+ int orig_nr = leaf->nr;
+
+ /* split the previous leaf in half and copy items over */
+ split->nr = nr;
+ split->max = leaf->max;
+
+ while (src_i < slot) {
+ split->keys[dst_i] = leaf->keys[src_i];
+ split->ptrs[dst_i++] = leaf->ptrs[src_i++];
+ }
+
+ if (slot >= mid) {
+ split->keys[dst_i] = key;
+ split->ptrs[dst_i++] = slot_ptr;
+ split->nr++;
+ }
+
+ while (src_i < orig_nr) {
+ split->keys[dst_i] = leaf->keys[src_i];
+ split->ptrs[dst_i++] = leaf->ptrs[src_i++];
+ }
+
+ /*
+ * this completes the initial link, but we still
+ * have the original node and split locked
+ */
+ __sl_link_after_node(list, &split->node, &leaf->node, level);
+
+ nr = SKIP_KEYS_PER_NODE - nr;
+
+ /*
+ * now that we have all the items copied and our new
+ * leaf inserted, update the nr in this leaf. Anyone
+ * searching in rcu-land will see fully updated leaves.
+ */
+ leaf->max = leaf->keys[nr - 1] + leaf->ptrs[nr - 1]->size;
+ smp_wmb();
+ leaf->nr = nr;
+
+ /*
+ * if the new slot landed in the split leaf, we're done.
+ * Otherwise we fall through to the code below that
+ * shifts the items and inserts the new key.
+ */
+ if (slot >= mid) {
+ cache_leaf(split, cache);
+ sl_unlock_node(&split->node);
+ sl_unlock_node(&leaf->node);
+
+ /* finish linking split all the way to the top */
+ __link_pending_insert(list, &split->node,
+ leaf->node.level + 1);
+ return 0;
+ }
+
+ /*
+ * unlock our split node and let the code below finish
+ * the key insertion. We'll finish inserting split
+ * after we unlock leaf
+ */
+ sl_unlock_node(&split->node);
+ finish_split_insert = 1;
+ }
+insert:
+ if (slot < leaf->nr) {
+ int i;
+
+ /*
+ * put something sane into the new last slot so rcu
+ * searchers won't get confused
+ */
+ leaf->keys[leaf->nr] = 0;
+ leaf->ptrs[leaf->nr] = NULL;
+ smp_wmb();
+
+ /* then bump the nr */
+ leaf->nr++;
+
+ /*
+ * now step through each pointer after our
+ * destination and bubble it forward. memcpy
+ * would be faster but rcu searchers will be
+ * able to validate pointers as they go with
+ * this method.
+ */
+ for (i = leaf->nr - 1; i > slot; i--) {
+ leaf->keys[i] = leaf->keys[i - 1];
+ leaf->ptrs[i] = leaf->ptrs[i - 1];
+ /*
+ * make sure the key/pointer pair is
+ * fully visible in the new home before
+ * we move forward
+ */
+ smp_wmb();
+ }
+
+ /* finally stuff in our key */
+ leaf->keys[slot] = key;
+ leaf->ptrs[slot] = slot_ptr;
+ smp_wmb();
+ } else {
+ /*
+ * just extending the leaf, toss
+ * our key in and update things
+ */
+ leaf->max = key + slot_ptr->size;
+ leaf->keys[slot] = key;
+ leaf->ptrs[slot] = slot_ptr;
+
+ smp_wmb();
+ leaf->nr++;
+ }
+ cache_leaf(leaf, cache);
+ sl_unlock_node(&leaf->node);
+ if (finish_split_insert)
+ __link_pending_insert(list, &split->node, leaf->node.level + 1);
+ return 0;
+}
+
+/*
+ * helper function for insert. This will either return an existing
+ * key or insert a new slot into the list. leaf must be locked.
+ */
+static noinline int find_or_add_key(struct sl_list *list, unsigned long key,
+ unsigned long size, struct sl_leaf *leaf,
+ struct sl_slot *slot_ptr,
+ int preload_token, struct sl_leaf **cache)
+{
+ int ret;
+ int slot;
+
+ if (key < leaf->max) {
+ ret = skiplist_search_leaf(leaf, key, size, &slot);
+ if (ret == 0) {
+ ret = -EEXIST;
+ cache_leaf(leaf, cache);
+ sl_unlock_node(&leaf->node);
+ goto out;
+ }
+ } else {
+ slot = leaf->nr;
+ }
+
+ atomic_inc(&slot_ptr->refs);
+
+ /* add_key_to_leaf unlocks the node */
+ add_key_to_leaf(list, leaf, slot_ptr, key, slot, preload_token, cache);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/*
+ * pull a new leaf out of the prealloc area, and insert the slot/key into it
+ */
+static struct sl_leaf *alloc_leaf(struct sl_slot *slot_ptr, unsigned long key,
+ int preload_token)
+{
+ struct sl_leaf *leaf;
+ struct skip_preload *skp;
+ int level;
+
+ skp = &__get_cpu_var(skip_preloads);
+ leaf = skp->preload[preload_token];
+ skp->preload[preload_token] = NULL;
+ level = leaf->node.level;
+
+ leaf->keys[0] = key;
+ leaf->ptrs[0] = slot_ptr;
+ leaf->nr = 1;
+ leaf->max = key + slot_ptr->size;
+ return leaf;
+}
+
+/*
+ * this returns with preempt disabled and the preallocation
+ * area setup for a new insert. To get there, it may or
+ * may not allocate a new leaf for the next insert.
+ *
+ * If allocations are done, this will also try to preallocate a level 0
+ * leaf, which allows insertion to avoid placing two adjacent
+ * leaves at the same level.
+ *
+ * This returns < 0 on errors. If everything works, it returns a preload
+ * token which you should use when fetching your preallocated items.
+ *
+ * The token allows us to preallocate based on the current
+ * highest level of the list. For a list of level N, we won't allocate
+ * higher than N + 1.
+ */
+int skiplist_preload(struct sl_list *list, gfp_t gfp_mask)
+{
+ struct skip_preload *skp;
+ struct sl_leaf *leaf;
+ struct sl_leaf *leaf0 = NULL;
+ int alloc_leaf0 = 0;
+ int level;
+ int max_level = min_t(int, list->level + 1, SKIP_MAXLEVEL - 1);
+ int token = max_level;
+
+ preempt_disable();
+ skp = &__get_cpu_var(skip_preloads);
+ if (max_level && !skp->preload[0])
+ alloc_leaf0 = 1;
+
+ if (skp->preload[max_level])
+ return token;
+
+ preempt_enable();
+ level = skiplist_get_new_level(list, max_level);
+ leaf = kmem_cache_alloc(slab_caches[level], gfp_mask);
+ if (leaf == NULL)
+ return -ENOMEM;
+
+ if (alloc_leaf0)
+ leaf0 = kmem_cache_alloc(slab_caches[0], gfp_mask);
+
+ preempt_disable();
+ skp = &__get_cpu_var(skip_preloads);
+
+ if (leaf0) {
+ sl_init_leaf(leaf0, 0);
+ if (skp->preload[0] == NULL) {
+ skp->preload[0] = leaf0;
+ } else {
+ skiplist_put_leaf(leaf0);
+ }
+ }
+
+ sl_init_leaf(leaf, level);
+ if (skp->preload[max_level]) {
+ skiplist_put_leaf(leaf);
+ return token;
+ }
+ skp->preload[max_level] = leaf;
+
+ return token;
+}
+EXPORT_SYMBOL(skiplist_preload);
+
+/*
+ * use the kernel prandom call to pick a new random level. This
+ * uses P = .50. If you bump the SKIP_MAXLEVEL past 32 levels,
+ * this function needs updating.
+ */
+int skiplist_get_new_level(struct sl_list *list, int max_level)
+{
+ int level = 0;
+ unsigned long randseed;
+
+ randseed = prandom_u32();
+
+ while (randseed && (randseed & 1)) {
+ randseed >>= 1;
+ level++;
+ if (level == max_level)
+ break;
+ }
+ return (level >= SKIP_MAXLEVEL ? SKIP_MAXLEVEL - 1: level);
+}
+EXPORT_SYMBOL(skiplist_get_new_level);
+
+/*
+ * just return the level of the leaf we're going to use
+ * for the next insert
+ */
+static int pending_insert_level(int preload_token)
+{
+ struct skip_preload *skp;
+ skp = &__get_cpu_var(skip_preloads);
+ return skp->preload[preload_token]->node.level;
+}
+
+/*
+ * after a lockless search, this makes sure a given key is still
+ * inside the min/max of a leaf. If not, you have to repeat the
+ * search and try again.
+ */
+static int verify_key_in_leaf(struct sl_leaf *leaf, unsigned long key,
+ unsigned long size)
+{
+ if (sl_node_dead(&leaf->node))
+ return 0;
+
+ if (key + size < sl_min_key(leaf) ||
+ key >= sl_max_key(leaf))
+ return 0;
+ return 1;
+}
+
+/*
+ * The insertion code tries to delay taking locks for as long as possible.
+ * Once we've found a good place to insert, we need to make sure the leaf
+ * we have picked is still a valid location.
+ *
+ * This checks the previous and next pointers to make sure everything is
+ * still correct for the insert. Pass in an unlocked node; on success
+ * this returns 0 with the node still locked. It returns -EAGAIN if
+ * the surrounding pointers changed and the caller should retry, or
+ * -EEXIST if an overlapping key was found.
+ *
+ */
+static int verify_key_in_path(struct sl_list *list,
+ struct sl_node *node, unsigned long key,
+ unsigned long size)
+{
+ struct sl_leaf *prev = NULL;
+ struct sl_leaf *next;
+ struct sl_node *p;
+ struct sl_leaf *leaf = NULL;
+ struct sl_node *lock1;
+ struct sl_node *lock2;
+ struct sl_node *lock3;
+ int level = 0;
+ int ret = -EAGAIN;
+
+again:
+ lock1 = NULL;
+ lock2 = NULL;
+ lock3 = NULL;
+ if (node != list->head) {
+ p = rcu_dereference(node->ptrs[level].prev);
+ skiplist_wait_pending_insert(p);
+ sl_lock_node(p);
+ lock1 = p;
+
+ sl_lock_node(node);
+ lock2 = node;
+
+ if (sl_node_dead(node))
+ goto out;
+
+ if (p->ptrs[level].next != node) {
+ sl_unlock_node(lock1);
+ sl_unlock_node(lock2);
+ goto again;
+ }
+
+ if (sl_node_dead(p))
+ goto out;
+
+
+ if (p != list->head)
+ prev = sl_entry(p);
+
+ /*
+ * once we have the locks, make sure everyone
+ * still points to each other
+ */
+ if (node->ptrs[level].prev != p) {
+ sl_unlock_node(lock1);
+ sl_unlock_node(lock2);
+ goto again;
+ }
+
+ leaf = sl_entry(node);
+ } else {
+ sl_lock_node(node);
+ lock2 = node;
+ }
+
+ /*
+ * rule #1, the key must be greater than the max of the previous
+ * leaf
+ */
+ if (prev && key < sl_max_key(prev))
+ goto out;
+
+ /* we're done with prev, unlock it */
+ sl_unlock_node(lock1);
+ lock1 = NULL;
+
+ /* rule #2 is the key must be smaller than the min key
+ * in the next node. If the key is already smaller than
+ * the max key of this node, we know we're safe for rule #2
+ */
+ if (leaf && key < sl_max_key(leaf))
+ return 0;
+again_next:
+ p = node->ptrs[level].next;
+ if (p)
+ next = sl_entry(p);
+ else
+ next = NULL;
+
+ if (next) {
+ if (key >= sl_min_key(next))
+ goto out;
+
+ if (sl_node_inserting(&next->node)) {
+ sl_unlock_node(lock2);
+ skiplist_wait_pending_insert(&next->node);
+ goto again;
+ }
+ sl_lock_node(&next->node);
+ lock3 = &next->node;
+
+ if (sl_node_dead(&next->node)) {
+ sl_unlock_node(lock3);
+ sl_unlock_node(lock2);
+ goto again;
+ }
+
+ if (next->node.ptrs[level].prev != node) {
+ sl_unlock_node(lock3);
+ goto again_next;
+ }
+
+ /*
+ * rule #2 the key must be smaller than the min key
+ * in the next node
+ */
+ if (key >= sl_min_key(next))
+ goto out;
+
+ if (key + size > sl_min_key(next)) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ /*
+ * return with our leaf locked and sure that our leaf is the
+ * best place for this key
+ */
+ sl_unlock_node(lock1);
+ sl_unlock_node(lock3);
+ return 0;
+
+out:
+ sl_unlock_node(lock1);
+ sl_unlock_node(lock2);
+ sl_unlock_node(lock3);
+ return ret;
+}
+
+
+/*
+ * Before calling this you must have stocked the preload area by
+ * calling skiplist_preload, and you must have kept preemption
+ * off. preload_token comes from skiplist_preload, pass in
+ * exactly what preload gave you.
+ *
+ * 'slot' will be inserted into the skiplist, returning 0
+ * on success or < 0 on failure
+ *
+ * If 'cache' isn't NULL, we try to insert into the cached leaf first.
+ * When we return, *cache holds a reference to the leaf where the
+ * insertion was done.
+ *
+ * More details in the comments below.
+ */
+int skiplist_insert(struct sl_list *list, struct sl_slot *slot,
+ int preload_token, struct sl_leaf **cache)
+{
+ struct sl_node *p;
+ struct sl_node *ins_locked = NULL;
+ struct sl_leaf *leaf;
+ unsigned long key = slot->key;
+ unsigned long size = slot->size;
+ unsigned long min_key;
+ unsigned long max_key;
+ int level;
+ int ret;
+ int err;
+
+ rcu_read_lock();
+ ret = -EEXIST;
+
+ /* try our cache first */
+ if (cache && *cache) {
+ leaf = *cache;
+ skiplist_wait_pending_insert(&leaf->node);
+ err = verify_key_in_path(list, &leaf->node, key, size);
+ if (err == -EEXIST) {
+ ret = -EEXIST;
+ goto fail;
+ } else if (err == 0) {
+ ins_locked = &leaf->node;
+ level = 0;
+ goto find_or_add;
+ } else {
+ invalidate_cache(cache);
+ }
+ }
+
+again:
+ p = list->head;
+ level = list->level;
+
+ do {
+ while (1) {
+ leaf = sl_next_leaf(list, p, level);
+ if (!leaf) {
+ skiplist_wait_pending_insert(p);
+ /*
+ * if we're at level 0 and p points to
+ * the head, the list is just empty. If
+ * we're not at level 0 yet, keep walking
+ * down.
+ */
+ if (p == list->head || level != 0)
+ break;
+
+ err = verify_key_in_path(list, p, key, size);
+ if (err == -EEXIST) {
+ ret = -EEXIST;
+ goto fail;
+ } else if (err) {
+ goto again;
+ }
+
+ leaf = sl_next_leaf(list, p, level);
+ if (leaf) {
+ sl_unlock_node(p);
+ goto again;
+ }
+
+ /*
+ * p was the last leaf on the bottom level,
+ * We're here because 'key' was bigger than the
+ * max key in p. find_or_add will append into
+ * the last leaf.
+ */
+ ins_locked = p;
+ goto find_or_add;
+ }
+
+ max_key = sl_max_key(leaf);
+
+ /*
+ * strictly speaking this test is covered again below.
+ * But usually we have to walk forward through the
+ * pointers, so this is the most common condition. Try
+ * it first.
+ */
+ if (key >= max_key)
+ goto next;
+
+ min_key = sl_min_key(leaf);
+
+ if (key < min_key) {
+ /*
+ * our key is smaller than the smallest key in
+ * leaf. If we're not in level 0 yet, we don't
+ * want to cross over into the leaf
+ */
+ if (level != 0)
+ break;
+
+ p = &leaf->node;
+ skiplist_wait_pending_insert(p);
+
+ err = verify_key_in_path(list, p, key, size);
+ if (err == -EEXIST) {
+ ret = -EEXIST;
+ goto fail;
+ } else if (err) {
+ goto again;
+ }
+
+ if (key >= sl_min_key(leaf)) {
+ sl_unlock_node(p);
+ goto again;
+ }
+
+ /*
+ * we are in level 0, prepend our key
+ * into this leaf
+ */
+ ins_locked = p;
+ goto find_or_add;
+ }
+
+ if (key < sl_max_key(leaf)) {
+ /*
+ * our key is smaller than the max
+ * and bigger than the min, this is
+ * the one true leaf for our key no
+ * matter what level we're in
+ */
+ p = &leaf->node;
+ skiplist_wait_pending_insert(p);
+ sl_lock_node(p);
+
+ if (sl_node_dead(p) ||
+ key >= sl_max_key(leaf) ||
+ key < sl_min_key(leaf)) {
+ sl_unlock_node(p);
+ goto again;
+ }
+ ins_locked = p;
+ goto find_or_add;
+ }
+next:
+ p = &leaf->node;
+ }
+
+ level--;
+ } while (level >= 0);
+
+ /*
+ * we only get here if the list is completely empty. FIXME
+ * this can be folded into the find_or_add code below
+ */
+
+ sl_lock_node(list->head);
+ if (list->head->ptrs[0].next != NULL) {
+ sl_unlock_node(list->head);
+ goto again;
+ }
+ atomic_inc(&slot->refs);
+ leaf = alloc_leaf(slot, key, preload_token);
+ level = leaf->node.level;
+ leaf->node.pending = SKIPLIST_PENDING_INSERT;
+ sl_lock_node(&leaf->node);
+
+ if (level > list->level)
+ list->level++;
+
+ cache_leaf(leaf, cache);
+
+ /* unlocks the leaf and the list head */
+ sl_link_after_node(list, &leaf->node, list->head, level);
+ ret = 0;
+ rcu_read_unlock();
+
+ return ret;
+
+find_or_add:
+
+ leaf = sl_entry(ins_locked);
+
+ /* ins_locked is unlocked */
+ ret = find_or_add_key(list, key, size, leaf, slot,
+ preload_token, cache);
+fail:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(skiplist_insert);
+
+/*
+ * lookup has two stages. First we find the leaf that should have
+ * our key, and then we go through all the slots in that leaf and
+ * look for the key. This helper function is just the first stage
+ * and it must be called under rcu_read_lock(). You may be using the
+ * non-rcu final lookup variant, but this part must be rcu.
+ *
+ * We'll return NULL if we find nothing or the candidate leaf
+ * for you to search.
+ *
+ * last is set to one leaf before the leaf we checked. skiplist_insert_hole
+ * uses this to search for ranges.
+ */
+struct sl_leaf *__skiplist_lookup_leaf(struct sl_list *list,
+ struct sl_node **last,
+ unsigned long key,
+ unsigned long size)
+{
+ struct sl_node *p;
+ struct sl_leaf *leaf;
+ int level;
+ struct sl_leaf *leaf_ret = NULL;
+ unsigned long max_key = 0;
+ unsigned long min_key = 0;
+
+ level = list->level;
+ p = list->head;
+ do {
+ while (1) {
+ leaf = sl_next_leaf(list, p, level);
+ if (!leaf) {
+ if (sl_node_inserting(p)) {
+ skiplist_wait_pending_insert(p);
+ continue;
+ }
+ break;
+ }
+ max_key = sl_max_key(leaf);
+
+ if (key >= max_key)
+ goto next;
+
+ min_key = sl_min_key(leaf);
+
+ if (key < min_key)
+ break;
+
+ if (key < max_key) {
+ leaf_ret = leaf;
+ goto done;
+ }
+next:
+ p = &leaf->node;
+ }
+ level--;
+ } while (level >= 0);
+
+done:
+ if (last)
+ *last = p;
+ return leaf_ret;
+}
+
+/*
+ * return the first leaf containing this key/size. NULL
+ * is returned if we find nothing. Must be called under
+ * rcu_read_lock.
+ */
+struct sl_leaf *skiplist_lookup_leaf_rcu(struct sl_list *list,
+ unsigned long key,
+ unsigned long size)
+{
+ return __skiplist_lookup_leaf(list, NULL, key, size);
+}
+EXPORT_SYMBOL(skiplist_lookup_leaf_rcu);
+
+/*
+ * returns a locked leaf that contains key/size, or NULL
+ */
+struct sl_leaf *skiplist_lookup_leaf(struct sl_list *list,
+ unsigned long key,
+ unsigned long size)
+
+{
+ struct sl_leaf *leaf;
+ rcu_read_lock();
+again:
+ leaf = __skiplist_lookup_leaf(list, NULL, key, size);
+ if (leaf) {
+ sl_lock_node(&leaf->node);
+ if (!verify_key_in_leaf(leaf, key, size)) {
+ sl_unlock_node(&leaf->node);
+ goto again;
+ }
+ }
+ rcu_read_unlock();
+ return leaf;
+}
+EXPORT_SYMBOL(skiplist_lookup_leaf);
+
+/*
+ * returns a locked leaf that might contain key/size. An
+ * extra reference is taken on the leaf we return
+ */
+struct sl_leaf *skiplist_lookup_first_leaf(struct sl_list *list,
+ unsigned long key,
+ unsigned long size)
+
+{
+ struct sl_leaf *leaf;
+ struct sl_node *last = NULL;
+ rcu_read_lock();
+again:
+ leaf = __skiplist_lookup_leaf(list, &last, key, size);
+ if (leaf) {
+ smp_rmb();
+ skiplist_wait_pending_insert(&leaf->node);
+ sl_lock_node(&leaf->node);
+ if (!verify_key_in_leaf(leaf, key, size)) {
+ sl_unlock_node(&leaf->node);
+ goto again;
+ }
+ skiplist_get_leaf(leaf);
+ } else if (last && last != list->head) {
+ smp_rmb();
+ if (sl_node_dead(last))
+ goto again;
+
+ skiplist_wait_pending_insert(last);
+ sl_lock_node(last);
+
+ if (sl_node_dead(last)) {
+ sl_unlock_node(last);
+ goto again;
+ }
+ leaf = sl_entry(last);
+ skiplist_get_leaf(leaf);
+ }
+ rcu_read_unlock();
+ return leaf;
+}
+EXPORT_SYMBOL(skiplist_lookup_first_leaf);
+
+/*
+ * rcu_read_lock must be held. This returns a leaf that may
+ * contain key/size
+ */
+struct sl_leaf *skiplist_lookup_first_leaf_rcu(struct sl_list *list,
+ unsigned long key,
+ unsigned long size)
+
+{
+ struct sl_leaf *leaf;
+ struct sl_node *last;
+
+again:
+ last = NULL;
+ leaf = __skiplist_lookup_leaf(list, &last, key, size);
+ if (leaf) {
+ return leaf;
+ } else if (last && last != list->head) {
+ if (sl_node_dead(last))
+ goto again;
+ return sl_entry(last);
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(skiplist_lookup_first_leaf_rcu);
+
+/*
+ * given a locked leaf, this returns the next leaf in the
+ * skiplist (locked)
+ */
+struct sl_leaf *skiplist_next_leaf(struct sl_list *list,
+ struct sl_leaf *leaf)
+{
+ struct sl_leaf *next_leaf = NULL;
+ struct sl_node *node;
+
+ rcu_read_lock();
+ while (1) {
+ node = find_live_next(list, &leaf->node, 0);
+ if (!node)
+ break;
+
+ next_leaf = sl_entry(node);
+
+ sl_lock_node(node);
+ if (!sl_node_dead(node) &&
+ find_live_prev(list, node, 0) == &leaf->node) {
+ skiplist_get_leaf(next_leaf);
+ break;
+ }
+
+ sl_unlock_node(node);
+ }
+ rcu_read_unlock();
+
+ return next_leaf;
+}
+EXPORT_SYMBOL(skiplist_next_leaf);
+
+struct sl_leaf *skiplist_next_leaf_rcu(struct sl_list *list,
+ struct sl_leaf *leaf)
+{
+ struct sl_node *next;
+
+ next = find_live_next(list, &leaf->node, 0);
+ if (next)
+ return sl_entry(next);
+ return NULL;
+}
+EXPORT_SYMBOL(skiplist_next_leaf_rcu);
+
+/*
+ * this lookup function expects RCU to protect the slots in the leaves
+ * as well as the skiplist indexing structures
+ *
+ * Note, you must call this with rcu_read_lock held, and you must verify
+ * the result yourself. If the key field of the returned slot doesn't
+ * match your key, repeat the lookup. Reference counting etc is also
+ * all your responsibility.
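+ *
+ * A minimal, hypothetical caller sketch:
+ *
+ *   rcu_read_lock();
+ *   slot = skiplist_lookup_rcu(list, key, size);
+ *   if (slot && slot->key != key)
+ *       ... stale result, repeat the lookup ...
+ *   rcu_read_unlock();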
+ */
+struct sl_slot *skiplist_lookup_rcu(struct sl_list *list, unsigned long key,
+ unsigned long size)
+{
+ struct sl_leaf *leaf;
+ struct sl_slot *slot_ret = NULL;
+ int slot;
+ int ret;
+
+again:
+ leaf = __skiplist_lookup_leaf(list, NULL, key, size);
+ if (leaf) {
+ ret = skiplist_search_leaf(leaf, key, size, &slot);
+ if (ret == 0)
+ slot_ret = rcu_dereference(leaf->ptrs[slot]);
+ else if (!verify_key_in_leaf(leaf, key, size))
+ goto again;
+ }
+ return slot_ret;
+
+}
+EXPORT_SYMBOL(skiplist_lookup_rcu);
+
+/*
+ * this lookup function only uses RCU to protect the skiplist indexing
+ * structs. The actual slots are protected by full locks.
+ */
+struct sl_slot *skiplist_lookup(struct sl_list *list, unsigned long key,
+ unsigned long size)
+{
+ struct sl_leaf *leaf;
+ struct sl_slot *slot_ret = NULL;
+ struct sl_node *p;
+ int slot;
+ int ret;
+
+again:
+ rcu_read_lock();
+ leaf = __skiplist_lookup_leaf(list, &p, key, size);
+ if (leaf) {
+ sl_lock_node(&leaf->node);
+ if (!verify_key_in_leaf(leaf, key, size)) {
+ sl_unlock_node(&leaf->node);
+ rcu_read_unlock();
+ goto again;
+ }
+ ret = skiplist_search_leaf(leaf, key, size, &slot);
+ if (ret == 0) {
+ slot_ret = leaf->ptrs[slot];
+ if (atomic_inc_not_zero(&slot_ret->refs) == 0)
+ slot_ret = NULL;
+ }
+ sl_unlock_node(&leaf->node);
+ }
+ rcu_read_unlock();
+ return slot_ret;
+
+}
+EXPORT_SYMBOL(skiplist_lookup);
+
+/* helper for skiplist_insert_hole. the iommu requires alignment */
+static unsigned long align_start(unsigned long val, unsigned long align)
+{
+ return (val + align - 1) & ~(align - 1);
+}
+
+/*
+ * this is pretty ugly, but it is used to find a free spot in the
+ * tree for a new iommu allocation. We start from a given
+ * hint and try to find an aligned range of a given size.
+ *
+ * Send the slot pointer, and we'll update it with the location
+ * we found.
+ *
+ * This will return -EAGAIN if we found a good spot but someone
+ * raced in and allocated it before we could. This gives the
+ * caller the chance to update their hint.
+ *
+ * This will return -EEXIST if we couldn't find anything at all
+ *
+ * returns 0 if all went well, or some other negative error
+ * if things went badly.
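+ *
+ * A rough caller sketch (hypothetical, not from this patch):
+ *
+ *   do {
+ *       ret = skiplist_insert_hole(&list, hint, limit, size,
+ *                                  align, &slot, GFP_ATOMIC);
+ *       ... on -EAGAIN, update 'hint' and retry ...
+ *   } while (ret == -EAGAIN);
+ *
+ * On success, slot.key and slot.size describe the range that was
+ * reserved.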
+ */
+int skiplist_insert_hole(struct sl_list *list, unsigned long hint,
+ unsigned long limit,
+ unsigned long size, unsigned long align,
+ struct sl_slot *slot,
+ gfp_t gfp_mask)
+{
+ unsigned long last_end = 0;
+ struct sl_node *p;
+ struct sl_leaf *leaf;
+ int i;
+ int ret = -EEXIST;
+ int preload_token;
+ int pending_level;
+
+ preload_token = skiplist_preload(list, gfp_mask);
+ if (preload_token < 0) {
+ return preload_token;
+ }
+ pending_level = pending_insert_level(preload_token);
+
+ /* step one, lets find our hint */
+ rcu_read_lock();
+again:
+
+ last_end = max(last_end, hint);
+ last_end = align_start(last_end, align);
+ slot->key = align_start(hint, align);
+ slot->size = size;
+ leaf = __skiplist_lookup_leaf(list, &p, hint, 1);
+ if (!p)
+ p = list->head;
+
+ if (leaf && !verify_key_in_leaf(leaf, hint, size)) {
+ goto again;
+ }
+
+again_lock:
+ sl_lock_node(p);
+ if (sl_node_dead(p)) {
+ sl_unlock_node(p);
+ goto again;
+ }
+
+ if (p != list->head) {
+ leaf = sl_entry(p);
+ /*
+ * the leaf we found was past the hint,
+ * go back one
+ */
+ if (sl_max_key(leaf) > hint) {
+ struct sl_node *locked = p;
+ p = p->ptrs[0].prev;
+ sl_unlock_node(locked);
+ goto again_lock;
+ }
+ last_end = align_start(sl_max_key(sl_entry(p)), align);
+ }
+
+ /*
+ * now walk at level 0 and find a hole. We could use lockless walks
+ * if we wanted to bang more on the insertion code, but this
+ * instead holds the lock on each node as we inspect it
+ *
+ * This is a little sloppy; insert will return -EEXIST if we get it
+ * wrong.
+ */
+ while(1) {
+ leaf = sl_next_leaf(list, p, 0);
+ if (!leaf)
+ break;
+
+ /* p and leaf are locked */
+ sl_lock_node(&leaf->node);
+ if (last_end > sl_max_key(leaf))
+ goto next;
+
+ for (i = 0; i < leaf->nr; i++) {
+ if (last_end > leaf->keys[i])
+ continue;
+ if (leaf->keys[i] - last_end >= size) {
+
+ if (last_end + size > limit) {
+ sl_unlock_node(&leaf->node);
+ goto out_rcu;
+ }
+
+ sl_unlock_node(p);
+ slot->key = last_end;
+ slot->size = size;
+ goto try_insert;
+ }
+ last_end = leaf->keys[i] + leaf->ptrs[i]->size;
+ last_end = align_start(last_end, align);
+ if (last_end + size > limit) {
+ sl_unlock_node(&leaf->node);
+ goto out_rcu;
+ }
+ }
+next:
+ sl_unlock_node(p);
+ p = &leaf->node;
+ }
+
+ if (last_end + size <= limit) {
+ sl_unlock_node(p);
+ slot->key = last_end;
+ slot->size = size;
+ goto try_insert;
+ }
+
+out_rcu:
+ /* we've failed */
+ sl_unlock_node(p);
+ rcu_read_unlock();
+ preempt_enable();
+
+ return ret;
+
+try_insert:
+ /*
+ * if the pending_level is zero or there is room in the
+ * leaf, we're ready to insert. This is true most of the
+ * time, and we won't have to drop our lock and give others
+ * the chance to race in and steal our spot.
+ */
+ if (leaf && (pending_level == 0 || leaf->nr < SKIP_KEYS_PER_NODE) &&
+ !sl_node_dead(&leaf->node) && (slot->key >= sl_min_key(leaf) &&
+ slot->key + slot->size <= sl_max_key(leaf))) {
+ ret = find_or_add_key(list, slot->key, size, leaf, slot,
+ preload_token, NULL);
+ rcu_read_unlock();
+ goto out;
+ }
+ /*
+ * no such luck, drop our lock and try the insert the
+ * old fashioned way
+ */
+ if (leaf)
+ sl_unlock_node(&leaf->node);
+
+ rcu_read_unlock();
+ ret = skiplist_insert(list, slot, preload_token, NULL);
+
+out:
+ /*
+ * if we get an -EEXIST here, it just means we lost the race.
+ * Return -EAGAIN to the caller so they can update the hint
+ */
+ if (ret == -EEXIST)
+ ret = -EAGAIN;
+
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL(skiplist_insert_hole);
+
+/*
+ * we erase one level at a time, from top to bottom.
+ * The basic idea is to find a live prev and next pair,
+ * and make them point to each other.
+ *
+ * For a given level, this takes locks on prev, node, next
+ * makes sure they all point to each other and then
+ * removes node from the middle.
+ *
+ * The node must already be marked dead and it must already
+ * be empty.
+ */
+static void erase_one_level(struct sl_list *list,
+ struct sl_node *node, int level)
+{
+ struct sl_node *prev;
+ struct sl_node *next;
+ struct sl_node *test_prev;
+ struct sl_node *test_next;
+
+again:
+ prev = find_live_prev(list, node, level);
+ skiplist_wait_pending_insert(prev);
+ sl_lock_node(prev);
+
+ test_prev = find_live_prev(list, node, level);
+ if (test_prev != prev || sl_node_dead(prev)) {
+ sl_unlock_node(prev);
+ goto again;
+ }
+
+again_next:
+ next = find_live_next(list, prev, level);
+ if (next) {
+ if (sl_node_inserting(next)) {
+ sl_unlock_node(prev);
+ skiplist_wait_pending_insert(next);
+ goto again;
+ }
+ sl_lock_node(next);
+ test_next = find_live_next(list, prev, level);
+ if (test_next != next || sl_node_dead(next)) {
+ sl_unlock_node(next);
+ goto again_next;
+ }
+ test_prev = find_live_prev(list, next, level);
+ test_next = find_live_next(list, prev, level);
+ if (test_prev != prev || test_next != next) {
+ sl_unlock_node(prev);
+ sl_unlock_node(next);
+ goto again;
+ }
+ }
+ rcu_assign_pointer(prev->ptrs[level].next, next);
+ if (next)
+ rcu_assign_pointer(next->ptrs[level].prev, prev);
+ else if (prev != list->head)
+ rcu_assign_pointer(list->head->ptrs[level].prev, prev);
+ else
+ rcu_assign_pointer(list->head->ptrs[level].prev, NULL);
+
+ sl_unlock_node(prev);
+ sl_unlock_node(next);
+}
+
+/*
+ * for a leaf already marked dead, unlink all the levels. The leaf must
+ * not be locked
+ */
+void sl_erase(struct sl_list *list, struct sl_leaf *leaf)
+{
+ int i;
+ int level = leaf->node.level;
+
+ for (i = level; i >= 0; i--)
+ erase_one_level(list, &leaf->node, i);
+}
+
+/*
+ * helper for skiplist_delete, this pushes pointers
+ * around to remove a single slot
+ */
+static void delete_slot(struct sl_leaf *leaf, int slot)
+{
+ if (slot != leaf->nr - 1) {
+ int i;
+ for (i = slot; i <= leaf->nr - 1; i++) {
+ leaf->keys[i] = leaf->keys[i + 1];
+ leaf->ptrs[i] = leaf->ptrs[i + 1];
+ smp_wmb();
+ }
+ } else if (leaf->nr > 1) {
+ leaf->max = leaf->keys[leaf->nr - 2] +
+ leaf->ptrs[leaf->nr - 2]->size;
+ smp_wmb();
+ }
+ leaf->nr--;
+}
+
+/*
+ * find a given [key, size] in the skiplist and remove it.
+ * If we find anything, we return the slot pointer that
+ * was stored in the tree.
+ *
+ * deletion involves a mostly lockless lookup to
+ * find the right leaf. Then we take the lock and find the
+ * correct slot.
+ *
+ * The slot is removed from the leaf, and if the leaf
+ * is now empty, it is removed from the skiplist.
+ */
+struct sl_slot *skiplist_delete(struct sl_list *list,
+ unsigned long key,
+ unsigned long size)
+{
+ struct sl_slot *slot_ret = NULL;
+ struct sl_leaf *leaf;
+ int slot;
+ int ret;
+
+ rcu_read_lock();
+again:
+ leaf = __skiplist_lookup_leaf(list, NULL, key, size);
+ if (!leaf)
+ goto out;
+
+ skiplist_wait_pending_insert(&leaf->node);
+
+ sl_lock_node(&leaf->node);
+ if (!verify_key_in_leaf(leaf, key, size)) {
+ sl_unlock_node(&leaf->node);
+ goto again;
+ }
+
+ ret = skiplist_search_leaf(leaf, key, size, &slot);
+ if (ret == 0) {
+ slot_ret = leaf->ptrs[slot];
+ } else {
+ sl_unlock_node(&leaf->node);
+ goto out;
+ }
+
+ delete_slot(leaf, slot);
+ if (leaf->nr == 0) {
+ /*
+ * sl_erase has to mess with the prev pointers, so
+ * we need to unlock it here
+ */
+ leaf->node.pending = SKIPLIST_PENDING_DEAD;
+ sl_unlock_node(&leaf->node);
+ sl_erase(list, leaf);
+ skiplist_put_leaf(leaf);
+ } else {
+ sl_unlock_node(&leaf->node);
+ }
+out:
+ rcu_read_unlock();
+ return slot_ret;
+}
+EXPORT_SYMBOL(skiplist_delete);
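+
+/*
+ * Minimal caller sketch for skiplist_delete() (illustrative; assumes
+ * the caller refcounts its slots the way the test module does with
+ * put_slot()):
+ *
+ *	slot = skiplist_delete(list, key, size);
+ *	if (slot)
+ *		put_slot(slot);		(drop the ref the skiplist held)
+ *
+ * A NULL return simply means nothing in the list overlapped the
+ * requested [key, key + size) range.
+ */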
+
+/*
+ * this deletes an entire leaf; the caller must have some
+ * other way to free all the slots that are linked in. The
+ * leaf must be locked.
+ */
+void skiplist_delete_leaf(struct sl_list *list, struct sl_leaf *leaf,
+ struct sl_leaf **cache_next)
+{
+ struct sl_node *next = NULL;
+
+ rcu_read_lock();
+ assert_locked_node(&leaf->node);
+ leaf->nr = 0;
+
+ BUG_ON(sl_node_inserting(&leaf->node));
+
+ leaf->node.pending = SKIPLIST_PENDING_DEAD;
+
+ if (cache_next) {
+ *cache_next = NULL;
+ next = find_live_next(list, &leaf->node, 0);
+ }
+
+ sl_unlock_node(&leaf->node);
+ sl_erase(list, leaf);
+ /* once for the skiplist */
+ skiplist_put_leaf(leaf);
+ /* once for the caller */
+ skiplist_put_leaf(leaf);
+
+ if (cache_next && next && !sl_node_dead(next)) {
+ int ret;
+
+ leaf = sl_entry(next);
+ ret = skiplist_get_leaf_not_zero(leaf);
+ if (ret)
+ *cache_next = leaf;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(skiplist_delete_leaf);
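+
+/*
+ * Caller sketch for skiplist_delete_leaf() (illustrative; victims and
+ * put_slot() are placeholders for whatever the caller uses).  The leaf
+ * comes back locked from skiplist_first_leaf() and the caller owns the
+ * slots once the leaf is gone:
+ *
+ *	leaf = skiplist_first_leaf(list);
+ *	nr = leaf->nr;
+ *	memcpy(victims, leaf->ptrs, nr * sizeof(victims[0]));
+ *	skiplist_delete_leaf(list, leaf, NULL);
+ *	for (i = 0; i < nr; i++)
+ *		put_slot(victims[i]);
+ *
+ * delete_all_items_skiplist() and delete_many_skiplist() in
+ * lib/skiplist_test.c exercise this pattern.
+ */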
+
+int sl_init_list(struct sl_list *list, gfp_t mask)
+{
+ list->head = kmalloc(sl_node_size(SKIP_MAXLEVEL), mask);
+ if (!list->head)
+ return -ENOMEM;
+ sl_init_node(list->head, SKIP_MAXLEVEL);
+ list->level = 0;
+ return 0;
+}
+EXPORT_SYMBOL(sl_init_list);
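+
+/*
+ * Setup sketch (illustrative): embed or allocate a struct sl_list,
+ * initialize it once, then feed it slots with skiplist_preload() and
+ * skiplist_insert().  mylist here is a hypothetical name:
+ *
+ *	struct sl_list mylist;
+ *
+ *	ret = sl_init_list(&mylist, GFP_KERNEL);
+ *	if (ret)
+ *		return ret;
+ *
+ * run_initial_fill_skiplist() in lib/skiplist_test.c shows the full
+ * preload + insert sequence.
+ */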
+
+static int skiplist_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ struct skip_preload *skp;
+ struct sl_leaf *l;
+ int level;
+ int i;
+
+ /* Free per-cpu pool of preloaded nodes */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ skp = &per_cpu(skip_preloads, cpu);
+ for (i = 0; i < SKIP_MAXLEVEL + 1; i++) {
+ l = skp->preload[i];
+ if (!l)
+ continue;
+ level = l->node.level;
+ kmem_cache_free(slab_caches[level], l);
+ skp->preload[i] = NULL;
+ }
+ }
+ return NOTIFY_OK;
+}
+
+void __init skiplist_init(void)
+{
+ char buffer[16];
+ int i;
+
+ hotcpu_notifier(skiplist_callback, 0);
+ for (i = 0; i <= SKIP_MAXLEVEL; i++) {
+ snprintf(buffer, 16, "skiplist-%d", i);
+ slab_caches[i] = kmem_cache_create(buffer,
+ sl_leaf_size(i), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_DESTROY_BY_RCU,
+ NULL);
+ }
+}
+
diff --git a/lib/skiplist_test.c b/lib/skiplist_test.c
new file mode 100644
index 0000000..8f44829
--- /dev/null
+++ b/lib/skiplist_test.c
@@ -0,0 +1,882 @@
+/*
+ * (C) 2013 Fusion-io
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free Software
+ * Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/moduleparam.h>
+#include <linux/skiplist.h>
+#include <linux/kthread.h>
+#include <linux/rbtree.h>
+#include <linux/random.h>
+
+static int threads = 1;
+static int rounds = 100;
+static int items = 100;
+static int module_exiting;
+static struct completion startup = COMPLETION_INITIALIZER(startup);
+static DEFINE_MUTEX(fill_mutex);
+static int filled;
+
+static struct timespec *times;
+
+#define FILL_TIME_INDEX 0
+#define CHECK_TIME_INDEX 1
+#define DEL_TIME_INDEX 2
+#define RANDOM_INS_INDEX 3
+#define RANDOM_LOOKUP_INDEX 4
+#define FIRST_THREAD_INDEX 5
+
+#define SKIPLIST_RCU_BENCH 1
+#define SKIPLIST_BENCH 2
+#define RBTREE_BENCH 3
+
+static int benchmark = SKIPLIST_RCU_BENCH;
+
+module_param(threads, int, 0);
+module_param(rounds, int, 0);
+module_param(items, int, 0);
+module_param(benchmark, int, 0);
+
+MODULE_PARM_DESC(threads, "number of threads to run");
+MODULE_PARM_DESC(rounds, "how many random operations to run");
+MODULE_PARM_DESC(items, "number of items to fill the list with");
+MODULE_PARM_DESC(benchmark, "benchmark to run 1=skiplist-rcu 2=skiplist-locking 3=rbtree");
+MODULE_LICENSE("GPL");
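+
+/*
+ * Example invocation, assuming the module builds as skiplist_test.ko
+ * (the exact name depends on the kbuild setup):
+ *
+ *	modprobe skiplist_test threads=4 items=100000 rounds=1000 benchmark=1
+ *
+ * benchmark=1 exercises the rcu lookups, 2 the locking lookups and
+ * 3 the rbtree baseline.
+ */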
+
+static atomic_t threads_running = ATOMIC_INIT(0);
+
+/*
+ * since the skiplist code is more concurrent, it is also more likely to
+ * have races into the same slot during our delete/insert bashing run.
+ * This counts the number of delete/insert pairs done so we can
+ * make sure the results are roughly accurate
+ */
+static atomic_t pops_done = ATOMIC_INIT(0);
+
+static struct kmem_cache *slot_cache;
+struct sl_list skiplist;
+
+spinlock_t rbtree_lock;
+struct rb_root rb_root = RB_ROOT;
+
+struct rbtree_item {
+ struct rb_node rb_node;
+ unsigned long key;
+ unsigned long size;
+};
+
+static int __insert_one_rbtree(struct rb_root *root, struct rbtree_item *ins)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct rbtree_item *item;
+ unsigned long key = ins->key;
+ int ret = -EEXIST;
+
+ while (*p) {
+ parent = *p;
+ item = rb_entry(parent, struct rbtree_item, rb_node);
+
+ if (key < item->key)
+ p = &(*p)->rb_left;
+ else if (key >= item->key + item->size)
+ p = &(*p)->rb_right;
+ else
+ goto out;
+ }
+
+ rb_link_node(&ins->rb_node, parent, p);
+ rb_insert_color(&ins->rb_node, root);
+ ret = 0;
+out:
+
+ return ret;
+}
+
+static int insert_one_rbtree(struct rb_root *root, unsigned long key,
+ unsigned long size)
+{
+ int ret;
+ struct rbtree_item *ins;
+
+ ins = kmalloc(sizeof(*ins), GFP_KERNEL);
+ if (!ins)
+ return -ENOMEM;
+ ins->key = key;
+ ins->size = size;
+
+ spin_lock(&rbtree_lock);
+ ret = __insert_one_rbtree(root, ins);
+ spin_unlock(&rbtree_lock);
+
+ if (ret)
+ kfree(ins);
+ return ret;
+}
+
+static struct rbtree_item *__lookup_one_rbtree(struct rb_root *root, unsigned long key)
+{
+ struct rb_node *p = root->rb_node;
+ struct rbtree_item *item;
+ struct rbtree_item *ret;
+
+ while (p) {
+ item = rb_entry(p, struct rbtree_item, rb_node);
+
+ if (key < item->key)
+ p = p->rb_left;
+ else if (key >= item->key + item->size)
+ p = p->rb_right;
+ else {
+ ret = item;
+ goto out;
+ }
+ }
+
+ ret = NULL;
+out:
+ return ret;
+}
+
+static struct rbtree_item *__lookup_first_rbtree(struct rb_root *root, unsigned long key)
+{
+ struct rb_node *p = root->rb_node;
+ struct rbtree_item *item;
+ struct rbtree_item *ret;
+
+ while (p) {
+ item = rb_entry(p, struct rbtree_item, rb_node);
+
+ if (key < item->key)
+ p = p->rb_left;
+ else if (key >= item->key + item->size)
+ p = p->rb_right;
+ else {
+ ret = item;
+ goto out;
+ }
+ }
+
+ ret = NULL;
+out:
+ return ret;
+}
+
+static int lookup_one_rbtree(struct rb_root *root, unsigned long key)
+{
+ struct rbtree_item *item;
+ int ret;
+
+ spin_lock(&rbtree_lock);
+ item = __lookup_one_rbtree(root, key);
+ if (item)
+ ret = 0;
+ else
+ ret = -ENOENT;
+ spin_unlock(&rbtree_lock);
+
+ return ret;
+}
+
+static int delete_many_rbtree(struct rb_root *root, unsigned long key)
+{
+ int ret = 0;
+ struct rbtree_item *item;
+ struct rbtree_item **victims;
+ struct rb_node *next;
+ int nr_victims = 128;
+ int found = 0;
+ int i;
+
+ nr_victims = min(nr_victims, items / 2);
+
+ victims = kzalloc(nr_victims * sizeof(victims[0]), GFP_KERNEL);
+ if (!victims)
+ return -ENOMEM;
+ spin_lock(&rbtree_lock);
+ item = __lookup_first_rbtree(root, key);
+ if (!item) {
+ spin_unlock(&rbtree_lock);
+ goto out;
+ }
+
+ while (found < nr_victims) {
+ victims[found] = item;
+ next = rb_next(&item->rb_node);
+ rb_erase(&item->rb_node, root);
+ found++;
+ if (!next)
+ break;
+ item = rb_entry(next, struct rbtree_item, rb_node);
+ }
+ spin_unlock(&rbtree_lock);
+
+ for (i = 0; i < found; i++) {
+ item = victims[i];
+ spin_lock(&rbtree_lock);
+ ret = __insert_one_rbtree(root, item);
+ if (ret) {
+ printk(KERN_CRIT "delete_many unable to insert %lu\n", key);
+ kfree(item);
+ }
+ spin_unlock(&rbtree_lock);
+ }
+out:
+ atomic_add(found, &pops_done);
+ kfree(victims);
+ return ret;
+}
+
+static int delete_one_rbtree(struct rb_root *root, unsigned long key)
+{
+ int ret = -ENOENT;
+ struct rbtree_item *item;
+
+ spin_lock(&rbtree_lock);
+ item = __lookup_first_rbtree(root, key);
+ if (!item)
+ goto out;
+
+ rb_erase(&item->rb_node, root);
+
+ ret = __insert_one_rbtree(root, item);
+ if (ret) {
+ printk(KERN_CRIT "delete_one unable to insert %lu\n", key);
+ goto out;
+ }
+ ret = 0;
+out:
+ spin_unlock(&rbtree_lock);
+ atomic_inc(&pops_done);
+ return ret;
+}
+
+static int run_initial_fill_rbtree(void)
+{
+ unsigned long i;
+ unsigned long key;
+ int ret;
+ int inserted = 0;
+
+ for (i = 0; i < items; i++) {
+ key = i * 4096;
+ ret = insert_one_rbtree(&rb_root, key, 4096);
+ if (ret)
+ return ret;
+ inserted++;
+ }
+ printk("rbtree inserted %d items\n", inserted);
+ return 0;
+}
+
+static unsigned long tester_random(void)
+{
+ return (prandom_u32() % items) * 4096;
+}
+
+static void random_insert_rbtree(void)
+{
+ unsigned long i;
+ unsigned long key;
+ int ret;
+ int inserted = 0;
+
+ for (i = 0; i < items; i++) {
+ key = tester_random();
+ ret = insert_one_rbtree(&rb_root, key, 4096);
+ if (!ret)
+ inserted++;
+ }
+ printk("rbtree radomly inserted %d items\n", inserted);
+}
+
+static void random_lookup_rbtree(void)
+{
+ int i;
+ unsigned long key;
+ int ret;
+ int found = 0;
+
+ for (i = 0; i < items; i++) {
+ key = tester_random();
+ ret = lookup_one_rbtree(&rb_root, key);
+ if (!ret)
+ found++;
+ }
+ printk("rbtree randomly searched %d items\n", found);
+}
+
+static void check_post_work_rbtree(void)
+{
+ unsigned long key = 0;
+ int errors = 0;
+ struct rb_node *p = rb_first(&rb_root);
+ struct rbtree_item *item;
+
+ while (p) {
+ item = rb_entry(p, struct rbtree_item, rb_node);
+ if (item->key != key) {
+ if (!errors)
+ printk("rbtree failed to find key %lu\n", key);
+ errors++;
+ }
+ key += 4096;
+ p = rb_next(p);
+ }
+ printk(KERN_CRIT "rbtree check found %d errors\n", errors);
+}
+
+static void delete_all_items_rbtree(int check)
+{
+ unsigned long key = 0;
+ int errors = 0;
+ struct rb_node *p = rb_first(&rb_root);
+ struct rb_node *next;
+ struct rbtree_item *item;
+
+ while (p) {
+ item = rb_entry(p, struct rbtree_item, rb_node);
+ if (check && item->key != key) {
+ if (!errors)
+ printk("rbtree failed to find key %lu\n", key);
+ errors++;
+ }
+ key += 4096;
+ next = rb_next(p);
+ rb_erase(p, &rb_root);
+ kfree(item);
+ p = next;
+ }
+ printk(KERN_CRIT "rbtree deletion found %d errors\n", errors);
+}
+
+static void put_slot(struct sl_slot *slot)
+{
+ if (atomic_dec_and_test(&slot->refs))
+ kmem_cache_free(slot_cache, slot);
+}
+
+static int insert_one_skiplist(struct sl_list *skiplist, unsigned long key,
+ unsigned long size, struct sl_leaf **cache)
+{
+ int ret;
+ int preload_token;
+ struct sl_slot *slot;
+
+ slot = kmem_cache_alloc(slot_cache, GFP_KERNEL);
+ if (!slot)
+ return -ENOMEM;
+
+ slot->key = key;
+ slot->size = size;
+ atomic_set(&slot->refs, 0);
+
+ preload_token = skiplist_preload(skiplist, GFP_KERNEL);
+ if (preload_token < 0) {
+ ret = preload_token;
+ goto out;
+ }
+
+ ret = skiplist_insert(skiplist, slot, preload_token, cache);
+ preempt_enable();
+
+out:
+ if (ret)
+ kmem_cache_free(slot_cache, slot);
+
+ return ret;
+}
+
+static int run_initial_fill_skiplist(void)
+{
+ unsigned long i;
+ unsigned long key;
+ int ret = 0;
+ int inserted = 0;
+ struct sl_leaf *cache = NULL;
+
+ ret = sl_init_list(&skiplist, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < items; i++) {
+ key = i * 4096;
+ ret = insert_one_skiplist(&skiplist, key, 4096, &cache);
+ if (ret)
+ break;
+ inserted++;
+ }
+ if (cache)
+ skiplist_put_leaf(cache);
+
+ printk("skiplist inserted %d items\n", inserted);
+ return ret;
+}
+
+static void check_post_work_skiplist(void)
+{
+ unsigned long i;
+ unsigned long key = 0;
+ struct sl_slot *slot;
+ struct sl_leaf *leaf;
+ struct sl_leaf *next;
+ int errors = 0;
+ unsigned long found = 0;
+
+ leaf = skiplist_first_leaf(&skiplist);
+ while (leaf) {
+ for (i = 0; i < leaf->nr; i++) {
+ slot = leaf->ptrs[i];
+ if (slot->key != key) {
+ if (errors == 0)
+ printk("key mismatch wanted %lu found %lu\n", key, slot->key);
+ errors++;
+ }
+ key += 4096;
+ }
+ found += leaf->nr;
+ next = skiplist_next_leaf(&skiplist, leaf);
+ skiplist_unlock_leaf(leaf);
+ skiplist_put_leaf(leaf);
+ leaf = next;
+ }
+ if (found != items)
+ printk("skiplist check only found %lu items\n", found);
+ printk(KERN_CRIT "skiplist check found %lu items with %d errors\n", found, errors);
+}
+
+static void delete_all_items_skiplist(int check)
+{
+ unsigned long i;
+ unsigned long key = 0;
+ struct sl_slot *slot;
+ struct sl_leaf *leaf;
+ int errors = 0;
+ unsigned long slots_empty = 0;
+ unsigned long total_leaves = 0;
+ unsigned long found = 0;
+
+ /*
+ * the benchmark is done at this point, so we can safely
+ * just free all the items. In the real world you should
+ * not free anything until the leaf is fully removed from
+ * the tree (a sketch of that ordering follows this function).
+ */
+ while (1) {
+ leaf = skiplist_first_leaf(&skiplist);
+ if (!leaf)
+ break;
+ for (i = 0; i < leaf->nr; i++) {
+ slot = leaf->ptrs[i];
+ if (check && slot->key != key) {
+ if (errors == 0)
+ printk("delete key mismatch wanted %lu found %lu\n", key, slot->key);
+ errors++;
+ }
+ key += 4096;
+ /* delete the one ref from the skiplist */
+ put_slot(slot);
+ }
+ found += leaf->nr;
+ slots_empty += SKIP_KEYS_PER_NODE - leaf->nr;
+ total_leaves++;
+ skiplist_delete_leaf(&skiplist, leaf, NULL);
+ }
+ printk(KERN_CRIT "skiplist delete found %lu items with %d errors %lu empty slots %lu leaves\n", found, errors, slots_empty, total_leaves);
+}
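+
+/*
+ * Hedged sketch of the "real world" ordering mentioned above: unlink
+ * the leaf first, then free the slots, and only free the slot memory
+ * after a grace period.  This assumes the caller's slot structure
+ * embeds an rcu_head (the test module's struct sl_slot does not);
+ * free_slot_rcu() and victims are placeholders:
+ *
+ *	skiplist_delete_leaf(list, leaf, NULL);
+ *	for (i = 0; i < nr; i++)
+ *		call_rcu(&victims[i]->rcu_head, free_slot_rcu);
+ */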
+
+static int lookup_one_skiplist(struct sl_list *skiplist, unsigned long key)
+{
+ int ret = 0;
+ struct sl_slot *slot = NULL;
+
+ if (benchmark == SKIPLIST_RCU_BENCH) {
+ slot = skiplist_lookup_rcu(skiplist, key, 4096);
+ } else if (benchmark == SKIPLIST_BENCH) {
+ slot = skiplist_lookup(skiplist, key, 4096);
+ if (slot)
+ put_slot(slot);
+ }
+ if (!slot)
+ ret = -ENOENT;
+ return ret;
+}
+
+static int delete_one_skiplist(struct sl_list *skiplist, unsigned long key)
+{
+ int ret = 0;
+ struct sl_slot *slot = NULL;
+ int preload_token;
+
+ slot = skiplist_delete(skiplist, key, 1);
+ if (!slot)
+ return -ENOENT;
+
+ preload_token = skiplist_preload(skiplist, GFP_KERNEL);
+ if (preload_token < 0) {
+ ret = preload_token;
+ goto out_no_preempt;
+ }
+
+ ret = skiplist_insert(skiplist, slot, preload_token, NULL);
+ if (ret) {
+ printk(KERN_CRIT "failed to insert key %lu ret %d\n", key, ret);
+ goto out;
+ }
+ put_slot(slot);
+ ret = 0;
+ atomic_inc(&pops_done);
+out:
+ preempt_enable();
+out_no_preempt:
+ return ret;
+}
+
+static int delete_many_skiplist(struct sl_list *skiplist, unsigned long key)
+{
+ int ret = 0;
+ int preload_token;
+ struct sl_slot **victims;
+ struct sl_leaf *leaf;
+ struct sl_leaf *next = NULL;
+ int nr_victims = 128;
+ int found = 0;
+ int i;
+ struct sl_leaf *cache = NULL;
+
+ nr_victims = min(nr_victims, items / 2);
+
+ victims = kzalloc(nr_victims * sizeof(victims[0]), GFP_KERNEL);
+ if (!victims)
+ return -ENOMEM;
+ /*
+ * this is intentionally deleting adjacent items to empty
+ * skiplist leaves. The goal is to find races between
+ * leaf deletion and the rest of the code
+ */
+ leaf = skiplist_lookup_first_leaf(skiplist, key, 1);
+ while (leaf) {
+ memcpy(victims + found, leaf->ptrs, sizeof(victims[0]) * leaf->nr);
+ found += leaf->nr;
+
+ skiplist_delete_leaf(skiplist, leaf, &next);
+ leaf = next;
+ if (!leaf)
+ break;
+
+ if (leaf->nr + found > nr_victims) {
+ skiplist_put_leaf(leaf);
+ break;
+ }
+
+ skiplist_wait_pending_insert(&leaf->node);
+
+ skiplist_lock_leaf(leaf);
+ if (sl_node_dead(&leaf->node) ||
+ leaf->nr + found > nr_victims) {
+ skiplist_unlock_leaf(leaf);
+ skiplist_put_leaf(leaf);
+ break;
+ }
+ }
+
+ for (i = 0; i < found; i++) {
+ preload_token = skiplist_preload(skiplist, GFP_KERNEL);
+ if (preload_token < 0) {
+ ret = preload_token;
+ goto out;
+ }
+
+ ret = skiplist_insert(skiplist, victims[i], preload_token, &cache);
+ if (ret) {
+ printk(KERN_CRIT "failed to insert key %lu ret %d\n", key, ret);
+ preempt_enable();
+ goto out;
+ }
+ /* insert added an extra ref, take it away here */
+ put_slot(victims[i]);
+ ret = 0;
+ preempt_enable();
+ }
+out:
+ if (cache)
+ skiplist_put_leaf(cache);
+
+ atomic_add(found, &pops_done);
+ kfree(victims);
+ return ret;
+}
+
+static void random_lookup_skiplist(void)
+{
+ int i;
+ unsigned long key;
+ int ret;
+ int found = 0;
+
+ for (i = 0; i < items; i++) {
+ key = tester_random();
+ ret = lookup_one_skiplist(&skiplist, key);
+ if (!ret)
+ found++;
+ }
+ printk("skiplist randomly searched %d items\n", found);
+}
+
+static void random_insert_skiplist(void)
+{
+ int i;
+ unsigned long key;
+ int ret;
+ int inserted = 0;
+
+ for (i = 0; i < items; i++) {
+ key = tester_random();
+ ret = insert_one_skiplist(&skiplist, key, 4096, NULL);
+ if (!ret)
+ inserted++;
+ }
+ printk("skiplist randomly inserted %d items\n", inserted);
+}
+
+static void tvsub(struct timespec *tdiff, struct timespec *t1,
+ struct timespec *t0)
+{
+ tdiff->tv_sec = t1->tv_sec - t0->tv_sec;
+ tdiff->tv_nsec = t1->tv_nsec - t0->tv_nsec;
+ if (tdiff->tv_nsec < 0 && tdiff->tv_sec > 0) {
+ tdiff->tv_sec--;
+ tdiff->tv_nsec += 1000000000ULL;
+ }
+
+ /* time shouldn't go backwards!!! */
+ if (tdiff->tv_nsec < 0 || t1->tv_sec < t0->tv_sec) {
+ tdiff->tv_sec = 0;
+ tdiff->tv_nsec = 0;
+ }
+}
+
+static void pretty_time(struct timespec *ts, unsigned long long *seconds, unsigned long long *ms)
+{
+ unsigned long long m;
+
+ *seconds = ts->tv_sec;
+
+ m = ts->tv_nsec / 1000000ULL;
+ *ms = m;
+}
+
+static void runbench(int thread_index)
+{
+ int ret = 0;
+ unsigned long i;
+ unsigned long key;
+ struct timespec start;
+ struct timespec cur;
+ unsigned long long sec;
+ unsigned long long ms;
+ char *tag = "skiplist-rcu";
+
+ if (benchmark == SKIPLIST_BENCH)
+ tag = "skiplist-locking";
+ else if (benchmark == RBTREE_BENCH)
+ tag = "rbtree";
+
+ mutex_lock(&fill_mutex);
+
+ if (filled == 0) {
+ start = current_kernel_time();
+
+ printk(KERN_CRIT "Running %s benchmark\n", tag);
+
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ ret = run_initial_fill_skiplist();
+ else if (benchmark == RBTREE_BENCH)
+ ret = run_initial_fill_rbtree();
+
+ if (ret < 0) {
+ printk(KERN_CRIT "failed to setup initial tree ret %d\n", ret);
+ filled = ret;
+ } else {
+ filled = 1;
+ }
+ cur = current_kernel_time();
+ tvsub(times + FILL_TIME_INDEX, &cur, &start);
+ printk("initial fill done\n");
+ }
+
+ mutex_unlock(&fill_mutex);
+ if (filled < 0)
+ return;
+
+ start = current_kernel_time();
+
+ for (i = 0; i < rounds; i++) {
+ key = tester_random();
+
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ ret = lookup_one_skiplist(&skiplist, key);
+ else if (benchmark == RBTREE_BENCH)
+ ret = lookup_one_rbtree(&rb_root, key);
+
+ cond_resched();
+
+ key = tester_random();
+
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ ret = delete_many_skiplist(&skiplist, key);
+ else if (benchmark == RBTREE_BENCH)
+ ret = delete_many_rbtree(&rb_root, key);
+
+ cond_resched();
+
+ key = tester_random();
+
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ ret = delete_one_skiplist(&skiplist, key);
+ else if (benchmark == RBTREE_BENCH)
+ ret = delete_one_rbtree(&rb_root, key);
+
+ cond_resched();
+ }
+
+ cur = current_kernel_time();
+ tvsub(times + FIRST_THREAD_INDEX + thread_index, &cur, &start);
+
+ if (!atomic_dec_and_test(&threads_running)) {
+ return;
+ }
+
+ start = current_kernel_time();
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ check_post_work_skiplist();
+ else if (benchmark == RBTREE_BENCH)
+ check_post_work_rbtree();
+
+ cur = current_kernel_time();
+
+ tvsub(times + CHECK_TIME_INDEX, &cur, &start);
+
+ start = current_kernel_time();
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH)
+ delete_all_items_skiplist(1);
+ else if (benchmark == RBTREE_BENCH)
+ delete_all_items_rbtree(1);
+ cur = current_kernel_time();
+
+ tvsub(times + DEL_TIME_INDEX, &cur, &start);
+
+ start = current_kernel_time();
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH) {
+ random_insert_skiplist();
+ } else if (benchmark == RBTREE_BENCH) {
+ random_insert_rbtree();
+ }
+ cur = current_kernel_time();
+
+ tvsub(times + RANDOM_INS_INDEX, &cur, &start);
+
+ start = current_kernel_time();
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH) {
+ random_lookup_skiplist();
+ } else if (benchmark == RBTREE_BENCH) {
+ random_lookup_rbtree();
+ }
+ cur = current_kernel_time();
+
+ tvsub(times + RANDOM_LOOKUP_INDEX, &cur, &start);
+
+ if (benchmark == SKIPLIST_RCU_BENCH || benchmark == SKIPLIST_BENCH) {
+ delete_all_items_skiplist(0);
+ } else if (benchmark == RBTREE_BENCH) {
+ delete_all_items_rbtree(0);
+ }
+
+ pretty_time(&times[FILL_TIME_INDEX], &sec, &ms);
+ printk("%s fill time %llu s %llu ms\n", tag, sec, ms);
+ pretty_time(&times[CHECK_TIME_INDEX], &sec, &ms);
+ printk("%s check time %llu s %llu ms\n", tag, sec, ms);
+ pretty_time(&times[DEL_TIME_INDEX], &sec, &ms);
+ printk("%s del time %llu s %llu ms\n", tag, sec, ms);
+ pretty_time(&times[RANDOM_INS_INDEX], &sec, &ms);
+ printk("%s random insert time %llu s %llu ms\n", tag, sec, ms);
+ pretty_time(&times[RANDOM_LOOKUP_INDEX], &sec, &ms);
+ printk("%s random lookup time %llu s %llu ms\n", tag, sec, ms);
+ for (i = 0; i < threads; i++) {
+ pretty_time(&times[FIRST_THREAD_INDEX + i], &sec, &ms);
+ printk("%s thread %lu time %llu s %llu ms\n", tag, i, sec, ms);
+ }
+ }
+
+ printk("worker thread pops done %d\n", atomic_read(&pops_done));
+
+ kfree(times);
+}
+
+static int skiptest_thread(void *index)
+{
+ unsigned long thread_index = (unsigned long)index;
+ complete(&startup);
+ runbench(thread_index);
+ complete(&startup);
+ return 0;
+}
+
+static int __init skiptest_init(void)
+{
+ unsigned long i;
+ init_completion(&startup);
+ slot_cache = kmem_cache_create("skiplist_slot", sizeof(struct sl_slot), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+ SLAB_DESTROY_BY_RCU, NULL);
+
+ if (!slot_cache)
+ return -ENOMEM;
+
+ spin_lock_init(&rbtree_lock);
+
+ printk("skiptest benchmark module (%d threads) (%d items) (%d rounds)\n", threads, items, rounds);
+
+ times = kmalloc(sizeof(times[0]) * (threads + FIRST_THREAD_INDEX),
+ GFP_KERNEL);
+ if (!times) {
+ kmem_cache_destroy(slot_cache);
+ return -ENOMEM;
+ }
+
+ atomic_set(&threads_running, threads);
+ for (i = 0; i < threads; i++) {
+ kthread_run(skiptest_thread, (void *)i, "skiptest_thread");
+ }
+ for (i = 0; i < threads; i++)
+ wait_for_completion(&startup);
+ return 0;
+}
+
+static void __exit skiptest_exit(void)
+{
+ int i;
+ module_exiting = 1;
+
+ for (i = 0; i < threads; i++) {
+ wait_for_completion(&startup);
+ }
+
+ synchronize_rcu();
+ kmem_cache_destroy(slot_cache);
+ printk("all skiptest threads done\n");
+}
+
+module_init(skiptest_init);
+module_exit(skiptest_exit);
--
1.8.2