All of lore.kernel.org
 help / color / mirror / Atom feed
From: joern@logfs.org
To: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mtd@lists.infradead.org
Subject: [patch 10/15] fs/logfs/memtree.c
Date: Tue, 01 Apr 2008 20:13:08 +0200	[thread overview]
Message-ID: <20080401181332.853833010@logfs.org> (raw)
In-Reply-To: 20080401181308.512473173@logfs.org

--- /dev/null	2008-04-02 16:29:12.813336657 +0200
+++ linux-2.6.24logfs/fs/logfs/memtree.c	2008-04-01 21:43:14.593326689 +0200
@@ -0,0 +1,405 @@
+/*
+ * fs/logfs/memtree.c	- Simple In-memory B+Tree
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2007 Joern Engel <joern@logfs.org>
+ *
+ *
+ * This could possibly get moved to lib/.
+ *
+ * A relatively simple B+Tree implementation.  I have written it as a learning
+ * excercise to understand how B+Trees work.  Turned out to be useful as well.
+ *
+ * B+Trees can be used similar to Linux radix trees (which don't have anything
+ * in common with textbook radix trees, beware).  Prerequisite for them working
+ * well is that access to a random tree node is much faster than a large number
+ * of operations within each node.
+ *
+ * Disks have fulfilled the prerequite for a long time.  More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased.  Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space.  Between 25% and 50% of the memory is
+ * occupied with valid pointers.  When densely populated, radix trees contain
+ * ~98% pointers - hard to beat.  Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal, lookup will return NULL when no entry
+ * was found.
+ *
+ * Two tricks were used that are not commonly found in textbooks.  First, the
+ * lowest values are to the right, not to the left.  All used slots within a
+ * node are on the left, all unused slots contain NUL values.  Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ *
+ * Second trick is to special-case the key "0" or NUL.  As seen above, this
+ * value indicates an unused slot, so such a value should not be stored in the
+ * tree itself.  Instead it is stored in the null_ptr field in the btree_head.
+ */
+/* FIXME: use mempool for allocations */
+#include "logfs.h"
+
+/*
+ * Prerequisite of B+Trees performing well is that node lookup is much slower
+ * than a large number of operations within a node.  That can be true if the
+ * node size is identical to cacheline size.  All that is highly
+ * machine-dependent, just like the #define below is not.
+ *
+ * Patches to do something smarter are welcome.  Just beware that too small
+ * node with less than 8 slots have a bad fan-out and won't perform well
+ * either.
+ */
+#if BITS_PER_LONG == 32
+#define BTREE_NODES 20	/* 32bit, 240 byte nodes */
+#else
+#define BTREE_NODES 16	/* 64bit, 256 byte nodes */
+#endif
+
+struct btree_node {
+	u64 key;
+	struct btree_node *node;
+};
+
+void btree_init(struct btree_head *head)
+{
+	head->node = NULL;
+	head->height = 0;
+	head->null_ptr = NULL;
+}
+
+#if 0
+static void __dump_tree(struct btree_node *node, int height)
+{
+	int i;
+
+	if (!height)
+		return;
+
+	printk(KERN_DEBUG"%p ", node);
+	for (i = 0; i < BTREE_NODES; i++)
+		printk("(%llx,%p) ", node[i].key, node[i].node);
+	printk("\n");
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key)
+			__dump_tree(node[i].node, height-1);
+}
+
+static void dump_tree(struct btree_head *head)
+{
+	printk(KERN_DEBUG"%p\n", head->null_ptr);
+	__dump_tree(head->node, head->height);
+}
+#endif
+
+static u64 btree_last(struct btree_head *head)
+{
+	int height = head->height;
+	struct btree_node *node = head->node;
+
+	if (height == 0)
+		return 0;
+
+	for ( ; height > 1; height--)
+		node = node[0].node;
+
+	return node[0].key;
+}
+
+void *btree_lookup(struct btree_head *head, u64 key)
+{
+	int i, height = head->height;
+	struct btree_node *node = head->node;
+
+	if (key == 0)
+		return head->null_ptr;
+
+	if (height == 0)
+		return NULL;
+
+	for ( ; height > 1; height--) {
+		for (i = 0; i < BTREE_NODES; i++)
+			if (node[i].key <= key)
+				break;
+		node = node[i].node;
+		if (!node)
+			return NULL;
+	}
+
+	if (!node)
+		return NULL;
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key == key)
+			return node[i].node;
+
+	return NULL;
+}
+
+/*
+ * Returns two values:
+ * pos - the position of the first slot equal or less than key
+ * fill - the number of positions filled with any value
+ */
+static void find_pos(struct btree_node *node, u64 key, int *pos, int *fill)
+{
+	int i;
+
+	for (i = 0; i < BTREE_NODES; i++)
+		if (node[i].key <= key)
+			break;
+	*pos = i;
+	for (i = *pos; i < BTREE_NODES; i++)
+		if (node[i].key == 0)
+			break;
+	*fill = i;
+}
+
+/*
+ * locate the correct leaf node in the btree
+ */
+static struct btree_node *find_level(struct btree_head *head, u64 key,
+		int level)
+{
+	struct btree_node *node = head->node;
+	int i, height;
+
+	for (height = head->height; height > level; height--) {
+		for (i = 0; i < BTREE_NODES; i++)
+			if (node[i].key <= key)
+				break;
+
+		if ((i == BTREE_NODES) || !node[i].key) {
+			/* right-most key is too large, update it */
+			i--;
+			node[i].key = key;
+		}
+		BUG_ON(i < 0);
+		node = node[i].node;
+	}
+	BUG_ON(!node);
+	return node;
+}
+
+static int btree_grow(struct btree_head *head)
+{
+	struct btree_node *node;
+
+	node = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+	BUG_ON(!node);
+	if (!node)
+		return -ENOMEM;
+	if (head->node) {
+		node->key = head->node[BTREE_NODES-1].key;
+		node->node = head->node;
+	}
+	head->node = node;
+	head->height++;
+	return 0;
+}
+
+static int btree_insert_level(struct btree_head *head, u64 key, void *ptr,
+		int level)
+{
+	struct btree_node *node;
+	int i, pos, fill, err;
+
+	BUG_ON(!ptr);
+	if (key == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		BUG_ON(level != 1);
+		head->null_ptr = ptr;
+		return 0;
+	}
+
+	if (head->height < level) {
+		err = btree_grow(head);
+		if (err)
+			return err;
+	}
+
+retry:
+	node = find_level(head, key, level);
+	find_pos(node, key, &pos, &fill);
+	BUG_ON(node[pos].key == key);
+
+	if (fill == BTREE_NODES) {
+		/* need to split node */
+		struct btree_node *new;
+
+		new = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+		BUG_ON(!new);
+		if (!new)
+			return -ENOMEM;
+		err = btree_insert_level(head, node[BTREE_NODES/2 - 1].key, new,
+				level+1);
+		if (err) {
+			kfree(new);
+			return err;
+		}
+		for (i = 0; i < BTREE_NODES / 2; i++) {
+			new[i].key = node[i].key;
+			new[i].node = node[i].node;
+			node[i].key = node[i + BTREE_NODES/2].key;
+			node[i].node = node[i + BTREE_NODES/2].node;
+			node[i + BTREE_NODES/2].key = 0;
+			node[i + BTREE_NODES/2].node = NULL;
+		}
+		goto retry;
+	}
+	BUG_ON(fill >= BTREE_NODES);
+
+	/* shift and insert */
+	for (i = fill; i > pos; i--) {
+		node[i].key = node[i-1].key;
+		node[i].node = node[i-1].node;
+	}
+	node[pos].key = key;
+	node[pos].node = ptr;
+
+	return 0;
+}
+
+int btree_insert(struct btree_head *head, u64 key, void *ptr)
+{
+	BUG_ON(!ptr);
+	return btree_insert_level(head, key, ptr, 1);
+}
+
+static void *btree_remove_level(struct btree_head *head, u64 key, int level)
+{
+	struct btree_node *node;
+	int i, pos, fill;
+	void *ret;
+
+	if (level > head->height) {
+		/* we recursed all the way up */
+		head->height = 0;
+		head->node = NULL;
+		return NULL;
+	}
+
+	node = find_level(head, key, level);
+	find_pos(node, key, &pos, &fill);
+	if ((level == 1) && (node[pos].key != key))
+		return NULL;
+	ret = node[pos].node;
+
+	/* remove and shift */
+	for (i = pos; i < fill-1; i++) {
+		node[i].key = node[i+1].key;
+		node[i].node = node[i+1].node;
+	}
+	node[fill-1].key = 0;
+	node[fill-1].node = NULL;
+
+	if (fill-1 < BTREE_NODES/2) {
+		/*
+		 * At this point there *should* be code to either merge with
+		 * a neighboring node or steal some entries from it to preserve
+		 * the btree invariant of only having nodes with n/2..n
+		 * elements.
+		 *
+		 * As you can see, that code is left as an excercise to the
+		 * reader or anyone noticing severe performance problems in
+		 * very rare cases.
+		 *
+		 * As-is this code "implements" a method called lazy deletion,
+		 * which according to text books is relatively common in
+		 * databases and usually works quite well.
+		 * Not so usually, the btree can degrade into very long lists
+		 * of 1-element nodes and perform accordingly.
+		 */
+	}
+	if (fill-1 == 0) {
+		btree_remove_level(head, key, level+1);
+		kfree(node);
+	}
+
+	return ret;
+}
+
+void *btree_remove(struct btree_head *head, u64 key)
+{
+	void *ret;
+
+	if (key == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		ret = head->null_ptr;
+		head->null_ptr = NULL;
+		return ret;
+	}
+	if (head->height == 0)
+		return NULL;
+
+	return btree_remove_level(head, key, 1);
+}
+
+int btree_merge(struct btree_head *target, struct btree_head *victim)
+{
+	struct btree_node *node;
+	u64 key;
+	int err;
+
+	BUG_ON(target == victim);
+
+	if (!(target->node || target->null_ptr)) {
+		/* target is empty, just copy fields over */
+		target->null_ptr = victim->null_ptr;
+		target->node = victim->node;
+		target->height = victim->height;
+		btree_init(victim);
+		return 0;
+	}
+
+	for (;;) {
+		key = btree_last(victim);
+		node = btree_remove(victim, key);
+		if (!node)
+			break;
+		err = btree_insert(target, key, node);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static void __btree_for_each(struct btree_node *node, long opaque,
+		void (*func)(void *elem, long opaque, u64 key),  int reap,
+		int height)
+{
+	int i;
+
+	for (i = 0; i < BTREE_NODES && node[i].key; i++) {
+		if (height > 1)
+			__btree_for_each(node[i].node, opaque, func, reap,
+					height-1);
+		else
+			func(node[i].node, opaque, node[i].key);
+	}
+	if (reap)
+		kfree(node);
+}
+
+void btree_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 key))
+{
+	if (head->node)
+		__btree_for_each(head->node, opaque, func, 0, head->height);
+	if (head->null_ptr)
+		func(head->null_ptr, opaque, 0);
+}
+
+void btree_grim_visitor(struct btree_head *head, long opaque,
+		void (*func)(void *elem, long opaque, u64 key))
+{
+	if (head->node)
+		__btree_for_each(head->node, opaque, func, 1, head->height);
+	if (head->null_ptr)
+		func(head->null_ptr, opaque, 0);
+	btree_init(head);
+}

  parent reply	other threads:[~2008-04-03 17:28 UTC|newest]

Thread overview: 85+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-04-01 18:13 [patch 0/15] LogFS take five joern
2008-04-01 18:13 ` joern
2008-04-01 18:13 ` [patch 12/15] fs/logfs/segment.c joern
2008-04-01 18:13 ` [patch 2/15] fs/logfs/logfs_abi.h joern
2008-04-08  0:24   ` Arnd Bergmann
2008-04-08  0:24     ` Arnd Bergmann
2008-04-08  0:24     ` Arnd Bergmann
2008-04-08  9:39     ` Jörn Engel
2008-04-08  9:39       ` Jörn Engel
2008-04-08  9:39       ` Jörn Engel
2008-04-08 21:52       ` Andres Salomon
2008-04-08 21:52         ` Andres Salomon
2008-04-09 12:08         ` Jörn Engel
2008-04-09 12:08           ` Jörn Engel
2008-04-09 12:08           ` Jörn Engel
2008-04-01 18:13 ` [patch 15/15] fs/logfs/dev_mtd.c joern
2008-04-01 18:13 ` [patch 4/15] fs/logfs/compr.c joern
2008-04-10 14:13   ` Arnd Bergmann
2008-04-10 14:13     ` Arnd Bergmann
2008-04-10 14:13     ` Arnd Bergmann
2008-04-11 10:41     ` Jörn Engel
2008-04-11 10:41       ` Jörn Engel
2008-04-01 18:13 ` [patch 1/15] Makefiles and Kconfig joern
2008-04-07  8:28   ` Christian Borntraeger
2008-04-07  8:28     ` Christian Borntraeger
2008-04-07  8:40     ` Jörn Engel
2008-04-07  8:40       ` Jörn Engel
2008-04-07  8:40       ` Jörn Engel
2008-04-08  0:30   ` Arnd Bergmann
2008-04-08  0:30     ` Arnd Bergmann
2008-04-08  8:33     ` Jörn Engel
2008-04-08  8:33       ` Jörn Engel
2008-04-08  8:33       ` Jörn Engel
2008-04-08 13:41       ` Arnd Bergmann
2008-04-08 13:41         ` Arnd Bergmann
2008-04-08 13:41         ` Arnd Bergmann
2008-04-08 13:52         ` Jörn Engel
2008-04-08 13:52           ` Jörn Engel
2008-04-08 13:52           ` Jörn Engel
2008-04-01 18:13 ` [patch 13/15] fs/logfs/super.c joern
2008-04-01 18:13 ` [patch 7/15] fs/logfs/gc.c joern
2008-04-01 18:13 ` joern [this message]
2008-04-10 14:07   ` [patch 10/15] fs/logfs/memtree.c Arnd Bergmann
2008-04-10 14:07     ` Arnd Bergmann
2008-04-10 14:07     ` Arnd Bergmann
2008-04-11 10:37     ` Jörn Engel
2008-04-11 10:37       ` Jörn Engel
2008-04-11 10:37       ` Jörn Engel
2008-04-01 18:13 ` [patch 11/15] fs/logfs/readwrite.c joern
2008-04-01 18:13 ` [patch 5/15] fs/logfs/dir.c joern
2008-04-04  6:22   ` Kyungmin Park
2008-04-04  6:22     ` Kyungmin Park
2008-04-01 18:13 ` [patch 14/15] fs/logfs/dev_bdev.c joern
2008-04-01 18:13 ` [patch 9/15] fs/logfs/journal.c joern
2008-04-01 18:13 ` [patch 6/15] fs/logfs/file.c joern
2008-04-01 18:13 ` [patch 8/15] fs/logfs/inode.c joern
2008-04-04  6:57   ` Kyungmin Park
2008-04-04  6:57     ` Kyungmin Park
2008-04-07 11:12     ` Jörn Engel
2008-04-07 11:12       ` Jörn Engel
2008-04-07 11:12       ` Jörn Engel
2008-04-01 18:13 ` [patch 3/15] fs/logfs/logfs.h joern
2008-04-08  0:35   ` Arnd Bergmann
2008-04-08  0:35     ` Arnd Bergmann
2008-04-08  0:35     ` Arnd Bergmann
2008-04-08  9:41     ` Jörn Engel
2008-04-08  9:41       ` Jörn Engel
2008-04-08  9:41       ` Jörn Engel
2008-04-03 17:13 ` [patch 0/15] LogFS take five^Wsix Jörn Engel
2008-04-03 17:13   ` Jörn Engel
2008-04-03 17:13   ` Jörn Engel
2008-04-04 11:46 ` [patch 0/15] LogFS take five Jens Axboe
2008-04-04 11:46   ` Jens Axboe
2008-04-07  8:22   ` Jörn Engel
2008-04-07  8:22     ` Jörn Engel
2008-04-07  8:22     ` Jörn Engel
2008-04-07  8:28     ` Jens Axboe
2008-04-07  8:28       ` Jens Axboe
2008-04-07  8:28       ` Jens Axboe
2008-04-07  9:10       ` Jörn Engel
2008-04-07  9:10         ` Jörn Engel
2008-04-07  9:10         ` Jörn Engel
2008-04-07  9:17         ` Jens Axboe
2008-04-07  9:17           ` Jens Axboe
2008-04-07  9:17           ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080401181332.853833010@logfs.org \
    --to=joern@logfs.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mtd@lists.infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.