linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Jörn Engel" <joern@lazybastard.org>
To: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mtd@lists.infradead.org
Cc: akpm@osdl.org, Sam Ravnborg <sam@ravnborg.org>,
	John Stoffel <john@stoffel.org>,
	David Woodhouse <dwmw2@infradead.org>,
	Jamie Lokier <jamie@shareable.org>,
	Artem Bityutskiy <dedekind@infradead.org>, CaT <cat@zip.com.au>,
	Jan Engelhardt <jengelh@linux01.gwdg.de>,
	Evgeniy Polyakov <johnpol@2ka.mipt.ru>,
	David Weinehall <tao@acc.umu.se>, Arnd Bergmann <arnd@arndb.de>,
	Willy Tarreau <w@1wt.eu>, Kyle Moffett <mrmacman_g4@mac.com>,
	Dongjun Shin <djshin90@gmail.com>, Pavel Machek <pavel@ucw.cz>,
	Bill Davidsen <davidsen@tmr.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Albert Cahalan <acahalan@gmail.com>,
	Pekka Enberg <penberg@cs.helsinki.fi>,
	Roland Dreier <rdreier@cisco.com>,
	Ondrej Zajicek <santiago@crfreenet.org>,
	Ulisses Furquim <ulissesf@gmail.com>
Subject: [Patch 12/18] fs/logfs/memtree.c
Date: Sun, 3 Jun 2007 20:47:40 +0200	[thread overview]
Message-ID: <20070603184740.GM8952@lazybastard.org> (raw)
In-Reply-To: <20070603183845.GA8952@lazybastard.org>

--- /dev/null	2007-03-13 19:15:28.862769062 +0100
+++ linux-2.6.21logfs/fs/logfs/memtree.c	2007-06-03 19:18:57.000000000 +0200
@@ -0,0 +1,258 @@
+/*
+ * fs/logfs/memtree.c	- Simple In-memory B+Tree
+ *
+ * As should be obvious for Linux kernel code, license is GPLv2
+ *
+ * Copyright (c) 2007 Joern Engel
+ *
+ *
+ * This could possibly get moved to lib/.
+ *
+ * A relatively simple B+Tree implementation.  I have written it as a learning
+ * excercise to understand how B+Trees work.  Turned out to be useful as well.
+ *
+ * B+Trees can be used similar to Linux radix trees (which don't have anything
+ * in common with textbook radix trees, beware).  Prerequisite for them working
+ * well is that access to a random tree node is much faster than a large number
+ * of operations within each node.
+ *
+ * Disks have fulfilled the prerequite for a long time.  More recently DRAM
+ * has gained similar properties, as memory access times, when measured in cpu
+ * cycles, have increased.  Cacheline sizes have increased as well, which also
+ * helps B+Trees.
+ *
+ * Compared to radix trees, B+Trees are more efficient when dealing with a
+ * sparsely populated address space.  Between 25% and 50% of the memory is
+ * occupied with valid pointers.  When densely populated, radix trees contain
+ * ~98% pointers - hard to beat.  Very sparse radix trees contain only ~2%
+ * pointers.
+ *
+ * This particular implementation stores pointers identified by a long value.
+ * Storing NULL pointers is illegal, lookup will return NULL when no entry
+ * was found.
+ *
+ * Two tricks were used that are not commonly found in textbooks.  First, the
+ * lowest values are to the right, not to the left.  All used slots within a
+ * node are on the left, all unused slots contain NUL values.  Most operations
+ * simply loop once over all slots and terminate on the first NUL.
+ *
+ * Second trick is to special-case the key "0" or NUL.  As seen above, this
+ * value indicates an unused slot, so such a value should not be stored in the
+ * tree itself.  Instead it is stored in the null_ptr field in the btree_head.
+ */
+#include "logfs.h"
+
+/*
+ * Prerequisite of B+Trees performing well is that node lookup is much slower
+ * than a large number of operations within a node.  That can be true if the
+ * node size is identical to cacheline size.  All that is highly
+ * machine-dependent, just like the #define below is not.
+ *
+ * Patches to do something smarter are welcome.  Just beware that too small
+ * node with less than 8 slots have a bad fan-out and won't perform well
+ * either.
+ */
+#define BTREE_NODES 16	/* 32bit, 128 byte cacheline */
+
+struct btree_node {
+	long val;
+	struct btree_node *node;
+};
+
+void btree_init(struct btree_head *head)
+{
+	head->node = NULL;
+	head->height = 0;
+	head->null_ptr = NULL;
+}
+
+void *btree_lookup(struct btree_head *head, long val)
+{
+	int i, height = head->height;
+	struct btree_node *node = head->node;
+
+	if (val == 0)
+		return head->null_ptr;
+
+	if (height == 0)
+		return NULL;
+
+	for ( ; height > 1; height--) {
+		for (i=0; i<BTREE_NODES; i++)
+			if (node[i].val <= val)
+				break;
+		node = node[i].node;
+	}
+
+	for (i=0; i<BTREE_NODES; i++)
+		if (node[i].val == val)
+			return node[i].node;
+
+	return NULL;
+}
+
+static void find_pos(struct btree_node *node, long val, int *pos, int *fill)
+{
+	int i;
+
+	for (i=0; i<BTREE_NODES; i++)
+		if (node[i].val <= val)
+			break;
+	*pos = i;
+	for (i=*pos; i<BTREE_NODES; i++)
+		if (node[i].val == 0)
+			break;
+	*fill = i;
+}
+
+static struct btree_node *find_level(struct btree_head *head, long val,
+		int level)
+{
+	struct btree_node *node = head->node;
+	int i, height = head->height;
+
+	for ( ; height > level; height--) {
+		for (i=0; i<BTREE_NODES; i++)
+			if (node[i].val <= val)
+				break;
+		node = node[i].node;
+	}
+	return node;
+}
+
+static int btree_grow(struct btree_head *head)
+{
+	struct btree_node *node;
+
+	node = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+	if (head->node) {
+		node->val = head->node[BTREE_NODES-1].val;
+		node->node = head->node;
+	}
+	head->node = node;
+	head->height++;
+	return 0;
+}
+
+static int btree_insert_level(struct btree_head *head, long val, void *ptr,
+		int level)
+{
+	struct btree_node *node;
+	int i, pos, fill, err;
+
+	if (val == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		BUG_ON(level != 1);
+		head->null_ptr = ptr;
+		return 0;
+	}
+
+	if (head->height < level) {
+		err = btree_grow(head);
+		if (err)
+			return err;
+	}
+
+retry:
+	node = find_level(head, val, level);
+	find_pos(node, val, &pos, &fill);
+	BUG_ON(node[pos].val == val);
+
+	if (fill == BTREE_NODES) {
+		/* need to split node */
+		struct btree_node *new;
+
+		new = kcalloc(BTREE_NODES, sizeof(*node), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		err = btree_insert_level(head, node[BTREE_NODES/2 - 1].val, new,
+				level+1);
+		if (err) {
+			kfree(new);
+			return err;
+		}
+		for (i=0; i<BTREE_NODES/2; i++) {
+			new[i].val = node[i].val;
+			new[i].node = node[i].node;
+			node[i].val = node[i + BTREE_NODES/2].val;
+			node[i].node = node[i + BTREE_NODES/2].node;
+			node[i + BTREE_NODES/2].val = 0;
+			node[i + BTREE_NODES/2].node = NULL;
+		}
+		goto retry;
+	}
+	BUG_ON(fill >= BTREE_NODES);
+
+	/* shift and insert */
+	for (i=fill; i>pos; i--) {
+		node[i].val = node[i-1].val;
+		node[i].node = node[i-1].node;
+	}
+	node[pos].val = val;
+	node[pos].node = ptr;
+
+	return 0;
+}
+
+int btree_insert(struct btree_head *head, long val, void *ptr)
+{
+	return btree_insert_level(head, val, ptr, 1);
+}
+
+static int btree_remove_level(struct btree_head *head, long val, int level)
+{
+	struct btree_node *node;
+	int i, pos, fill;
+
+	if (val == 0) {
+		/* 0 identifies empty slots, so special-case this */
+		head->null_ptr = NULL;
+		return 0;
+	}
+
+	node = find_level(head, val, level);
+	find_pos(node, val, &pos, &fill);
+	if (level == 1)
+		BUG_ON(node[pos].val != val);
+
+	/* remove and shift */
+	for (i=pos; i<fill-1; i++) {
+		node[i].val = node[i+1].val;
+		node[i].node = node[i+1].node;
+	}
+	node[fill-1].val = 0;
+	node[fill-1].node = NULL;
+
+	if (fill-1 < BTREE_NODES/2) {
+		/*
+		 * At this point there *should* be code to either merge with
+		 * a neighboring node or steal some entries from it to preserve
+		 * the btree invariant of only having nodes with n/2..n
+		 * elements.
+		 *
+		 * As you can see, that code is left as an excercise to the
+		 * reader or anyone noticing severe performance problems in
+		 * very rare cases.
+		 *
+		 * As-is this code "implements" a method called lazy deletion,
+		 * which according to text books is relatively common in
+		 * databases and usually works quite well.
+		 * Not so usually, the btree can degrade into very long lists
+		 * of 1-element nodes and perform accordingly.
+		 */
+	}
+	if (fill-1 == 0) {
+		btree_remove_level(head, val, level+1);
+		kfree(node);
+		return 0;
+	}
+
+	return 0;
+}
+
+int btree_remove(struct btree_head *head, long val)
+{
+	return btree_remove_level(head, val, 1);
+}

  parent reply	other threads:[~2007-06-03 18:52 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-06-03 18:38 LogFS take four Jörn Engel
2007-06-03 18:40 ` [Patch 01/18] fs/Kconfig Jörn Engel
2007-06-03 18:40 ` [Patch 02/18] fs/Makefile Jörn Engel
2007-06-03 18:41 ` [Patch 03/18] fs/logfs/Makefile Jörn Engel
2007-06-03 18:42 ` [Patch 04/18] include/linux/logfs.h Jörn Engel
2007-06-03 21:42   ` Arnd Bergmann
2007-06-04  9:12     ` Jörn Engel
2007-06-04 13:38       ` David Woodhouse
2007-06-04 14:02         ` Jörn Engel
2007-06-05 15:49         ` Segher Boessenkool
2007-06-05 15:53           ` David Woodhouse
2007-06-05 18:49             ` Segher Boessenkool
2007-06-06  8:50               ` David Woodhouse
2007-06-06  8:59                 ` Andreas Schwab
2007-06-06 12:42                 ` Arnd Bergmann
2007-06-05 20:39           ` Bill Davidsen
2007-06-03 18:43 ` [Patch 05/18] fs/logfs/logfs.h Jörn Engel
2007-06-03 21:50   ` Arnd Bergmann
2007-06-04  8:17     ` Jan Engelhardt
2007-06-04  9:11     ` Jörn Engel
2007-06-06 11:29   ` Paulo Marques
2007-06-06 11:29     ` Jörn Engel
2007-06-03 18:43 ` [Patch 06/18] fs/logfs/compr.c Jörn Engel
2007-06-03 21:58   ` Arnd Bergmann
2007-06-04  8:54     ` Jörn Engel
2007-06-04 13:53       ` David Woodhouse
2007-06-03 18:44 ` [Patch 07/18] fs/logfs/dir.c Jörn Engel
2007-06-15  8:59   ` Evgeniy Polyakov
2007-06-15 11:57     ` Jörn Engel
2007-06-03 18:45 ` [Patch 08/18] fs/logfs/file.c Jörn Engel
2007-06-03 18:46 ` [Patch 09/18] fs/logfs/gc.c Jörn Engel
2007-06-03 22:07   ` Arnd Bergmann
2007-06-04  9:01     ` Jörn Engel
2007-06-15  9:03   ` Evgeniy Polyakov
2007-06-15 11:14     ` Jörn Engel
2007-06-15 13:03       ` Evgeniy Polyakov
2007-06-03 18:46 ` [Patch 10/18] fs/logfs/inode.c Jörn Engel
2007-06-10 17:24   ` Arnd Bergmann
2007-06-10 17:40     ` Jörn Engel
2007-06-11 23:28     ` Jörn Engel
2007-06-11 23:51       ` Arnd Bergmann
2007-06-11 23:57         ` Jörn Engel
2007-06-03 18:47 ` [Patch 11/18] fs/logfs/journal.c Jörn Engel
2007-06-03 18:47 ` Jörn Engel [this message]
2007-06-03 18:48 ` [Patch 13/18] fs/logfs/readwrite.c Jörn Engel
2007-06-03 18:48 ` [Patch 14/18] fs/logfs/segment.c Jörn Engel
2007-06-03 22:21   ` Arnd Bergmann
2007-06-04  9:07     ` Jörn Engel
2007-06-03 18:49 ` [Patch 15/18] fs/logfs/super.c Jörn Engel
2007-06-10 16:27   ` Arnd Bergmann
2007-06-10 17:38     ` Jörn Engel
2007-06-10 18:33       ` Arnd Bergmann
2007-06-10 19:10         ` Jörn Engel
2007-06-10 19:20           ` Willy Tarreau
2007-06-03 18:50 ` [Patch 16/18] fs/logfs/progs/fsck.c Jörn Engel
2007-06-03 18:50 ` [Patch 17/18] fs/logfs/progs/mkfs.c Jörn Engel
2007-06-03 18:51 ` [Patch 18/18] fs/logfs/Locking Jörn Engel
2007-06-03 19:17 ` LogFS take four Jan-Benedict Glaw
2007-06-03 19:19   ` Jörn Engel
2007-06-03 22:18 ` Arnd Bergmann
2007-06-04  9:05   ` Jörn Engel
2007-06-15  8:37 ` Evgeniy Polyakov
2007-06-15 11:10   ` Jörn Engel

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070603184740.GM8952@lazybastard.org \
    --to=joern@lazybastard.org \
    --cc=acahalan@gmail.com \
    --cc=akpm@osdl.org \
    --cc=arnd@arndb.de \
    --cc=cat@zip.com.au \
    --cc=davidsen@tmr.com \
    --cc=dedekind@infradead.org \
    --cc=djshin90@gmail.com \
    --cc=dwmw2@infradead.org \
    --cc=jamie@shareable.org \
    --cc=jengelh@linux01.gwdg.de \
    --cc=john@stoffel.org \
    --cc=johnpol@2ka.mipt.ru \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mtd@lists.infradead.org \
    --cc=mrmacman_g4@mac.com \
    --cc=pavel@ucw.cz \
    --cc=penberg@cs.helsinki.fi \
    --cc=rdreier@cisco.com \
    --cc=sam@ravnborg.org \
    --cc=santiago@crfreenet.org \
    --cc=tao@acc.umu.se \
    --cc=tglx@linutronix.de \
    --cc=ulissesf@gmail.com \
    --cc=w@1wt.eu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).