* Page cache using B-trees benchmark results
@ 2006-08-18 1:43 Vishal Patil
2006-08-18 13:52 ` Bill Davidsen
2006-08-18 16:25 ` Andi Kleen
0 siblings, 2 replies; 4+ messages in thread
From: Vishal Patil @ 2006-08-18 1:43 UTC (permalink / raw)
To: Linux Kernel Mailing List; +Cc: Andrea Arcangeli
[-- Attachment #1: Type: text/plain, Size: 595 bytes --]
Folks
I am attaching the benchmark results for Page Cache Implementation
using B-trees. I basically ran the tio (threaded i/o) benchmark
against my kernel (with the B-tree implementation) and the Linux
kernel shipped with FC5. Radix tree implementation is definately
better however the B-tree implementation did not suck that bad :)
Also I attaching a new patch which was used for measuring the
benchmarks. Also henceforth changes to the page will be tracked using
the projected hosted at http://code.google.com/p/btreepc
Thanks
- Vishal
--
Motivation will almost always beat mere talent.
[-- Attachment #2: btree.linux2.6.16.2.patch --]
[-- Type: text/x-patch, Size: 55283 bytes --]
diff -urN linux-2.6.16.2/mm/filemap.c linux-2.6.16.2-btree/mm/filemap.c
--- linux-2.6.16.2/mm/filemap.c 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/filemap.c 2006-08-17 21:11:21.000000000 -0400
@@ -112,10 +112,16 @@
*/
void __remove_from_page_cache(struct page *page)
{
+ struct page * bpage;
struct address_space *mapping = page->mapping;
+ unsigned long index = page->index;
- radix_tree_delete(&mapping->page_tree, page->index);
- page->mapping = NULL;
+ bpage = btree_delete(&mapping->btree,mapping->btree.root,index).val;
+ if ( bpage != page) {
+ panic("DEBUG: Deleting wrong page in remove from page"
+ "cache %d,0x%x\n",index,bpage);
+ }
+ page->mapping = NULL;
mapping->nrpages--;
pagecache_acct(-1);
}
@@ -395,12 +401,14 @@
int add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
-
- if (error == 0) {
+ int btree_error = btree_preload(gfp_mask & ~__GFP_HIGHMEM);
+
+ if (btree_error == 0) {
write_lock_irq(&mapping->tree_lock);
- error = radix_tree_insert(&mapping->page_tree, offset, page);
- if (!error) {
+ btree_error =
+ btree_insert(&mapping->btree,offset,page);
+
+ if (!btree_error) {
page_cache_get(page);
SetPageLocked(page);
page->mapping = mapping;
@@ -409,9 +417,11 @@
pagecache_acct(1);
}
write_unlock_irq(&mapping->tree_lock);
- radix_tree_preload_end();
+ if(btree_error == 0) {
+ btree_preload_end();
+ }
}
- return error;
+ return btree_error;
}
EXPORT_SYMBOL(add_to_page_cache);
@@ -522,7 +532,7 @@
struct page *page;
read_lock_irq(&mapping->tree_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = btree_lookup(&mapping->btree,offset);
if (page)
page_cache_get(page);
read_unlock_irq(&mapping->tree_lock);
@@ -539,7 +549,7 @@
struct page *page;
read_lock_irq(&mapping->tree_lock);
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = btree_lookup(&mapping->btree,offset);
if (page && TestSetPageLocked(page))
page = NULL;
read_unlock_irq(&mapping->tree_lock);
@@ -563,10 +573,9 @@
unsigned long offset)
{
struct page *page;
-
read_lock_irq(&mapping->tree_lock);
repeat:
- page = radix_tree_lookup(&mapping->page_tree, offset);
+ page = btree_lookup(&mapping->btree,offset);
if (page) {
page_cache_get(page);
if (TestSetPageLocked(page)) {
@@ -658,8 +667,10 @@
unsigned int ret;
read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup(&mapping->page_tree,
- (void **)pages, start, nr_pages);
+
+ ret = btree_gang_lookup(&mapping->btree,(void **)pages,
+ start,nr_pages);
+
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
read_unlock_irq(&mapping->tree_lock);
@@ -677,8 +688,9 @@
unsigned int ret;
read_lock_irq(&mapping->tree_lock);
- ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
+ ret = btree_gang_lookup_tag(&mapping->btree,
(void **)pages, *index, nr_pages, tag);
+
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
if (ret)
diff -urN linux-2.6.16.2/mm/page-writeback.c linux-2.6.16.2-btree/mm/page-writeback.c
--- linux-2.6.16.2/mm/page-writeback.c 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/page-writeback.c 2006-08-17 21:11:21.000000000 -0400
@@ -634,8 +634,16 @@
BUG_ON(mapping2 != mapping);
if (mapping_cap_account_dirty(mapping))
inc_page_state(nr_dirty);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
+ btree_tag_set(&mapping->btree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+
+ if (btree_tag_get(&mapping->btree,
+ page_index(page), PAGECACHE_TAG_DIRTY) != 1 ) {
+ panic("__set_page_dirty_nobuffers "
+ " Problem setting the tag\n");
+ }
+
+
}
write_unlock_irq(&mapping->tree_lock);
if (mapping->host) {
@@ -714,9 +722,16 @@
if (mapping) {
write_lock_irqsave(&mapping->tree_lock, flags);
if (TestClearPageDirty(page)) {
- radix_tree_tag_clear(&mapping->page_tree,
+ btree_tag_clear(&mapping->btree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+
+ if (btree_tag_get(&mapping->btree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY) == 1) {
+ panic("Problem clearing the tag\n");
+ }
+
write_unlock_irqrestore(&mapping->tree_lock, flags);
if (mapping_cap_account_dirty(mapping))
dec_page_state(nr_dirty);
@@ -769,10 +784,17 @@
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestClearPageWriteback(page);
- if (ret)
- radix_tree_tag_clear(&mapping->page_tree,
+ if (ret) {
+ btree_tag_clear(&mapping->btree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
+ if (btree_tag_get(&mapping->btree,
+ page_index(page),
+ PAGECACHE_TAG_WRITEBACK) == 1) {
+ panic("Problem clearing the tag\n");
+ }
+
+ }
write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestClearPageWriteback(page);
@@ -790,14 +812,29 @@
write_lock_irqsave(&mapping->tree_lock, flags);
ret = TestSetPageWriteback(page);
- if (!ret)
- radix_tree_tag_set(&mapping->page_tree,
+ if (!ret) {
+ btree_tag_set(&mapping->btree,
page_index(page),
PAGECACHE_TAG_WRITEBACK);
- if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->page_tree,
+ if (btree_tag_get(&mapping->btree,
+ page_index(page),
+ PAGECACHE_TAG_WRITEBACK) != 1) {
+ panic("test_set_page_writeback"
+ "Problem setting the tag\n");
+ }
+
+ }
+ if (!PageDirty(page)) {
+ btree_tag_clear(&mapping->btree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+ if (btree_tag_get(&mapping->btree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY) == 1) {
+ panic("Problem clearing the tag");
+ }
+
+ }
write_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
@@ -817,7 +854,7 @@
int ret;
read_lock_irqsave(&mapping->tree_lock, flags);
- ret = radix_tree_tagged(&mapping->page_tree, tag);
+ ret = btree_tagged(&mapping->btree,tag);
read_unlock_irqrestore(&mapping->tree_lock, flags);
return ret;
}
diff -urN linux-2.6.16.2/mm/readahead.c linux-2.6.16.2-btree/mm/readahead.c
--- linux-2.6.16.2/mm/readahead.c 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/readahead.c 2006-08-17 21:11:21.000000000 -0400
@@ -282,7 +282,8 @@
if (page_offset > end_index)
break;
- page = radix_tree_lookup(&mapping->page_tree, page_offset);
+ page = btree_lookup(&mapping->btree,page_offset);
+
if (page)
continue;
diff -urN linux-2.6.16.2/mm/swap_state.c linux-2.6.16.2-btree/mm/swap_state.c
--- linux-2.6.16.2/mm/swap_state.c 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/swap_state.c 2006-08-17 21:11:21.000000000 -0400
@@ -37,6 +37,7 @@
struct address_space swapper_space = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+ .btree = BTREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
.tree_lock = RW_LOCK_UNLOCKED,
.a_ops = &swap_aops,
.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
@@ -71,16 +72,21 @@
static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
gfp_t gfp_mask)
{
- int error;
+ int btree_error;
BUG_ON(PageSwapCache(page));
BUG_ON(PagePrivate(page));
- error = radix_tree_preload(gfp_mask);
- if (!error) {
+
+ btree_error = btree_preload(gfp_mask);
+
+ if (!btree_error) {
write_lock_irq(&swapper_space.tree_lock);
- error = radix_tree_insert(&swapper_space.page_tree,
+ if (!btree_error) {
+ btree_error = btree_insert(&swapper_space.btree,
entry.val, page);
- if (!error) {
+ }
+
+ if (!btree_error) {
page_cache_get(page);
SetPageLocked(page);
SetPageSwapCache(page);
@@ -89,9 +95,12 @@
pagecache_acct(1);
}
write_unlock_irq(&swapper_space.tree_lock);
- radix_tree_preload_end();
+
+ if (!btree_error) {
+ btree_preload_end();
+ }
}
- return error;
+ return btree_error;
}
static int add_to_swap_cache(struct page *page, swp_entry_t entry)
@@ -127,7 +136,10 @@
BUG_ON(PageWriteback(page));
BUG_ON(PagePrivate(page));
- radix_tree_delete(&swapper_space.page_tree, page_private(page));
+ if (btree_delete(&swapper_space.btree,swapper_space.btree.root,
+ page_private(page)).val != page) {
+ panic("DEBUG: Deleting wrong page from swap cache\n");
+ }
set_page_private(page, 0);
ClearPageSwapCache(page);
total_swapcache_pages--;
diff -urN linux-2.6.16.2/mm/vmscan.c linux-2.6.16.2-btree/mm/vmscan.c
--- linux-2.6.16.2/mm/vmscan.c 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/vmscan.c 2006-08-17 21:11:21.000000000 -0400
@@ -692,7 +692,7 @@
struct page *page, int nr_refs)
{
struct address_space *mapping = page_mapping(page);
- struct page **radix_pointer;
+ struct page * btree_pointer;
/*
* Avoid doing any of the following work if the page count
@@ -733,12 +733,12 @@
write_lock_irq(&mapping->tree_lock);
- radix_pointer = (struct page **)radix_tree_lookup_slot(
- &mapping->page_tree,
+ btree_pointer = (struct page *)btree_lookup_slot(
+ &mapping->btree,
page_index(page));
if (!page_mapping(page) || page_count(page) != nr_refs ||
- *radix_pointer != page) {
+ btree_pointer != page) {
write_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -759,7 +759,7 @@
set_page_private(newpage, page_private(page));
}
- *radix_pointer = newpage;
+ btree_pointer = newpage;
__put_page(page);
write_unlock_irq(&mapping->tree_lock);
diff -urN linux-2.6.16.2/lib/btree.c linux-2.6.16.2-btree/lib/btree.c
--- linux-2.6.16.2/lib/btree.c 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.16.2-btree/lib/btree.c 2006-08-17 21:11:36.000000000 -0400
@@ -0,0 +1,1291 @@
+/*
+ * btree.c
+ *
+ * Copyright (C) 2006 Vishal Patil (vishpat AT gmail DOT com)
+ *
+*/
+
+#include <linux/btree.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/radix-tree.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <linux/mm.h>
+
+typedef enum {left = -1,right = 1} position_t;
+
+typedef struct {
+ btree_node * node;
+ unsigned int index;
+}node_pos;
+
+struct btree_node {
+ struct btree_node * next; // Pointer used for linked list
+ bool leaf; // Used to indicate whether leaf or not
+ unsigned int nr_active; // Number of active keys
+ unsigned int level; // Level in the B-Tree
+ bt_key_val key_vals[2*BTREE_ORDER - 1]; // Array of keys and values
+ struct btree_node * children[2*BTREE_ORDER]; // Array of pointers to child nodes
+ unsigned long tags[BTREE_TAGS][BTREE_TAG_LONGS];
+ // Bitmap required by page
+ // cache
+ unsigned long padding[6];
+} ;
+
+/*
+* B-Tree node and key val cache
+*/
+static kmem_cache_t *btree_node_cachep;
+
+/*
+ * Per-cpu pool of preloaded nodes
+ */
+struct btree_preload {
+ int nr;
+ struct btree_node *nodes[BTREE_MAX_PATH];
+};
+DEFINE_PER_CPU(struct btree_preload, btree_preloads) = { 0, };
+
+static btree_node * allocate_btree_node (btree * btree,unsigned int order);
+static node_pos get_btree_node(btree * btree,unsigned long key);
+static bt_key_val remove_key_from_leaf(btree * btree, node_pos * node_pos);
+static btree_node * merge_nodes(btree * btree, btree_node * n1, bt_key_val kv ,
+ btree_node * n2);
+static void move_key(btree * btree, btree_node * node, unsigned int index,
+ position_t pos);
+static node_pos get_max_key_pos(btree * btree, btree_node * subtree);
+static node_pos get_min_key_pos(btree * btree, btree_node * subtree);
+static btree_node * merge_siblings(btree * btree, btree_node * parent,
+ unsigned int index,position_t pos);
+static unsigned int __lookup(btree * btree, void ** results,
+ unsigned long first_index,unsigned int max_items, int tag);
+static void print_single_node(btree *btree, btree_node * node);
+static void transfer_tags(btree_node * source_node, int source_index,
+ btree_node * dest_node,int dest_index);
+static void clear_tags(btree_node * node, int index);
+
+static void
+btree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) {
+ memset(node, 0, sizeof(struct btree_node));
+}
+
+void __init btree_init(void) {
+ btree_node_cachep = kmem_cache_create("btree_node",
+ sizeof(btree_node), 0,
+ SLAB_PANIC, btree_node_ctor, NULL);
+}
+
+unsigned long btree_key_fn (void * page) {
+ return (((struct page*)page)->index);
+}
+
+unsigned long btree_value_fn (unsigned long key) {
+ return *((unsigned long*)key);
+}
+
+void btree_print_fn (unsigned long key) {
+ printk("%lu ", *((unsigned long*)key));
+}
+
+static inline void tag_set(struct btree_node *node, int tag, int offset)
+{
+ __set_bit(offset, node->tags[tag]);
+}
+
+static inline void tag_clear(struct btree_node *node, int tag, int offset)
+{
+ __clear_bit(offset, node->tags[tag]);
+}
+
+static inline int tag_get(struct btree_node *node, int tag, int offset)
+{
+ return test_bit(offset, node->tags[tag]);
+}
+
+/*
+ * A modified form of binary search that searches for index position of the key
+ * In case the key is not found the index of the key lesser than the search key
+ * is returned
+ */
+static inline int search_key(btree_node * node, long key) {
+ int left_index = 0;
+ int right_index = node->nr_active - 1;
+ int mid_index = -1;
+
+ if (key > node->key_vals[right_index].key) {
+ return right_index + 1;
+ }
+
+ while (left_index <= right_index) {
+ mid_index = left_index + (right_index - left_index)/2;
+
+ if (key < node->key_vals[mid_index].key) {
+ right_index = mid_index - 1;
+ } else
+ if (key > node->key_vals[mid_index].key) {
+ left_index = mid_index + 1;
+ } else {
+ return mid_index;
+ }
+ }
+ return right_index + 1;
+}
+
+
+
+/*
+ * Load up this CPU's btree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail. On
+ * success, return zero, with preemption disabled. On error, return -ENOMEM
+ * with preemption not disabled.
+ */
+int btree_preload(gfp_t gfp_mask) {
+ struct btree_preload *btp;
+ struct btree_node *node;
+ int ret = -ENOMEM;
+
+ preempt_disable();
+ btp = &__get_cpu_var(btree_preloads);
+ while (btp->nr < ARRAY_SIZE(btp->nodes)) {
+ preempt_enable();
+ node = kmem_cache_alloc(btree_node_cachep, gfp_mask);
+ if (node == NULL)
+ goto out;
+ preempt_disable();
+ btp = &__get_cpu_var(btree_preloads);
+ if (btp->nr < ARRAY_SIZE(btp->nodes))
+ btp->nodes[btp->nr++] = node;
+ else
+ kmem_cache_free(btree_node_cachep, node);
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+/**
+* Function used to allocate memory for the btree node
+* @param btree The btree
+* @param order Order of the B-Tree
+* @return The allocated B-tree node
+*/
+static btree_node * allocate_btree_node (btree * btree,unsigned int order) {
+ btree_node * node;
+
+ // Allocate memory for the node
+ node = kmem_cache_alloc(btree_node_cachep,btree->gfp_mask);
+
+ // If this is NULL get preloaded node
+ if (node == NULL && !(btree->gfp_mask & __GFP_WAIT)) {
+ struct btree_preload *btp;
+
+ btp = &__get_cpu_var(btree_preloads);
+ if (btp->nr) {
+ node = btp->nodes[btp->nr - 1];
+ btp->nodes[btp->nr - 1] = NULL;
+ btp->nr--;
+ }
+ }
+
+ // Initialize the number of active nodes
+ node->nr_active = 0;
+
+ // Use to determine whether it is a leaf
+ node->leaf = true;
+
+ // Use to determine the level in the tree
+ node->level = 0;
+
+ //Initialize the linked list pointer to NULL
+ node->next = NULL;
+
+ return node;
+}
+
+/**
+* Function used to free the memory allocated to the b-tree
+* @param node The node to be freed
+* @param order Order of the B-Tree
+* @return The allocated B-tree node
+*/
+static int free_btree_node (btree_node * node) {
+ kmem_cache_free(btree_node_cachep,node);
+ return 0;
+}
+
+/**
+* Used to split the child node and adjust the parent so that
+* it has two children
+* @param parent Parent Node
+* @param index Index of the child node
+* @param child Full child node
+*
+*/
+static void btree_split_child(btree * btree, btree_node * parent,
+ unsigned int index,
+ btree_node * child) {
+ int i = 0;
+ unsigned int order = btree->order;
+
+ btree_node * new_child = allocate_btree_node(btree,btree->order);
+ new_child->leaf = child->leaf;
+ new_child->level = child->level;
+ new_child->nr_active = btree->order - 1;
+
+ // Copy the higher order keys to the new child
+ for(i=0;i<order - 1;i++) {
+ new_child->key_vals[i] = child->key_vals[i + order];
+ transfer_tags(child,i + order,new_child,i);
+
+ if(!child->leaf) {
+ new_child->children[i] =
+ child->children[i + order];
+ }
+ }
+
+ // Copy the last child pointer
+ if(!child->leaf) {
+ new_child->children[i] =
+ child->children[i + order];
+ }
+
+ child->nr_active = order - 1;
+
+ for(i = parent->nr_active + 1;i > index + 1;i--) {
+ parent->children[i] = parent->children[i - 1];
+ }
+ parent->children[index + 1] = new_child;
+
+ for(i = parent->nr_active;i > index;i--) {
+ parent->key_vals[i] = parent->key_vals[i - 1];
+ transfer_tags(parent,i - 1,parent,i);
+ }
+
+ parent->key_vals[index] = child->key_vals[order - 1];
+ transfer_tags(child,order - 1,parent,index);
+ parent->nr_active++;
+
+}
+
+/**
+* Used to insert a key in the non-full node
+* @param btree The btree
+* @param node The node to which the key will be added
+* @param the key value pair
+* @return void
+*/
+
+static int btree_insert_nonfull (btree * btree, btree_node * parent_node,
+ bt_key_val * key_val) {
+
+ unsigned long key = key_val->key;
+ int i,j,index;
+ btree_node * child;
+ btree_node * node = parent_node;
+
+insert: i = node->nr_active - 1;
+ if(node->leaf) {
+ while(i >= 0 && key < node->key_vals[i].key) {
+ node->key_vals[i + 1] = node->key_vals[i];
+ transfer_tags(node,i,node,i + 1);
+ i--;
+ }
+
+ node->key_vals[i + 1].key = key;
+ node->key_vals[i + 1].val = key_val->val;
+
+ // Identical key found
+ if (i >= 0 &&
+ key == node->key_vals[i].key) {
+ for (j = i + 1; j <node->nr_active; j++ ) {
+ node->key_vals[j] =
+ node->key_vals[j + 1];
+ transfer_tags(node,j,node,j + 1);
+ }
+ return -EEXIST;
+ } else
+ node->nr_active++;
+ } else {
+ while (i >= 0 && key < node->key_vals[i].key) {
+ i--;
+ }
+
+ // Identical key found
+ if (i >= 0 && key == node->key_vals[i].key) {
+ return -EEXIST;
+ }
+
+ i++;
+ child = node->children[i];
+
+ if(child->nr_active == 2*btree->order - 1) {
+
+ // Check for identical key here
+ index = search_key(child, key);
+ if (key == child->key_vals[index].key) {
+ return -EEXIST;
+ }
+
+ btree_split_child(btree,node,i,child);
+ if(key >
+ node->key_vals[i].key) {
+ i++;
+ }
+ }
+ node = node->children[i];
+ goto insert;
+ }
+ return 0;
+}
+
+
+/**
+* Function used to insert node into a B-Tree
+* @param root Root of the B-Tree
+* @param node The node to be inserted
+* @param compare Function used to compare the two nodes of the tree
+* @return success or failure
+*/
+int btree_insert(btree * btree, unsigned long key, void * val) {
+ btree_node * rnode = NULL;
+ bt_key_val key_val;
+ int ret = -1;
+
+ key_val.key = key;
+ key_val.val = val;
+
+ // During the B-tree initialization
+ if(unlikely(btree->root == NULL)) {
+ btree->root = allocate_btree_node(btree,btree->order);
+ }
+ rnode = btree->root;
+
+ if(rnode->nr_active == (2*btree->order - 1)) {
+ btree_node * new_root;
+ new_root = allocate_btree_node(btree,btree->order);
+ new_root->level = btree->root->level + 1;
+ btree->root = new_root;
+ new_root->leaf = false;
+ new_root->nr_active = 0;
+ new_root->children[0] = rnode;
+ btree_split_child(btree,new_root,0,rnode);
+ ret = btree_insert_nonfull(btree,new_root,&key_val);
+ } else
+ ret = btree_insert_nonfull(btree,rnode,&key_val);
+ return ret;
+}
+EXPORT_SYMBOL(btree_insert);
+
+/**
+* Used to get the position of the MAX key within the subtree
+* @param btree The btree
+* @param subtree The subtree to be searched
+* @return The node containing the key and position of the key
+*/
+static node_pos get_max_key_pos(btree * btree, btree_node * subtree) {
+ node_pos node_pos;
+ btree_node * node = subtree;
+
+ node_pos.node = NULL;
+ node_pos.index = 0;
+
+ while(true) {
+ if(node == NULL) {
+ break;
+ }
+
+ if(node->leaf) {
+ node_pos.node = node;
+ node_pos.index = node->nr_active - 1;
+ return node_pos;
+ } else {
+ node_pos.node = node;
+ node_pos.index = node->nr_active - 1;
+ node = node->children[node->nr_active];
+ }
+ }
+ return node_pos;
+}
+
+/**
+* Used to get the position of the MAX key within the subtree
+* @param btree The btree
+* @param subtree The subtree to be searched
+* @return The node containing the key and position of the key
+*/
+static node_pos get_min_key_pos(btree * btree, btree_node * subtree) {
+ node_pos node_pos;
+ btree_node * node = subtree;
+
+ node_pos.node = NULL;
+ node_pos.index = 0;
+
+ while(true) {
+ if(node == NULL) {
+ break;
+ }
+
+ if(node->leaf) {
+ node_pos.node = node;
+ node_pos.index = 0;
+ return node_pos;
+ } else {
+ node_pos.node = node;
+ node_pos.index = 0;
+ node = node->children[0];
+ }
+ }
+ return node_pos;
+}
+
+/**
+* Merge nodes n1 and n2 (case 3b from Cormen)
+* @param btree The btree
+* @param node The parent node
+* @param index of the child
+* @param pos left or right
+* @return none
+*/
+static btree_node * merge_siblings(btree * btree, btree_node * parent, unsigned int index ,
+ position_t pos) {
+ unsigned int j;
+ btree_node * new_node;
+ btree_node * n1, * n2;
+
+ if (index == (parent->nr_active)) {
+ index--;
+ n1 = parent->children[parent->nr_active - 1];
+ n2 = parent->children[parent->nr_active];
+ } else {
+ n1 = parent->children[index];
+ n2 = parent->children[index + 1];
+ }
+
+ //Merge the current node with the left node
+ new_node = allocate_btree_node(btree,btree->order);
+ new_node->level = n1->level;
+ new_node->leaf = n1->leaf;
+
+ for(j=0;j<btree->order - 1; j++) {
+ new_node->key_vals[j] = n1->key_vals[j];
+ transfer_tags(n1,j,new_node,j);
+ new_node->children[j] = n1->children[j];
+ }
+
+ new_node->key_vals[btree->order - 1] = parent->key_vals[index];
+ transfer_tags(parent,index,new_node,btree->order - 1);
+ new_node->children[btree->order - 1] = n1->children[btree->order - 1];
+
+ for(j=0;j<btree->order - 1; j++) {
+ new_node->key_vals[j + btree->order] = n2->key_vals[j];
+ transfer_tags(n2,j,new_node,j + btree->order);
+ new_node->children[j + btree->order] = n2->children[j];
+ }
+ new_node->children[2*btree->order - 1] = n2->children[btree->order - 1];
+
+ parent->children[index] = new_node;
+
+ for(j = index;j<parent->nr_active;j++) {
+ parent->key_vals[j] = parent->key_vals[j + 1];
+ transfer_tags(parent,j + 1,parent,j);
+ parent->children[j + 1] = parent->children[j + 2];
+ }
+
+ new_node->nr_active = n1->nr_active + n2->nr_active + 1;
+ parent->nr_active--;
+
+ free_btree_node(n1);
+ free_btree_node(n2);
+
+ if (parent->nr_active == 0 && btree->root == parent) {
+ free_btree_node(parent);
+ btree->root = new_node;
+ if(new_node->level)
+ new_node->leaf = false;
+ else
+ new_node->leaf = true;
+ }
+
+ return new_node;
+}
+
+/**
+* Move the key from node to another
+* @param btree The B-Tree
+* @param node The parent node
+* @param index of the key to be moved done
+* @param pos the position of the child to receive the key
+* @return none
+*/
+static void move_key(btree * btree, btree_node * node, unsigned int index, position_t pos) {
+ btree_node * lchild;
+ btree_node * rchild;
+ unsigned int i;
+
+ if(pos == right) {
+ index--;
+ }
+ lchild = node->children[index];
+ rchild = node->children[index + 1];
+
+ // Move the key from the parent to the left child
+ if(pos == left) {
+ lchild->key_vals[lchild->nr_active] = node->key_vals[index];
+ transfer_tags(node,index,lchild,lchild->nr_active);
+
+ lchild->children[lchild->nr_active + 1] = rchild->children[0];
+ rchild->children[0] = NULL;
+ lchild->nr_active++;
+
+ node->key_vals[index] = rchild->key_vals[0];
+ transfer_tags(rchild,0,node,index);
+
+ for(i=0;i<rchild->nr_active - 1;i++) {
+ rchild->key_vals[i] = rchild->key_vals[i + 1];
+ transfer_tags(rchild,i + 1,rchild,i);
+ rchild->children[i] = rchild->children[i + 1];
+ }
+ rchild->children[rchild->nr_active - 1] =
+ rchild->children[rchild->nr_active];
+ rchild->nr_active--;
+ } else {
+ // Move the key from the parent to the right child
+ for(i=rchild->nr_active;i > 0 ; i--) {
+ rchild->key_vals[i] = rchild->key_vals[i - 1];
+ transfer_tags(rchild,i - 1,rchild,i);
+ rchild->children[i + 1] = rchild->children[i];
+ }
+ rchild->children[1] = rchild->children[0];
+ rchild->children[0] = NULL;
+
+ rchild->key_vals[0] = node->key_vals[index];
+ transfer_tags(node,index,rchild,0);
+
+ rchild->children[0] = lchild->children[lchild->nr_active];
+ lchild->children[lchild->nr_active] = NULL;
+
+ node->key_vals[index] = lchild->key_vals[lchild->nr_active - 1];
+ transfer_tags(lchild,lchild->nr_active - 1,node,index);
+
+ lchild->nr_active--;
+ rchild->nr_active++;
+ }
+}
+
+/**
+* Merge nodes n1 and n2
+* @param n1 First node
+* @param n2 Second node
+* @return combined node
+*/
+static btree_node * merge_nodes(btree * btree, btree_node * n1, bt_key_val kv,
+ btree_node * n2) {
+ btree_node * new_node;
+ unsigned int i;
+
+ new_node = allocate_btree_node(btree,btree->order);
+ new_node->leaf = true;
+
+ for(i=0;i<n1->nr_active;i++) {
+ new_node->key_vals[i] = n1->key_vals[i];
+ transfer_tags(n1,i,new_node,i);
+ new_node->children[i] = n1->children[i];
+ }
+ new_node->children[n1->nr_active] = n1->children[n1->nr_active];
+ new_node->key_vals[n1->nr_active] = kv;
+
+ for(i=0;i<n2->nr_active;i++) {
+ new_node->key_vals[i + n1->nr_active + 1] = n2->key_vals[i];
+ transfer_tags(n2,i,new_node,i + n1->nr_active + 1);
+ new_node->children[i + n1->nr_active + 1] = n2->children[i];
+ }
+ new_node->children[2*btree->order - 1] = n2->children[n2->nr_active];
+
+ new_node->nr_active = n1->nr_active + n2->nr_active + 1;
+ new_node->leaf = n1->leaf;
+ new_node->level = n1->level;
+
+ free_btree_node(n1);
+ free_btree_node(n2);
+
+ return new_node;
+}
+
+/**
+* Used to remove a key from the B-tree node
+* @param btree The btree
+* @param node The node from which the key is to be removed
+* @param key The key to be removed
+* @return 0 on success -1 on error
+*/
+
+bt_key_val remove_key_from_leaf(btree * btree, node_pos * node_pos) {
+ unsigned int keys_max = 2*btree->order - 1;
+ unsigned int i;
+ bt_key_val key_val;
+ btree_node * node = node_pos->node;
+
+ key_val.key = -1;
+ key_val.val = NULL;
+
+ if(node->leaf == false) {
+ return key_val;
+ }
+
+ key_val = node->key_vals[node_pos->index];
+
+ for(i=node_pos->index;i< keys_max - 1;i++) {
+ node->key_vals[i] = node->key_vals[i + 1];
+ transfer_tags(node,i + 1,node,i);
+ }
+
+ node->nr_active--;
+
+ if(node->nr_active == 0 ) {
+ if (btree->root == node) {
+ btree->root = NULL;
+ }
+ free_btree_node(node);
+ }
+ return key_val;
+}
+
+/**
+* Function used to remove a node from a B-Tree
+* @param btree The B-Tree
+* @param key Key of the node to be removed
+* @param value function to map the key to an unique integer value
+* @param compare Function used to compare the two nodes of the tree
+* @return The removed key value pair
+*/
+
+bt_key_val btree_delete(btree * btree,btree_node * subtree,unsigned long key) {
+ unsigned int i,index;
+ btree_node * node = NULL, * rsibling, *lsibling;
+ btree_node * comb_node, * parent;
+ node_pos sub_node_pos;
+ node_pos node_pos;
+ bt_key_val key_val, removed_kv,to_remove_kv;
+ unsigned long kv = key;
+
+ node = subtree;
+ parent = NULL;
+ removed_kv.key = -1;
+ removed_kv.val = NULL;
+
+ // Empty subtree
+ if (node == NULL) {
+ return removed_kv;
+ }
+
+del_loop:for (i = 0;;i = 0) {
+
+ //If there are no keys simply return
+ if(!node->nr_active)
+ return removed_kv;
+
+ // Fix the index of the key greater than or equal
+ // to the key that we would like to search
+ i = search_key(node,kv);
+ index = i;
+
+ // If we find such key break
+ if(i < node->nr_active &&
+ kv == node->key_vals[i].key) {
+ break;
+ }
+ if(node->leaf)
+ return removed_kv;
+
+ //Store the parent node
+ parent = node;
+
+ // To get a child node
+ node = node->children[i];
+
+ //If NULL not found
+ if (node == NULL)
+ return removed_kv;
+
+ if (index == (parent->nr_active)) {
+ lsibling = parent->children[parent->nr_active - 1];
+ rsibling = NULL;
+ } else if (index == 0) {
+ lsibling = NULL;
+ rsibling = parent->children[1];
+ } else {
+ lsibling = parent->children[i - 1];
+ rsibling = parent->children[i + 1];
+ }
+
+ if (node->nr_active == btree->order - 1 && parent) {
+ // The current node has (t - 1) keys but the right sibling has > (t - 1)
+ // keys
+ if (rsibling && (rsibling->nr_active > btree->order - 1)) {
+ move_key(btree,parent,i,left);
+ } else
+ // The current node has (t - 1) keys but the left sibling has (t - 1)
+ // keys
+ if (lsibling && (lsibling->nr_active > btree->order - 1)) {
+ move_key(btree,parent,i,right);
+ } else
+ // Left sibling has (t - 1) keys
+ if(lsibling && (lsibling->nr_active == btree->order - 1)) {
+ node = merge_siblings(btree,parent,i,left);
+ } else
+ // Right sibling has (t - 1) keys
+ if(rsibling && (rsibling->nr_active == btree->order - 1)) {
+ node = merge_siblings(btree,parent,i,right);
+ }
+ }
+ }
+
+ //Case 1 : The node containing the key is found and is the leaf node.
+ //Also the leaf node has keys greater than the minimum required.
+ //Simply remove the key
+ if(node->leaf && (node->nr_active > btree->order - 1)) {
+ node_pos.node = node;
+ node_pos.index = index;
+ return remove_key_from_leaf(btree,&node_pos);
+ }
+
+ //If the leaf node is the root permit deletion even if the number of keys is
+ //less than (t - 1)
+ if(node->leaf && (node == btree->root)) {
+ node_pos.node = node;
+ node_pos.index = index;
+ return remove_key_from_leaf(btree,&node_pos);
+ }
+
+
+ //Case 2: The node containing the key is found and is an internal node
+ if(node->leaf == false) {
+ if(node->children[index]->nr_active > btree->order - 1 ) {
+ to_remove_kv = node->key_vals[index];
+
+ sub_node_pos =
+ get_max_key_pos(btree,node->children[index]);
+ key_val =
+ sub_node_pos.node->key_vals[sub_node_pos.index];
+ node->key_vals[index] = key_val;
+ transfer_tags(node,index,sub_node_pos.node,
+ sub_node_pos.index);
+
+ sub_node_pos.node->key_vals[sub_node_pos.index] =
+ to_remove_kv;
+ node = node->children[index];
+ goto del_loop;
+
+ } else if ((node->children[index + 1]->nr_active > btree->order - 1) ) {
+ to_remove_kv = node->key_vals[index];
+
+ sub_node_pos =
+ get_min_key_pos(btree,node->children[index + 1]);
+ key_val = sub_node_pos.node->key_vals[sub_node_pos.index];
+ node->key_vals[index] = key_val;
+ transfer_tags(node,index,sub_node_pos.node,
+ sub_node_pos.index);
+
+ sub_node_pos.node->key_vals[sub_node_pos.index] =
+ to_remove_kv;
+
+ node = node->children[index + 1];
+ goto del_loop;
+
+ } else if (
+ node->children[index]->nr_active == btree->order - 1 &&
+ node->children[index + 1]->nr_active == btree->order - 1) {
+
+ comb_node = merge_nodes(btree,node->children[index],
+ node->key_vals[index],
+ node->children[index + 1]);
+ node->children[index] = comb_node;
+
+ for(i=index + 1;i<node->nr_active;i++) {
+ node->children[i] = node->children[i + 1];
+ node->key_vals[i - 1] = node->key_vals[i];
+ transfer_tags(node,i,node,i - 1);
+ }
+ node->nr_active--;
+ if (node->nr_active == 0 && btree->root == node) {
+ free_btree_node(node);
+ btree->root = comb_node;
+ }
+ node = comb_node;
+ goto del_loop;
+ }
+ }
+
+ // Case 3:
+ // In this case start from the top of the tree and continue
+ // moving to the leaf node making sure that each node that
+ // we encounter on the way has atleast 't' (order of the tree)
+ // keys
+ if(node->leaf && (node->nr_active > btree->order - 1)) {
+ node_pos.node = node;
+ node_pos.index = index;
+ return remove_key_from_leaf(btree,&node_pos);
+ }
+
+ return removed_kv;
+}
+
+/**
+* Function used to get the node containing the given key
+* @param btree The btree to be searched
+* @param key The the key to be searched
+* @return The node and position of the key within the node
+*/
+node_pos get_btree_node(btree * btree,unsigned long key) {
+ node_pos kp;
+ unsigned long key_val = key;
+ btree_node * node;
+ unsigned int i = 0;
+
+ node = btree->root;
+ kp.node = NULL;
+ kp.index = 0;
+
+ // Empty tree
+ if (node == NULL) {
+ return kp;
+ }
+
+ for (;;i = 0) {
+
+ // Fix the index of the key greater than or equal
+ // to the key that we would like to search
+ i = search_key(node,key_val);
+
+ // If we find such key return the key-value pair
+ if(i < node->nr_active &&
+ key_val == node->key_vals[i].key) {
+ kp.node = node;
+ kp.index = i;
+ return kp;
+ }
+
+ // If the node is leaf and if we did not find the key
+ // return NULL
+ if(node->leaf) {
+ return kp;
+ }
+
+ // To got a child node
+ node = node->children[i];
+ }
+ return kp;
+
+}
+
+/**
+* Used to destory btree
+* @param btree The B-tree
+* @return none
+*/
+void btree_destroy(btree * btree) {
+ int i = 0;
+ unsigned int current_level;
+
+ btree_node * head, * tail, * node;
+ btree_node * child, * del_node;
+
+ node = btree->root;
+ current_level = node->level;
+ head = node;
+ tail = node;
+
+ while(true) {
+ if(head == NULL) {
+ break;
+ }
+ if (head->level < current_level) {
+ current_level = head->level;
+ }
+
+ if(head->leaf == false) {
+ for(i = 0 ; i < head->nr_active + 1; i++) {
+ child = head->children[i];
+ tail->next = child;
+ tail = child;
+ child->next = NULL;
+ }
+ }
+ del_node = head;
+ head = head->next;
+ free_btree_node(del_node);
+ }
+
+}
+
+int btree_tagged(btree * btree, int tag) {
+ int i = 0;
+ btree_node * head, * tail;
+ btree_node * child;
+
+ head = btree->root;
+ tail = btree->root;
+
+ while(true) {
+ if(head == NULL) {
+ break;
+ }
+
+ for (i = 0; i < head->nr_active; i++ ) {
+
+ if (tag != -1 && tag_get(head,tag,i))
+ return 1;
+
+ }
+
+ if(head->leaf == false) {
+ for(i = 0 ; i < head->nr_active + 1; i++) {
+ child = head->children[i];
+ tail->next = child;
+ tail = child;
+ child->next = NULL;
+ }
+ }
+ head = head->next;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(btree_tagged);
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct btree_node *node, int tag)
+{
+ int idx;
+ for (idx = 0; idx < BTREE_TAG_LONGS; idx++) {
+ if (node->tags[tag][idx])
+ return 1;
+ }
+ return 0;
+}
+
+static void clear_tags(btree_node * node, int index) {
+ int tag;
+ for (tag = 0;tag< BTREE_TAGS; tag++) {
+ tag_clear(node,tag,index);
+ }
+}
+
+static void transfer_tags(btree_node * source_node, int source_index,
+ btree_node * dest_node,int dest_index) {
+
+ int tag;
+ for (tag = 0;tag< BTREE_TAGS; tag++) {
+ if (tag_get(source_node,tag,source_index)) {
+ tag_set(dest_node,tag,dest_index);
+ tag_clear(source_node,tag,source_index);
+ } else {
+ tag_clear(dest_node,tag,dest_index);
+ }
+ }
+}
+
+static unsigned int __lookup(btree * btree, void ** results,
+ unsigned long first_index,unsigned int max_items, int tag) {
+
+ int i = 0, j = 0;
+ int ret = 0;
+ unsigned long key;
+ btree_node * head, * tail;
+ btree_node * child;
+
+ head = btree->root;
+ tail = btree->root;
+
+ while(true) {
+ if(head == NULL) {
+ break;
+ }
+
+ if ((head->key_vals[head->nr_active - 1].key
+ >= first_index) && (ret < max_items)) {
+ for (i = 0; i < head->nr_active; i++ ) {
+
+ if (tag != -1 && !tag_get(head,tag,i))
+ continue;
+
+ if ((key = head->key_vals[i].key)
+ >= first_index) {
+ // Insertion sort
+ for (j = (ret - 1);j>=0;j--) {
+ if (btree->key(results[j]) > key) {
+ results[j + 1] =
+ results[j];
+ } else {
+ break;
+ }
+ }
+
+ results[++j] = head->key_vals[i].val;
+ ret++;
+
+ if(ret == max_items) {
+ break;
+ }
+ }
+ }
+ }
+
+ if(head->leaf == false) {
+ for(i = 0 ; i < head->nr_active + 1; i++) {
+ child = head->children[i];
+ tail->next = child;
+ tail = child;
+ child->next = NULL;
+ }
+ }
+ head = head->next;
+ }
+ return ret;
+}
+
+/**
+* Perform multiple lookup on the B-Tree
+* @param btree The btree
+* @param results where the results will be placed
+* @param first_index start the lookup from this tree
+* @param max_items Place upto these many items in results
+* @return The number of items found
+*/
+unsigned int btree_gang_lookup(btree * btree, void ** results,
+ unsigned long first_index,unsigned int max_items) {
+ return __lookup(btree,results,first_index,max_items, -1);
+}
+
+
+EXPORT_SYMBOL(btree_gang_lookup);
+
+/**
+* Perform multiple lookup on the B-Tree
+* @param btree The btree
+* @param results where the results will be placed
+* @param first_index start the lookup from this tree
+* @param max_items Place upto these many items in results
+* @return The number of items found
+*/
+unsigned int btree_gang_lookup_tag(btree * btree, void ** results,
+ unsigned long first_index,unsigned int max_items,int tag) {
+ return __lookup(btree,results,first_index,max_items, tag);
+}
+
+
+EXPORT_SYMBOL(btree_gang_lookup_tag);
+
+
+/**
+* Function used to search a node in a B-Tree
+* @param btree The B-tree to be searched
+* @param key Key of the node to be search
+* @return The key-value pair
+*/
+void * btree_lookup(btree * btree,unsigned long key) {
+
+ bt_key_val key_val;
+ node_pos kp;
+
+ if (btree->root == NULL) {
+ return NULL;
+ }
+
+ kp = get_btree_node(btree,key);
+ key_val.val = NULL;
+
+ if(kp.node) {
+ key_val = kp.node->key_vals[kp.index];
+ }
+ return key_val.val;
+}
+EXPORT_SYMBOL(btree_lookup);
+
+/**
+* Function used to get the slot in a node of a B-Tree
+* @param btree The B-tree to be searched
+* @param key Key of the node to be search
+* @return The key-value pair
+*/
+bt_key_val * btree_lookup_slot(btree * btree,unsigned long key) {
+
+ bt_key_val * key_val = NULL;
+ node_pos kp;
+
+ if (btree->root == NULL) {
+ return NULL;
+ }
+
+ kp = get_btree_node(btree,key);
+
+ if(kp.node) {
+ key_val = &kp.node->key_vals[kp.index];
+ }
+ return key_val;
+}
+EXPORT_SYMBOL(btree_lookup_slot);
+
+/**
+* Used to set the tag for a node
+* @param btree The btree
+* @param key The key of the page for which the tag is to be set
+* @param tag The tag to be set
+* @return The btree key and val for which the tag is set
+*/
+
+bt_key_val * btree_tag_set(btree * btree,unsigned long key, int tag) {
+ bt_key_val * key_val = NULL;
+ node_pos kp = get_btree_node(btree,key);
+
+ if(kp.node) {
+ key_val = &kp.node->key_vals[kp.index];
+ if(!tag_get(kp.node,tag,kp.index))
+ tag_set(kp.node,tag,kp.index);
+ }
+ BUG_ON(key_val == NULL);
+ return key_val;
+}
+EXPORT_SYMBOL(btree_tag_set);
+
+/**
+* Used to clear the tag for a node
+* @param btree The btree
+* @param key The key of the page for which the tag is to be set
+* @param tag The tag to be set
+* @return The btree key and val for which the tag is cleared
+*/
+
+bt_key_val * btree_tag_clear(btree * btree,unsigned long key, int tag) {
+ bt_key_val * key_val = NULL;
+ node_pos kp = get_btree_node(btree,key);
+
+ if(kp.node) {
+ key_val = &kp.node->key_vals[kp.index];
+ if(tag_get(kp.node,tag,kp.index))
+ tag_clear(kp.node,tag,kp.index);
+ }
+ BUG_ON(key_val == NULL);
+ return key_val;
+}
+EXPORT_SYMBOL(btree_tag_clear);
+
+/**
+* Used to verify whether a tag is set
+* @param btree The btree
+* @param key The key
+* @param tag The tag
+* @return
+* 0: tag not present
+* 1: tag present, set
+* -1: tag present, unset
+*/
+int btree_tag_get(btree * btree,unsigned long key, int tag) {
+ int ret = 0;
+ bt_key_val * key_val = NULL;
+ node_pos kp = get_btree_node(btree,key);
+
+ if(kp.node) {
+ key_val = &kp.node->key_vals[kp.index];
+ ret = tag_get(kp.node,tag,kp.index);
+ return ret? 1 : -1;
+ }
+ BUG_ON(key_val == NULL);
+ return ret;
+}
+EXPORT_SYMBOL(btree_tag_get);
+
+
+/**
+* Get the max key in the btree
+* @param btree The btree
+* @return The max key
+*/
+unsigned long btree_get_max_key(btree * btree) {
+ node_pos node_pos;
+ node_pos = get_max_key_pos(btree,btree->root);
+ return node_pos.node->key_vals[node_pos.index].key;
+}
+
+/**
+* Get the min key in the btree
+* @param btree The btree
+* @return The max key
+*/
+unsigned long btree_get_min_key(btree * btree) {
+ node_pos node_pos;
+ node_pos = get_min_key_pos(btree,btree->root);
+ return node_pos.node->key_vals[node_pos.index].key;
+}
+
+/**
+* Used to print the keys of the btree_node
+* @param node The node whose keys are to be printed
+* @return none
+*/
+
+static void print_single_node(btree *btree, btree_node * node) {
+
+ int i = 0;
+
+ printk(" { ");
+ while(i < node->nr_active) {
+ printk("%d(0x%x) ", node->key_vals[i].key,
+ node->key_vals[i].val);
+ i++;
+ }
+ printk("} (0x%x,%d,%d) ", node,node->nr_active,node->level);
+}
+
+/**
+* Function used to print the B-tree
+* @param root Root of the B-Tree
+* @param print_key Function used to print the key value
+* @return none
+*/
+
+void print_subtree(btree *btree,btree_node * node) {
+
+ int i = 0;
+ unsigned int current_level;
+
+ btree_node * head, * tail;
+
+ current_level = node->level;
+ head = node;
+ tail = node;
+
+ printk("DEBUG: Printing subtree\n");
+ while(true) {
+ if(head == NULL) {
+ break;
+ }
+ if (head->level < current_level) {
+ current_level = head->level;
+ printk("\n");
+ }
+ print_single_node(btree,head);
+
+ if(head->leaf == false) {
+ for(i = 0 ; i < head->nr_active + 1; i++) {
+ tail->next = head->children[i];
+ tail = head->children[i];
+ head->children[i]->next = NULL;
+ }
+ }
+ head = head->next;
+ }
+ printk("\n");
+}
+EXPORT_SYMBOL(print_subtree);
+
diff -urN linux-2.6.16.2/lib/Makefile linux-2.6.16.2-btree/lib/Makefile
--- linux-2.6.16.2/lib/Makefile 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/lib/Makefile 2006-08-17 21:11:21.000000000 -0400
@@ -3,7 +3,7 @@
#
lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
- bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
+ bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o btree.o\
idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
sha1.o
diff -urN linux-2.6.16.2/init/main.c linux-2.6.16.2-btree/init/main.c
--- linux-2.6.16.2/init/main.c 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/init/main.c 2006-08-17 21:11:21.000000000 -0400
@@ -84,6 +84,7 @@
extern void pidmap_init(void);
extern void prio_tree_init(void);
extern void radix_tree_init(void);
+extern void btree_init(void);
extern void free_initmem(void);
extern void populate_rootfs(void);
extern void driver_init(void);
@@ -530,6 +531,7 @@
security_init();
vfs_caches_init(num_physpages);
radix_tree_init();
+ btree_init();
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
diff -urN linux-2.6.16.2/fs/buffer.c linux-2.6.16.2-btree/fs/buffer.c
--- linux-2.6.16.2/fs/buffer.c 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/fs/buffer.c 2006-08-17 21:11:21.000000000 -0400
@@ -859,9 +859,16 @@
if (page->mapping) { /* Race with truncate? */
if (mapping_cap_account_dirty(mapping))
inc_page_state(nr_dirty);
- radix_tree_tag_set(&mapping->page_tree,
+ btree_tag_set(&mapping->btree,
page_index(page),
PAGECACHE_TAG_DIRTY);
+
+ if (btree_tag_get(&mapping->btree,
+ page_index(page),PAGECACHE_TAG_DIRTY) != 1) {
+ panic("__set_page_dirty_buffers"
+ " Problem setting the tag\n");
+ }
+
}
write_unlock_irq(&mapping->tree_lock);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff -urN linux-2.6.16.2/fs/inode.c linux-2.6.16.2-btree/fs/inode.c
--- linux-2.6.16.2/fs/inode.c 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/fs/inode.c 2006-08-17 21:11:21.000000000 -0400
@@ -16,6 +16,7 @@
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <linux/hash.h>
+#include <linux/btree.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/pagemap.h>
@@ -196,6 +197,7 @@
mutex_init(&inode->i_mutex);
init_rwsem(&inode->i_alloc_sem);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ INIT_BTREE(&inode->i_data.btree, GFP_ATOMIC);
rwlock_init(&inode->i_data.tree_lock);
spin_lock_init(&inode->i_data.i_mmap_lock);
INIT_LIST_HEAD(&inode->i_data.private_list);
diff -urN linux-2.6.16.2/include/linux/btree.h linux-2.6.16.2-btree/include/linux/btree.h
--- linux-2.6.16.2/include/linux/btree.h 1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.16.2-btree/include/linux/btree.h 2006-08-17 21:11:21.000000000 -0400
@@ -0,0 +1,89 @@
+/*
+ * btree.h
+ *
+ * Copyright (C) 2006 Vishal Patil (vishpat AT gmail DOT com)
+ *
+*/
+
+#ifndef _BTREE_H_
+#define _BTREE_H_
+
+#include <linux/sched.h>
+#include <linux/preempt.h>
+#include <linux/types.h>
+
+#define BTREE_ORDER 8
+#define BTREE_TAGS 2
+#define BTREE_TAG_LONGS \
+ ((BTREE_ORDER + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#define BTREE_MAX_PATH 8
+
+
+typedef enum {false,true} bool;
+typedef struct btree_node btree_node;
+
+typedef struct {
+ unsigned long key;
+ void * val;
+} bt_key_val;
+
+
+typedef struct btree {
+ unsigned int order; // B-Tree order
+ btree_node * root; // Root of the B-Tree
+ unsigned long (*value)(unsigned long key); // Generate uint value for the key
+ unsigned long (*key)(void * data); // Get the value of the key from
+ // from the data
+ void (*print_key)(unsigned long key); // Print the key
+ gfp_t gfp_mask;
+} btree;
+
+extern int btree_insert(btree * btree, unsigned long key, void * val);
+extern bt_key_val btree_delete(btree * btree,btree_node * subtree,
+ unsigned long key);
+extern void * btree_lookup(btree * btree, unsigned long key);
+extern unsigned int btree_gang_lookup(btree * btree, void ** results,
+ unsigned long first_index,unsigned int max_items);
+extern unsigned int btree_gang_lookup_tag(btree * btree, void ** results,
+ unsigned long first_index,unsigned int max_items,int tag);
+extern bt_key_val * btree_lookup_slot(btree * btree, unsigned long key);
+extern bt_key_val * btree_tag_set(btree * btree,unsigned long key, int tag);
+extern int btree_tag_get(btree * btree,unsigned long key, int tag);
+extern bt_key_val * btree_tag_clear(btree * btree,unsigned long key, int tag);
+extern void btree_destroy(btree * btree);
+extern unsigned long btree_get_max_key(btree * btree);
+extern unsigned long btree_get_min_key(btree * btree);
+extern unsigned long btree_value_fn(unsigned long key);
+extern unsigned long btree_key_fn(void * page);
+extern void btree_print_fn(unsigned long key);
+extern int btree_preload(gfp_t gfp_mask);
+extern int btree_tagged(btree * btree, int tag);
+
+static inline void btree_preload_end(void)
+{
+ preempt_enable();
+}
+
+#define BTREE_INIT(mask) { \
+ .order = BTREE_ORDER, \
+ .root = NULL, \
+ .value = btree_value_fn, \
+ .key = btree_key_fn, \
+ .print_key = btree_print_fn, \
+ .gfp_mask = (mask) \
+}
+
+#define INIT_BTREE(btree,mask) \
+do { \
+ (btree)->order = BTREE_ORDER; \
+ (btree)->root = NULL; \
+ (btree)->value = btree_value_fn; \
+ (btree)->key = btree_key_fn; \
+ (btree)->print_key = btree_print_fn; \
+ (btree)->gfp_mask = (mask); \
+} while (0)
+
+
+extern void print_subtree(btree * btree,btree_node * node);
+
+#endif
diff -urN linux-2.6.16.2/include/linux/fs.h linux-2.6.16.2-btree/include/linux/fs.h
--- linux-2.6.16.2/include/linux/fs.h 2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/include/linux/fs.h 2006-08-17 21:11:21.000000000 -0400
@@ -214,6 +214,7 @@
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/radix-tree.h>
+#include <linux/btree.h>
#include <linux/prio_tree.h>
#include <linux/init.h>
#include <linux/sched.h>
@@ -372,6 +373,7 @@
struct address_space {
struct inode *host; /* owner: inode, block_device */
struct radix_tree_root page_tree; /* radix tree of all pages */
+ struct btree btree; /* BTree of all pages*/
rwlock_t tree_lock; /* and rwlock protecting it */
unsigned int i_mmap_writable;/* count VM_SHARED mappings */
struct prio_tree_root i_mmap; /* tree of private and shared mappings */
[-- Attachment #3: btree.txt --]
[-- Type: text/plain, Size: 1894 bytes --]
File Blk Num Avg Maximum Lat% Lat% CPU
Identifier Size Size Thr Rate (CPU%) Latency Latency >2s >10s Eff
------------------- ------ ----- --- ------ ------ --------- ----------- -------- -------- -----
Sequential Reads
2.6.16.2-btree 1004 4096 1 23.89 5.196% 0.163 298.47 0.00000 0.00000 460
2.6.16.2-btree 1004 4096 2 41.37 22.74% 0.186 137.93 0.00000 0.00000 182
2.6.16.2-btree 1004 4096 4 39.82 29.40% 0.356 336.10 0.00000 0.00000 135
2.6.16.2-btree 1004 4096 8 36.22 45.19% 0.767 943.55 0.00000 0.00000 80
Random Reads
2.6.16.2-btree 1004 4096 1 0.51 0.679% 7.646 40.24 0.00000 0.00000 75
2.6.16.2-btree 1004 4096 2 0.43 0.475% 18.071 111.75 0.00000 0.00000 91
2.6.16.2-btree 1004 4096 4 0.46 1.239% 31.134 226.30 0.00000 0.00000 37
2.6.16.2-btree 1004 4096 8 0.45 2.915% 60.470 378.33 0.00000 0.00000 15
Sequential Writes
2.6.16.2-btree 1004 4096 1 16.14 65.46% 0.239 2163.35 0.00039 0.00000 25
2.6.16.2-btree 1004 4096 2 17.24 75.47% 0.430 7099.42 0.00195 0.00000 23
2.6.16.2-btree 1004 4096 4 15.64 66.73% 0.857 9904.39 0.00856 0.00000 23
2.6.16.2-btree 1004 4096 8 13.73 80.91% 1.858 13682.20 0.03398 0.00078 17
Random Writes
2.6.16.2-btree 1004 4096 1 7.66 35.28% 0.500 361.49 0.00000 0.00000 22
2.6.16.2-btree 1004 4096 2 0.86 1.654% 0.668 125.78 0.00000 0.00000 52
2.6.16.2-btree 1004 4096 4 0.96 2.904% 0.620 113.66 0.00000 0.00000 33
2.6.16.2-btree 1004 4096 8 1.04 4.632% 0.461 175.49 0.00000 0.00000 22
[-- Attachment #4: radixtree.txt --]
[-- Type: text/plain, Size: 2083 bytes --]
File Blk Num Avg Maximum Lat% Lat% CPU
Identifier Size Size Thr Rate (CPU%) Latency Latency >2s >10s Eff
---------------------------- ------ ----- --- ------ ------ --------- ----------- -------- -------- -----
Sequential Reads
2.6.15-1.2054_FC5smp 1004 4096 1 54.31 9.370% 0.071 29.98 0.00000 0.00000 580
2.6.15-1.2054_FC5smp 1004 4096 2 39.66 11.51% 0.195 158.49 0.00000 0.00000 344
2.6.15-1.2054_FC5smp 1004 4096 4 42.39 27.79% 0.360 371.22 0.00000 0.00000 152
2.6.15-1.2054_FC5smp 1004 4096 8 36.02 37.36% 0.750 870.62 0.00000 0.00000 96
Random Reads
2.6.15-1.2054_FC5smp 1004 4096 1 0.46 0.516% 8.523 39.20 0.00000 0.00000 89
2.6.15-1.2054_FC5smp 1004 4096 2 0.41 0.500% 18.946 136.92 0.00000 0.00000 81
2.6.15-1.2054_FC5smp 1004 4096 4 0.50 1.235% 30.916 226.78 0.00000 0.00000 40
2.6.15-1.2054_FC5smp 1004 4096 8 0.46 2.755% 61.054 590.53 0.00000 0.00000 17
Sequential Writes
2.6.15-1.2054_FC5smp 1004 4096 1 47.80 24.17% 0.071 1608.05 0.00000 0.00000 198
2.6.15-1.2054_FC5smp 1004 4096 2 45.82 51.59% 0.135 2742.27 0.00078 0.00000 89
2.6.15-1.2054_FC5smp 1004 4096 4 44.34 99.81% 0.291 3190.65 0.00272 0.00000 44
2.6.15-1.2054_FC5smp 1004 4096 8 42.57 181.1% 0.455 4463.81 0.00234 0.00000 24
Random Writes
2.6.15-1.2054_FC5smp 1004 4096 1 1.46 0.561% 0.086 29.08 0.00000 0.00000 260
2.6.15-1.2054_FC5smp 1004 4096 2 1.44 1.547% 0.072 42.84 0.00000 0.00000 93
2.6.15-1.2054_FC5smp 1004 4096 4 1.47 2.707% 0.015 0.55 0.00000 0.00000 54
2.6.15-1.2054_FC5smp 1004 4096 8 1.43 7.313% 0.015 0.60 0.00000 0.00000 20
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: Page cache using B-trees benchmark results
2006-08-18 1:43 Page cache using B-trees benchmark results Vishal Patil
@ 2006-08-18 13:52 ` Bill Davidsen
2006-08-18 16:25 ` Andi Kleen
1 sibling, 0 replies; 4+ messages in thread
From: Bill Davidsen @ 2006-08-18 13:52 UTC (permalink / raw)
To: linux-kernel; +Cc: Andrea Arcangeli
Vishal Patil wrote:
> Folks
>
> I am attaching the benchmark results for Page Cache Implementation
> using B-trees. I basically ran the tio (threaded i/o) benchmark
> against my kernel (with the B-tree implementation) and the Linux
> kernel shipped with FC5. Radix tree implementation is definately
> better however the B-tree implementation did not suck that bad :)
>
> Also I attaching a new patch which was used for measuring the
> benchmarks. Also henceforth changes to the page will be tracked using
> the projected hosted at http://code.google.com/p/btreepc
>
Thanks for this. I guess a purist would say that you need to run against
the base kernel and base kernel plus your patches, but these numbers are
certainly enough to support your conclusion.
What's next?
--
Bill Davidsen <davidsen@tmr.com>
Obscure bug of 2004: BASH BUFFER OVERFLOW - if bash is being run by a
normal user and is setuid root, with the "vi" line edit mode selected,
and the character set is "big5," an off-by-one errors occurs during
wildcard (glob) expansion.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: Page cache using B-trees benchmark results
2006-08-18 1:43 Page cache using B-trees benchmark results Vishal Patil
2006-08-18 13:52 ` Bill Davidsen
@ 2006-08-18 16:25 ` Andi Kleen
2006-08-18 16:37 ` Vishal Patil
1 sibling, 1 reply; 4+ messages in thread
From: Andi Kleen @ 2006-08-18 16:25 UTC (permalink / raw)
To: Vishal Patil; +Cc: Andrea Arcangeli, linux-kernel
"Vishal Patil" <vishpat@gmail.com> writes:
> I am attaching the benchmark results for Page Cache Implementation
> using B-trees. I basically ran the tio (threaded i/o) benchmark
> against my kernel (with the B-tree implementation) and the Linux
I suppose you'll need some more varied benchmarks to get
more solid data.
> kernel shipped with FC5. Radix tree implementation is definately
> better however the B-tree implementation did not suck that bad :)
Have you considered trying it again instead of radix tree with
another data structure? There are still plenty of other big
hash tables in the kernel that might benefit from trying
a different approach:
> dmesg | grep -i hash
PID hash table entries: 4096 (order: 12, 131072 bytes)
Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes)
Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes)
Mount-cache hash table entries: 256
Dquot-cache hash table entries: 512 (order 0, 4096 bytes)
IP route cache hash table entries: 65536 (order: 7, 524288 bytes)
TCP established hash table entries: 262144 (order: 9, 2097152 bytes)
TCP bind hash table entries: 65536 (order: 7, 524288 bytes)
TCP: Hash tables configured (established 262144 bind 65536)
e.g. the dentry/inode hashes are an obvious attack point.
Of course you'll need benchmarks that actually stress them.
-Andi
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: Page cache using B-trees benchmark results
2006-08-18 16:25 ` Andi Kleen
@ 2006-08-18 16:37 ` Vishal Patil
0 siblings, 0 replies; 4+ messages in thread
From: Vishal Patil @ 2006-08-18 16:37 UTC (permalink / raw)
To: Andi Kleen; +Cc: Andrea Arcangeli, linux-kernel
Hey Andi....This is useful information....I will look into it and let
you know....
Many thanks.
- Vishal
On 18 Aug 2006 18:25:54 +0200, Andi Kleen <ak@suse.de> wrote:
> "Vishal Patil" <vishpat@gmail.com> writes:
>
> > I am attaching the benchmark results for Page Cache Implementation
> > using B-trees. I basically ran the tio (threaded i/o) benchmark
> > against my kernel (with the B-tree implementation) and the Linux
>
> I suppose you'll need some more varied benchmarks to get
> more solid data.
>
> > kernel shipped with FC5. Radix tree implementation is definately
> > better however the B-tree implementation did not suck that bad :)
>
> Have you considered trying it again instead of radix tree with
> another data structure? There are still plenty of other big
> hash tables in the kernel that might benefit from trying
> a different approach:
>
> > dmesg | grep -i hash
> PID hash table entries: 4096 (order: 12, 131072 bytes)
> Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes)
> Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes)
> Mount-cache hash table entries: 256
> Dquot-cache hash table entries: 512 (order 0, 4096 bytes)
> IP route cache hash table entries: 65536 (order: 7, 524288 bytes)
> TCP established hash table entries: 262144 (order: 9, 2097152 bytes)
> TCP bind hash table entries: 65536 (order: 7, 524288 bytes)
> TCP: Hash tables configured (established 262144 bind 65536)
>
> e.g. the dentry/inode hashes are an obvious attack point.
>
> Of course you'll need benchmarks that actually stress them.
>
> -Andi
>
--
Motivation will almost always beat mere talent.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2006-08-18 16:37 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-08-18 1:43 Page cache using B-trees benchmark results Vishal Patil
2006-08-18 13:52 ` Bill Davidsen
2006-08-18 16:25 ` Andi Kleen
2006-08-18 16:37 ` Vishal Patil
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox