public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* Page cache using B-trees benchmark results
@ 2006-08-18  1:43 Vishal Patil
  2006-08-18 13:52 ` Bill Davidsen
  2006-08-18 16:25 ` Andi Kleen
  0 siblings, 2 replies; 4+ messages in thread
From: Vishal Patil @ 2006-08-18  1:43 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: Andrea Arcangeli

[-- Attachment #1: Type: text/plain, Size: 595 bytes --]

Folks

I am attaching the benchmark results for Page Cache Implementation
using B-trees. I basically ran the tio (threaded i/o) benchmark
against my kernel (with the B-tree implementation) and the Linux
kernel shipped with FC5. Radix tree implementation is definately
better however the B-tree implementation did not suck that bad :)

Also I attaching a new patch which was used for measuring the
benchmarks. Also henceforth changes to the page will be tracked using
the projected hosted at http://code.google.com/p/btreepc

Thanks

- Vishal

-- 
Motivation will almost always beat mere talent.

[-- Attachment #2: btree.linux2.6.16.2.patch --]
[-- Type: text/x-patch, Size: 55283 bytes --]

diff -urN linux-2.6.16.2/mm/filemap.c linux-2.6.16.2-btree/mm/filemap.c
--- linux-2.6.16.2/mm/filemap.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/filemap.c	2006-08-17 21:11:21.000000000 -0400
@@ -112,10 +112,16 @@
  */
 void __remove_from_page_cache(struct page *page)
 {
+	struct page * bpage;
 	struct address_space *mapping = page->mapping;
+	unsigned long index = page->index;	
 
-	radix_tree_delete(&mapping->page_tree, page->index);
-	page->mapping = NULL;
+	bpage = btree_delete(&mapping->btree,mapping->btree.root,index).val;
+	if ( bpage != page) {
+		panic("DEBUG: Deleting wrong page in remove from page" 
+			"cache %d,0x%x\n",index,bpage);
+	} 
+     	page->mapping = NULL;
 	mapping->nrpages--;
 	pagecache_acct(-1);
 }
@@ -395,12 +401,14 @@
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
-
-	if (error == 0) {
+        int btree_error = btree_preload(gfp_mask & ~__GFP_HIGHMEM);
+        
+	if (btree_error == 0) {
 		write_lock_irq(&mapping->tree_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
+                btree_error = 
+			btree_insert(&mapping->btree,offset,page);
+                
+		if (!btree_error) {
 			page_cache_get(page);
 			SetPageLocked(page);
 			page->mapping = mapping;
@@ -409,9 +417,11 @@
 			pagecache_acct(1);
 		}
 		write_unlock_irq(&mapping->tree_lock);
-		radix_tree_preload_end();
+                if(btree_error == 0) {
+                        btree_preload_end();
+                }                        
 	}
-	return error;
+	return btree_error;
 }
 
 EXPORT_SYMBOL(add_to_page_cache);
@@ -522,7 +532,7 @@
 	struct page *page;
 
 	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = btree_lookup(&mapping->btree,offset);
 	if (page)
 		page_cache_get(page);
 	read_unlock_irq(&mapping->tree_lock);
@@ -539,7 +549,7 @@
 	struct page *page;
 
 	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = btree_lookup(&mapping->btree,offset);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
 	read_unlock_irq(&mapping->tree_lock);
@@ -563,10 +573,9 @@
 				unsigned long offset)
 {
 	struct page *page;
-
 	read_lock_irq(&mapping->tree_lock);
 repeat:
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = btree_lookup(&mapping->btree,offset);
 	if (page) {
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
@@ -658,8 +667,10 @@
 	unsigned int ret;
 
 	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, start, nr_pages);
+
+	ret = btree_gang_lookup(&mapping->btree,(void **)pages,
+				start,nr_pages);
+
 	for (i = 0; i < ret; i++)
 		page_cache_get(pages[i]);
 	read_unlock_irq(&mapping->tree_lock);
@@ -677,8 +688,9 @@
 	unsigned int ret;
 
 	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
+	ret = btree_gang_lookup_tag(&mapping->btree,
 				(void **)pages, *index, nr_pages, tag);
+		
 	for (i = 0; i < ret; i++)
 		page_cache_get(pages[i]);
 	if (ret)
diff -urN linux-2.6.16.2/mm/page-writeback.c linux-2.6.16.2-btree/mm/page-writeback.c
--- linux-2.6.16.2/mm/page-writeback.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/page-writeback.c	2006-08-17 21:11:21.000000000 -0400
@@ -634,8 +634,16 @@
 				BUG_ON(mapping2 != mapping);
 				if (mapping_cap_account_dirty(mapping))
 					inc_page_state(nr_dirty);
-				radix_tree_tag_set(&mapping->page_tree,
-					page_index(page), PAGECACHE_TAG_DIRTY);
+				btree_tag_set(&mapping->btree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+
+				if (btree_tag_get(&mapping->btree,
+				page_index(page), PAGECACHE_TAG_DIRTY) != 1 ) {
+					panic("__set_page_dirty_nobuffers "
+					" Problem setting the tag\n");
+				}					
+
+				
 			}
 			write_unlock_irq(&mapping->tree_lock);
 			if (mapping->host) {
@@ -714,9 +722,16 @@
 	if (mapping) {
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		if (TestClearPageDirty(page)) {
-			radix_tree_tag_clear(&mapping->page_tree,
+			btree_tag_clear(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+
+			if (btree_tag_get(&mapping->btree,
+					page_index(page),
+					PAGECACHE_TAG_DIRTY)  == 1) {
+				panic("Problem clearing the tag\n");
+			}				
+
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
 			if (mapping_cap_account_dirty(mapping))
 				dec_page_state(nr_dirty);
@@ -769,10 +784,17 @@
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
-		if (ret)
-			radix_tree_tag_clear(&mapping->page_tree,
+		if (ret) {
+			btree_tag_clear(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (btree_tag_get(&mapping->btree,
+				page_index(page),
+				PAGECACHE_TAG_WRITEBACK) == 1) {
+				panic("Problem clearing the tag\n");
+			}				
+
+		}			
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
@@ -790,14 +812,29 @@
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
-		if (!ret)
-			radix_tree_tag_set(&mapping->page_tree,
+		if (!ret) {
+			btree_tag_set(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-		if (!PageDirty(page))
-			radix_tree_tag_clear(&mapping->page_tree,
+			if (btree_tag_get(&mapping->btree,
+				page_index(page),
+				PAGECACHE_TAG_WRITEBACK) != 1) {
+				panic("test_set_page_writeback"
+					"Problem setting the tag\n");
+			}				
+
+		}			
+		if (!PageDirty(page)) {
+			btree_tag_clear(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+			if (btree_tag_get(&mapping->btree,
+				page_index(page),
+				PAGECACHE_TAG_DIRTY) == 1) {
+				panic("Problem clearing the tag");
+			}				
+
+		}			
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
@@ -817,7 +854,7 @@
 	int ret;
 
 	read_lock_irqsave(&mapping->tree_lock, flags);
-	ret = radix_tree_tagged(&mapping->page_tree, tag);
+	ret = btree_tagged(&mapping->btree,tag);
 	read_unlock_irqrestore(&mapping->tree_lock, flags);
 	return ret;
 }
diff -urN linux-2.6.16.2/mm/readahead.c linux-2.6.16.2-btree/mm/readahead.c
--- linux-2.6.16.2/mm/readahead.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/readahead.c	2006-08-17 21:11:21.000000000 -0400
@@ -282,7 +282,8 @@
 		if (page_offset > end_index)
 			break;
 
-		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		page = btree_lookup(&mapping->btree,page_offset);
+	
 		if (page)
 			continue;
 
diff -urN linux-2.6.16.2/mm/swap_state.c linux-2.6.16.2-btree/mm/swap_state.c
--- linux-2.6.16.2/mm/swap_state.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/swap_state.c	2006-08-17 21:11:21.000000000 -0400
@@ -37,6 +37,7 @@
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+        .btree          = BTREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 
 	.tree_lock	= RW_LOCK_UNLOCKED,
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
@@ -71,16 +72,21 @@
 static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 			       gfp_t gfp_mask)
 {
-	int error;
+	int btree_error;
 
 	BUG_ON(PageSwapCache(page));
 	BUG_ON(PagePrivate(page));
-	error = radix_tree_preload(gfp_mask);
-	if (!error) {
+
+        btree_error = btree_preload(gfp_mask);
+	
+	if (!btree_error) {
 		write_lock_irq(&swapper_space.tree_lock);
-		error = radix_tree_insert(&swapper_space.page_tree,
+                if (!btree_error) {
+                       btree_error = btree_insert(&swapper_space.btree,
 						entry.val, page);
-		if (!error) {
+                }
+                                               
+		if (!btree_error) {
 			page_cache_get(page);
 			SetPageLocked(page);
 			SetPageSwapCache(page);
@@ -89,9 +95,12 @@
 			pagecache_acct(1);
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
-		radix_tree_preload_end();
+
+                if (!btree_error) {
+                        btree_preload_end();
+                }
 	}
-	return error;
+	return btree_error;
 }
 
 static int add_to_swap_cache(struct page *page, swp_entry_t entry)
@@ -127,7 +136,10 @@
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	radix_tree_delete(&swapper_space.page_tree, page_private(page));
+        if (btree_delete(&swapper_space.btree,swapper_space.btree.root,
+                                page_private(page)).val != page) {
+		panic("DEBUG: Deleting wrong page from swap cache\n");			
+	}
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	total_swapcache_pages--;
diff -urN linux-2.6.16.2/mm/vmscan.c linux-2.6.16.2-btree/mm/vmscan.c
--- linux-2.6.16.2/mm/vmscan.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/vmscan.c	2006-08-17 21:11:21.000000000 -0400
@@ -692,7 +692,7 @@
 				struct page *page, int nr_refs)
 {
 	struct address_space *mapping = page_mapping(page);
-	struct page **radix_pointer;
+	struct page * btree_pointer;
 
 	/*
 	 * Avoid doing any of the following work if the page count
@@ -733,12 +733,12 @@
 
 	write_lock_irq(&mapping->tree_lock);
 
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
+	btree_pointer = (struct page *)btree_lookup_slot(
+						&mapping->btree,
 						page_index(page));
 
 	if (!page_mapping(page) || page_count(page) != nr_refs ||
-			*radix_pointer != page) {
+			btree_pointer != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -759,7 +759,7 @@
 		set_page_private(newpage, page_private(page));
 	}
 
-	*radix_pointer = newpage;
+	btree_pointer = newpage;
 	__put_page(page);
 	write_unlock_irq(&mapping->tree_lock);
 
diff -urN linux-2.6.16.2/lib/btree.c linux-2.6.16.2-btree/lib/btree.c
--- linux-2.6.16.2/lib/btree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.16.2-btree/lib/btree.c	2006-08-17 21:11:36.000000000 -0400
@@ -0,0 +1,1291 @@
+/*
+ *  btree.c
+ *
+ *  Copyright (C) 2006 Vishal Patil (vishpat AT gmail DOT com) 
+ *
+*/
+
+#include <linux/btree.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/radix-tree.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>  
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <linux/mm.h>
+
+typedef enum {left = -1,right = 1} position_t;
+
+typedef struct {
+	btree_node * node;
+	unsigned int index;
+}node_pos;
+
+struct btree_node {
+	struct btree_node * next;		        // Pointer used for linked list 
+	bool leaf;			        // Used to indicate whether leaf or not
+        unsigned int nr_active;		        // Number of active keys
+	unsigned int level;		        // Level in the B-Tree
+        bt_key_val key_vals[2*BTREE_ORDER - 1];	// Array of keys and values
+        struct btree_node * children[2*BTREE_ORDER];	// Array of pointers to child nodes
+        unsigned long tags[BTREE_TAGS][BTREE_TAG_LONGS];
+                                                // Bitmap required by page
+                                                // cache
+	unsigned long padding[6];
+} ;
+
+/*
+*       B-Tree node and key val cache
+*/
+static kmem_cache_t *btree_node_cachep;
+
+/*
+ * Per-cpu pool of preloaded nodes
+ */
+struct btree_preload {
+        int nr;
+        struct btree_node *nodes[BTREE_MAX_PATH];
+};
+DEFINE_PER_CPU(struct btree_preload, btree_preloads) = { 0, };
+
+static btree_node * allocate_btree_node (btree * btree,unsigned int order);
+static node_pos get_btree_node(btree * btree,unsigned long key);
+static bt_key_val remove_key_from_leaf(btree * btree, node_pos * node_pos);
+static btree_node * merge_nodes(btree * btree, btree_node * n1, bt_key_val kv ,
+        btree_node * n2);
+static void move_key(btree * btree, btree_node * node, unsigned int index, 
+                                        position_t pos);
+static node_pos get_max_key_pos(btree * btree, btree_node * subtree);
+static node_pos get_min_key_pos(btree * btree, btree_node * subtree);
+static btree_node * merge_siblings(btree * btree, btree_node * parent,
+                      unsigned int index,position_t pos);
+static unsigned int __lookup(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items, int tag);
+static void print_single_node(btree *btree, btree_node * node); 
+static void transfer_tags(btree_node * source_node, int source_index,
+                btree_node * dest_node,int dest_index);
+static void clear_tags(btree_node * node, int index);
+
+static void
+btree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) {
+        memset(node, 0, sizeof(struct btree_node));
+}
+
+void __init btree_init(void) {
+        btree_node_cachep = kmem_cache_create("btree_node",
+                sizeof(btree_node), 0,
+                SLAB_PANIC, btree_node_ctor, NULL);
+}
+
+unsigned long btree_key_fn (void * page) {
+        return (((struct page*)page)->index);
+}
+
+unsigned long btree_value_fn (unsigned long key) {
+        return *((unsigned long*)key);
+}
+
+void btree_print_fn (unsigned long key) {
+        printk("%lu ", *((unsigned long*)key));
+}
+
+static inline void tag_set(struct btree_node *node, int tag, int offset)
+{
+        __set_bit(offset, node->tags[tag]);
+}
+
+static inline void tag_clear(struct btree_node *node, int tag, int offset)
+{
+        __clear_bit(offset, node->tags[tag]);
+}
+
+static inline int tag_get(struct btree_node *node, int tag, int offset)
+{
+        return test_bit(offset, node->tags[tag]);
+}
+
+/*
+ * A modified form of binary search that searches for index position of the key
+ * In case the key is not found the index of the key lesser than the search key
+ * is returned
+ */
+static inline int search_key(btree_node * node, long key) {
+      int left_index = 0;
+      int right_index = node->nr_active - 1;
+      int mid_index = -1;
+
+      if (key > node->key_vals[right_index].key) {
+               return right_index + 1;
+      }
+
+      while (left_index <= right_index) {
+              mid_index = left_index + (right_index - left_index)/2;
+
+              if (key < node->key_vals[mid_index].key) {
+                      right_index = mid_index - 1;
+              } else
+              if (key > node->key_vals[mid_index].key) {
+                      left_index = mid_index + 1;
+              } else {
+                      return mid_index;
+              }
+      }
+      return right_index + 1;
+}
+
+
+
+/*
+ * Load up this CPU's btree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail.  On
+ * success, return zero, with preemption disabled.  On error, return -ENOMEM
+ * with preemption not disabled.
+ */
+int btree_preload(gfp_t gfp_mask) {
+        struct btree_preload *btp;
+        struct btree_node *node;
+        int ret = -ENOMEM;
+
+        preempt_disable();
+        btp = &__get_cpu_var(btree_preloads);
+        while (btp->nr < ARRAY_SIZE(btp->nodes)) {
+                preempt_enable();
+                node = kmem_cache_alloc(btree_node_cachep, gfp_mask);
+                if (node == NULL)
+                        goto out;
+                preempt_disable();
+                btp = &__get_cpu_var(btree_preloads);
+                if (btp->nr < ARRAY_SIZE(btp->nodes))
+                        btp->nodes[btp->nr++] = node;
+                else
+                        kmem_cache_free(btree_node_cachep, node);
+        }
+        ret = 0;
+out:
+        return ret;
+}
+
+/**
+*       Function used to allocate memory for the btree node
+*       @param btree The btree
+*       @param order Order of the B-Tree
+*       @return The allocated B-tree node
+*/
+static btree_node * allocate_btree_node (btree * btree,unsigned int order) {
+        btree_node * node;
+
+        // Allocate memory for the node
+        node = kmem_cache_alloc(btree_node_cachep,btree->gfp_mask);
+
+        // If this is NULL get preloaded node
+        if (node  == NULL && !(btree->gfp_mask & __GFP_WAIT)) {
+                struct btree_preload *btp;
+
+                btp = &__get_cpu_var(btree_preloads);
+                if (btp->nr) {
+                        node = btp->nodes[btp->nr - 1];
+                        btp->nodes[btp->nr - 1] = NULL;
+                        btp->nr--;
+                }
+        }
+       
+        // Initialize the number of active nodes
+        node->nr_active = 0;
+
+	// Use to determine whether it is a leaf
+	node->leaf = true;
+
+	// Use to determine the level in the tree
+	node->level = 0;
+
+	//Initialize the linked list pointer to NULL
+	node->next = NULL;
+
+        return node;
+}
+
+/**
+*       Function used to free the memory allocated to the b-tree 
+*       @param node The node to be freed
+*       @param order Order of the B-Tree
+*       @return The allocated B-tree node
+*/
+static int free_btree_node (btree_node * node) {
+        kmem_cache_free(btree_node_cachep,node);
+        return 0;
+}
+
+/**
+*	Used to split the child node and adjust the parent so that
+*	it has two children
+*	@param parent Parent Node
+*	@param index Index of the child node
+*	@param child  Full child node
+*	
+*/
+static void btree_split_child(btree * btree, btree_node * parent, 
+				unsigned int index,
+				btree_node * child) {
+	int i = 0;	
+	unsigned int order = btree->order;
+
+	btree_node * new_child = allocate_btree_node(btree,btree->order); 
+	new_child->leaf = child->leaf;
+	new_child->level = child->level;
+	new_child->nr_active = btree->order - 1;
+
+	// Copy the higher order keys to the new child	
+	for(i=0;i<order - 1;i++) {
+		new_child->key_vals[i] = child->key_vals[i + order];
+                transfer_tags(child,i + order,new_child,i); 
+                
+                if(!child->leaf) {
+			new_child->children[i] = 
+				child->children[i + order];
+		} 
+	}
+	
+	// Copy the last child pointer
+	if(!child->leaf) {
+		new_child->children[i] = 
+		child->children[i + order];
+	}
+
+	child->nr_active = order - 1;
+	
+	for(i = parent->nr_active + 1;i > index + 1;i--) {
+		parent->children[i] = parent->children[i - 1];
+	}
+	parent->children[index + 1] = new_child;
+
+	for(i = parent->nr_active;i > index;i--) {
+		parent->key_vals[i] = parent->key_vals[i - 1];
+                transfer_tags(parent,i - 1,parent,i);                
+	}
+
+	parent->key_vals[index] = child->key_vals[order - 1];	
+        transfer_tags(child,order - 1,parent,index);
+	parent->nr_active++;
+
+}
+
+/**
+*	Used to insert a key in the non-full node
+*	@param btree The btree
+*	@param node The node to which the key will be added
+*	@param the key value pair
+*	@return void
+*/
+
+static int btree_insert_nonfull (btree * btree, btree_node * parent_node,
+				bt_key_val * key_val) {
+	
+	unsigned long key = key_val->key;
+	int i,j,index;
+	btree_node * child;	
+	btree_node * node = parent_node;
+	
+insert:	i = node->nr_active - 1;
+	if(node->leaf) {
+		while(i >= 0 && key < node->key_vals[i].key) {
+			node->key_vals[i + 1] = node->key_vals[i];
+                        transfer_tags(node,i,node,i + 1);
+			i--;
+		}
+
+		node->key_vals[i + 1].key = key;
+                node->key_vals[i + 1].val = key_val->val;
+
+		// Identical key found
+          	if (i >= 0 && 
+			key == node->key_vals[i].key) {
+                        for (j = i + 1; j <node->nr_active; j++ ) {
+                                node->key_vals[j] =
+                                        node->key_vals[j + 1];
+				transfer_tags(node,j,node,j + 1);
+                        }
+			return -EEXIST;
+                } else
+		node->nr_active++; 
+	} else {
+		while (i >= 0 && key < node->key_vals[i].key) {
+			i--;
+		}
+
+                // Identical key found  
+                if (i >= 0 && key == node->key_vals[i].key) {
+                        return -EEXIST;
+                }
+
+		i++;
+		child = node->children[i]; 
+		
+		if(child->nr_active == 2*btree->order - 1) {
+
+ 			// Check for identical key here
+			index = search_key(child, key);
+			if (key == child->key_vals[index].key) {
+				return -EEXIST;
+			}
+
+			btree_split_child(btree,node,i,child);
+			if(key > 
+				node->key_vals[i].key) {
+				i++;	
+			}	
+		}
+		node = node->children[i];
+		goto insert;	
+	}
+       return 0;	
+}
+
+
+/**
+*       Function used to insert node into a B-Tree
+*       @param root Root of the B-Tree
+*       @param node The node to be inserted
+*       @param compare Function used to compare the two nodes of the tree
+*       @return success or failure
+*/
+int btree_insert(btree * btree, unsigned long key, void * val) {
+	btree_node * rnode = NULL;
+        bt_key_val key_val;
+        int ret = -1;	
+       	
+        key_val.key = key;
+        key_val.val = val;
+
+        // During the B-tree initialization
+        if(unlikely(btree->root == NULL)) {
+                btree->root = allocate_btree_node(btree,btree->order);
+        }
+        rnode = btree->root;
+
+	if(rnode->nr_active == (2*btree->order - 1)) {
+		btree_node * new_root;
+		new_root = allocate_btree_node(btree,btree->order);
+		new_root->level = btree->root->level + 1;
+		btree->root = new_root;	
+		new_root->leaf = false;
+		new_root->nr_active = 0;
+		new_root->children[0]  = rnode;
+		btree_split_child(btree,new_root,0,rnode);
+		ret = btree_insert_nonfull(btree,new_root,&key_val);	
+	} else
+		ret = btree_insert_nonfull(btree,rnode,&key_val);		
+        return ret;
+}
+EXPORT_SYMBOL(btree_insert);        
+
+/**
+*	Used to get the position of the MAX key within the subtree
+*	@param btree The btree
+*	@param subtree The subtree to be searched
+*	@return The node containing the key and position of the key
+*/
+static node_pos get_max_key_pos(btree * btree, btree_node * subtree) {
+	node_pos node_pos;
+	btree_node * node = subtree; 
+	
+	node_pos.node = NULL;
+	node_pos.index = 0;
+
+	while(true) {
+		if(node == NULL) {
+			break;
+		}
+
+		if(node->leaf) {
+			node_pos.node 	= node;
+			node_pos.index 	= node->nr_active - 1;
+			return node_pos;	
+		} else {
+			node_pos.node 	= node;
+			node_pos.index 	= node->nr_active - 1;
+			node = node->children[node->nr_active];	
+		}
+	}
+	return node_pos;
+}
+
+/**
+*	Used to get the position of the MAX key within the subtree
+*	@param btree The btree
+*	@param subtree The subtree to be searched
+*	@return The node containing the key and position of the key
+*/
+static node_pos get_min_key_pos(btree * btree, btree_node * subtree) {
+	node_pos node_pos;
+	btree_node * node = subtree; 
+
+	node_pos.node = NULL;
+	node_pos.index = 0;
+
+	while(true) {
+		if(node == NULL) {
+			break;
+		}
+
+		if(node->leaf) {
+			node_pos.node 	= node;
+			node_pos.index 	= 0;
+			return node_pos;	
+		} else {
+			node_pos.node 	= node;
+			node_pos.index 	= 0;
+			node = node->children[0];	
+		}
+	}
+	return node_pos;
+}
+
+/**
+*	Merge nodes n1 and n2 (case 3b from Cormen)
+*	@param btree The btree 
+*	@param node The parent node 
+*	@param index of the child
+*	@param pos left or right
+*	@return none 
+*/
+static btree_node * merge_siblings(btree * btree, btree_node * parent, unsigned int index , 
+					position_t pos) {
+	unsigned int j;
+	btree_node * new_node;
+	btree_node * n1, * n2;
+        
+        if (index == (parent->nr_active)) {   
+               index--;
+	       n1 = parent->children[parent->nr_active - 1];
+	       n2 = parent->children[parent->nr_active];
+        } else {
+               n1 = parent->children[index];
+	       n2 = parent->children[index + 1];
+        }
+
+	//Merge the current node with the left node
+	new_node = allocate_btree_node(btree,btree->order);
+	new_node->level = n1->level;
+	new_node->leaf = n1->leaf;
+
+	for(j=0;j<btree->order - 1; j++) {
+		new_node->key_vals[j] =	n1->key_vals[j];
+                transfer_tags(n1,j,new_node,j);
+		new_node->children[j] =	n1->children[j];
+	}
+	
+	new_node->key_vals[btree->order - 1] =	parent->key_vals[index];
+        transfer_tags(parent,index,new_node,btree->order - 1);
+	new_node->children[btree->order - 1] =	n1->children[btree->order - 1];
+
+	for(j=0;j<btree->order - 1; j++) {
+		new_node->key_vals[j + btree->order] = 	n2->key_vals[j];
+                transfer_tags(n2,j,new_node,j + btree->order);
+		new_node->children[j + btree->order] = 	n2->children[j];
+	}
+	new_node->children[2*btree->order - 1] = n2->children[btree->order - 1];
+
+	parent->children[index] = new_node;
+
+	for(j = index;j<parent->nr_active;j++) {
+		parent->key_vals[j] = parent->key_vals[j + 1];
+                transfer_tags(parent,j + 1,parent,j);
+		parent->children[j + 1] = parent->children[j + 2];
+	}
+
+	new_node->nr_active = n1->nr_active + n2->nr_active + 1;
+	parent->nr_active--;
+
+	free_btree_node(n1);
+	free_btree_node(n2);
+
+	if (parent->nr_active == 0 && btree->root == parent) {
+		free_btree_node(parent);
+		btree->root = new_node;
+		if(new_node->level)
+			new_node->leaf = false;
+		else
+			new_node->leaf = true; 
+	}
+
+	return new_node;
+}
+
+/**
+*	Move the key from node to another	
+*	@param btree The B-Tree
+*	@param node The parent node
+*	@param index of the key to be moved done 
+*	@param pos the position of the child to receive the key 
+*	@return none
+*/
+static void move_key(btree * btree, btree_node * node, unsigned int index, position_t pos) {
+	btree_node * lchild;
+	btree_node * rchild;
+	unsigned int i;
+
+	if(pos == right) {
+		index--;
+	}
+	lchild = node->children[index];
+	rchild = node->children[index + 1];
+	
+	// Move the key from the parent to the left child
+	if(pos == left) {
+		lchild->key_vals[lchild->nr_active] = node->key_vals[index];
+                transfer_tags(node,index,lchild,lchild->nr_active);
+
+		lchild->children[lchild->nr_active + 1] = rchild->children[0];
+		rchild->children[0] = NULL;
+		lchild->nr_active++;
+
+		node->key_vals[index] = rchild->key_vals[0];
+                transfer_tags(rchild,0,node,index);
+                
+		for(i=0;i<rchild->nr_active - 1;i++) {
+			rchild->key_vals[i] = rchild->key_vals[i + 1];
+                        transfer_tags(rchild,i + 1,rchild,i);
+			rchild->children[i] = rchild->children[i + 1];
+		}
+		rchild->children[rchild->nr_active - 1] = 
+				rchild->children[rchild->nr_active];
+		rchild->nr_active--;
+	} else {
+		// Move the key from the parent to the right child
+		for(i=rchild->nr_active;i > 0 ; i--) {
+			rchild->key_vals[i] = rchild->key_vals[i - 1];
+                        transfer_tags(rchild,i - 1,rchild,i);
+			rchild->children[i + 1] = rchild->children[i];		
+		}
+		rchild->children[1] = rchild->children[0];		
+		rchild->children[0] = NULL;
+
+		rchild->key_vals[0] = node->key_vals[index];
+                transfer_tags(node,index,rchild,0);
+
+		rchild->children[0] = lchild->children[lchild->nr_active];	
+		lchild->children[lchild->nr_active] = NULL;
+
+		node->key_vals[index] = lchild->key_vals[lchild->nr_active - 1];
+		transfer_tags(lchild,lchild->nr_active - 1,node,index);
+                
+		lchild->nr_active--;
+		rchild->nr_active++;		
+	}				
+}
+
+/**
+*	Merge nodes n1 and n2
+*	@param n1 First node
+*	@param n2 Second node
+*	@return combined node
+*/
+static btree_node * merge_nodes(btree * btree, btree_node * n1, bt_key_val kv,
+                                                btree_node * n2) {
+	btree_node * new_node;
+	unsigned int i;	
+
+	new_node = allocate_btree_node(btree,btree->order);
+	new_node->leaf = true;
+	
+	for(i=0;i<n1->nr_active;i++) {
+		new_node->key_vals[i]   = n1->key_vals[i];
+                transfer_tags(n1,i,new_node,i);
+                new_node->children[i]   = n1->children[i];
+	}
+        new_node->children[n1->nr_active] = n1->children[n1->nr_active];
+        new_node->key_vals[n1->nr_active] = kv;
+
+	for(i=0;i<n2->nr_active;i++) {
+		new_node->key_vals[i + n1->nr_active + 1] = n2->key_vals[i];
+                transfer_tags(n2,i,new_node,i + n1->nr_active + 1);
+                new_node->children[i + n1->nr_active + 1] = n2->children[i];
+	}
+        new_node->children[2*btree->order - 1] = n2->children[n2->nr_active];
+	
+        new_node->nr_active = n1->nr_active + n2->nr_active + 1;
+        new_node->leaf = n1->leaf;
+        new_node->level = n1->level;
+
+	free_btree_node(n1);
+	free_btree_node(n2);
+
+	return new_node;
+}
+
+/**
+*	Used to remove a key from the B-tree node
+*	@param btree The btree
+*	@param node The node from which the key is to be removed 	
+*	@param key The key to be removed
+*	@return 0 on success -1 on error 
+*/
+
+bt_key_val remove_key_from_leaf(btree * btree, node_pos * node_pos) {
+	unsigned int keys_max = 2*btree->order - 1;
+	unsigned int i;
+	bt_key_val key_val;
+	btree_node * node = node_pos->node;
+
+        key_val.key = -1;
+        key_val.val = NULL;
+
+	if(node->leaf == false) {
+		return key_val;
+	}	
+	
+	key_val = node->key_vals[node_pos->index];
+
+	for(i=node_pos->index;i< keys_max - 1;i++) {
+		node->key_vals[i] = node->key_vals[i + 1];
+                transfer_tags(node,i + 1,node,i);
+	}
+	
+	node->nr_active--;
+
+	if(node->nr_active == 0 ) {
+                if (btree->root == node) {
+                        btree->root = NULL;
+                }
+		free_btree_node(node);
+	}
+	return key_val;
+}
+
+/**
+*       Function used to remove a node from a  B-Tree
+*       @param btree The B-Tree
+*       @param key Key of the node to be removed
+*       @param value function to map the key to an unique integer value        
+*       @param compare Function used to compare the two nodes of the tree
+*       @return The removed key value pair 
+*/
+
+bt_key_val btree_delete(btree * btree,btree_node * subtree,unsigned long key) {
+	unsigned int i,index;
+	btree_node * node = NULL, * rsibling, *lsibling;
+	btree_node * comb_node, * parent;
+	node_pos sub_node_pos;
+	node_pos node_pos;
+	bt_key_val key_val, removed_kv,to_remove_kv;
+	unsigned long kv = key;	
+
+	node = subtree;
+	parent = NULL;
+	removed_kv.key = -1;
+	removed_kv.val = NULL;	
+
+	// Empty subtree
+	if (node == NULL) {
+		return removed_kv;
+	}
+
+del_loop:for (i = 0;;i = 0) {
+             
+            //If there are no keys simply return
+            if(!node->nr_active)
+                return removed_kv;
+
+	    // Fix the index of the key greater than or equal
+	    // to the key that we would like to search
+	    i = search_key(node,kv);
+	    index = i;
+
+	    // If we find such key break		    
+	    if(i < node->nr_active && 
+			kv == node->key_vals[i].key) {
+			break;
+	    }
+            if(node->leaf)
+                return removed_kv;
+                
+    	    //Store the parent node
+	    parent = node;
+
+	    // To get a child node 
+	    node = node->children[i];
+
+            //If NULL not found
+            if (node == NULL)
+                return removed_kv;
+
+            if (index == (parent->nr_active)) {   
+                lsibling =  parent->children[parent->nr_active - 1];
+	    	rsibling = NULL; 
+            } else if (index == 0) {
+                lsibling = NULL;
+                rsibling = parent->children[1];
+            } else {
+        	lsibling = parent->children[i - 1];
+	    	rsibling = parent->children[i + 1];
+            }
+
+	    if (node->nr_active == btree->order - 1 && parent) {
+		// The current node has (t - 1) keys but the right sibling has > (t - 1)
+		// keys
+		if (rsibling && (rsibling->nr_active > btree->order - 1)) {
+			move_key(btree,parent,i,left);
+		} else
+		// The current node has (t - 1) keys but the left sibling has (t - 1)
+		// keys
+		if (lsibling && (lsibling->nr_active > btree->order - 1)) {
+			move_key(btree,parent,i,right);
+		} else
+		// Left sibling has (t - 1) keys
+	        if(lsibling && (lsibling->nr_active == btree->order - 1)) {
+		        node = merge_siblings(btree,parent,i,left);
+		} else
+                // Right sibling has (t - 1) keys
+	        if(rsibling && (rsibling->nr_active == btree->order - 1)) {
+		        node = merge_siblings(btree,parent,i,right);
+		}
+	    }
+        }            
+	
+	//Case 1 : The node containing the key is found and is the leaf node.
+	//Also the leaf node has keys greater than the minimum required.
+	//Simply remove the key
+	if(node->leaf && (node->nr_active > btree->order - 1)) {
+		node_pos.node = node;
+		node_pos.index = index;
+		return remove_key_from_leaf(btree,&node_pos);
+	}
+
+	//If the leaf node is the root permit deletion even if the number of keys is
+	//less than (t - 1)
+	if(node->leaf && (node == btree->root)) {
+		node_pos.node = node;
+		node_pos.index = index;
+		return remove_key_from_leaf(btree,&node_pos);
+	}
+
+
+	//Case 2: The node containing the key is found and is an internal node
+	if(node->leaf == false) {
+		if(node->children[index]->nr_active > btree->order - 1 ) {
+			to_remove_kv = node->key_vals[index];
+
+			sub_node_pos = 
+				get_max_key_pos(btree,node->children[index]);
+                        key_val = 
+				sub_node_pos.node->key_vals[sub_node_pos.index];
+        		node->key_vals[index] = key_val;	
+                        transfer_tags(node,index,sub_node_pos.node,
+                                sub_node_pos.index);
+
+			sub_node_pos.node->key_vals[sub_node_pos.index] = 
+				to_remove_kv;
+                        node = node->children[index];
+                        goto del_loop;
+               
+		} else if ((node->children[index + 1]->nr_active > btree->order - 1) ) {
+			to_remove_kv = node->key_vals[index];
+
+			sub_node_pos = 
+                                get_min_key_pos(btree,node->children[index + 1]);
+                        key_val = sub_node_pos.node->key_vals[sub_node_pos.index];
+        		node->key_vals[index] = key_val;	
+                        transfer_tags(node,index,sub_node_pos.node,
+                                sub_node_pos.index);
+
+			sub_node_pos.node->key_vals[sub_node_pos.index] = 
+				to_remove_kv;
+
+                        node = node->children[index + 1];
+                        goto del_loop;
+              
+		} else if ( 
+			node->children[index]->nr_active == btree->order - 1 &&
+			node->children[index + 1]->nr_active == btree->order - 1) {
+
+			comb_node = merge_nodes(btree,node->children[index],
+                                node->key_vals[index],
+				node->children[index + 1]);
+			node->children[index] = comb_node;
+			
+			for(i=index + 1;i<node->nr_active;i++) {
+				node->children[i] = node->children[i + 1];
+				node->key_vals[i - 1] = node->key_vals[i];
+                                transfer_tags(node,i,node,i - 1);
+			}
+			node->nr_active--;
+                        if (node->nr_active == 0 && btree->root == node) {
+                                free_btree_node(node);
+                                btree->root = comb_node;        
+                        } 
+                        node = comb_node;
+                        goto del_loop;
+		}		
+          }
+	
+	// Case 3:
+	// In this case start from the top of the tree and continue
+	// moving to the leaf node making sure that each node that
+	// we encounter on the way has atleast 't' (order of the tree)
+	// keys 
+	if(node->leaf && (node->nr_active > btree->order - 1)) {
+	      node_pos.node = node;
+	      node_pos.index = index;
+	      return remove_key_from_leaf(btree,&node_pos);
+	}
+
+        return removed_kv;
+}
+
+/**
+*	Function used to get the node containing the given key
+*	@param btree The btree to be searched
+*	@param key The the key to be searched
+*	@return The node and position of the key within the node 
+*/
+node_pos  get_btree_node(btree * btree,unsigned long key) {
+	node_pos kp;
+	unsigned long key_val = key;	
+	btree_node * node;
+	unsigned int i = 0;
+
+	node = btree->root;
+	kp.node = NULL;
+	kp.index = 0;
+
+	// Empty tree
+	if (node == NULL) {
+		return kp;
+	}	
+	
+	for (;;i = 0) {	
+
+	    // Fix the index of the key greater than or equal
+	    // to the key that we would like to search
+	    i = search_key(node,key_val);		
+	    
+	    // If we find such key return the key-value pair		    
+	    if(i < node->nr_active && 
+			key_val == node->key_vals[i].key) {
+		    kp.node = node;
+		    kp.index = i;	
+		    return kp;
+	    }
+	
+	    // If the node is leaf and if we did not find the key 
+	    // return NULL
+	    if(node->leaf) {
+		    return kp;
+	    }
+
+	    // To got a child node 
+	    node = node->children[i];
+	}
+      	return kp;
+
+}
+
+/**
+*       Used to destory btree
+*       @param btree The B-tree
+*       @return none
+*/
+void btree_destroy(btree * btree) {
+       int i = 0;
+       unsigned int current_level;
+
+       btree_node * head, * tail, * node;
+       btree_node * child, * del_node;
+
+       node = btree->root;
+       current_level = node->level;
+       head = node;
+       tail = node;
+
+       while(true) {
+               if(head == NULL) {
+                       break;
+               }
+               if (head->level < current_level) {
+                       current_level = head->level;
+               }
+
+               if(head->leaf == false) {
+                       for(i = 0 ; i < head->nr_active + 1; i++) {
+                               child = head->children[i];
+                               tail->next = child;
+                               tail = child;
+                               child->next = NULL;
+                       }
+               }
+               del_node = head;
+               head = head->next;
+               free_btree_node(del_node);
+       }
+
+}
+
+int btree_tagged(btree * btree, int tag) {
+        int i = 0;
+        btree_node * head, * tail;
+        btree_node * child;
+
+        head = btree->root;
+        tail = btree->root;
+
+        while(true) {
+                if(head == NULL) {
+                        break;
+                }
+
+                for (i = 0; i < head->nr_active; i++ ) {
+                  
+                  if (tag != -1 && tag_get(head,tag,i)) 
+                     return 1;
+                  
+                }     
+
+                if(head->leaf == false) {
+                        for(i = 0 ; i < head->nr_active + 1; i++) {
+                                child = head->children[i];
+                                tail->next = child;
+                                tail = child;
+                                child->next = NULL;
+                        }
+                }
+                head = head->next;
+        }
+        return 0;
+}
+EXPORT_SYMBOL(btree_tagged);
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct btree_node *node, int tag)
+{
+        int idx;
+        for (idx = 0; idx < BTREE_TAG_LONGS; idx++) {
+                if (node->tags[tag][idx])
+                        return 1;
+        }
+        return 0;
+}
+
+static void clear_tags(btree_node * node, int index) {
+	 int tag;
+        for (tag = 0;tag< BTREE_TAGS; tag++) {
+		tag_clear(node,tag,index);
+        }
+}
+
+static void transfer_tags(btree_node * source_node, int source_index,
+                btree_node * dest_node,int dest_index) {
+
+        int tag;
+        for (tag = 0;tag< BTREE_TAGS; tag++) {
+                if (tag_get(source_node,tag,source_index)) {
+                        tag_set(dest_node,tag,dest_index);
+                        tag_clear(source_node,tag,source_index);
+                } else {
+                        tag_clear(dest_node,tag,dest_index);
+                }
+        }
+}
+
+static unsigned int __lookup(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items, int tag) {
+
+        int i = 0, j = 0;
+        int ret = 0;
+        unsigned long key;
+        btree_node * head, * tail;
+        btree_node * child;
+
+        head = btree->root;
+        tail = btree->root;
+
+        while(true) {
+                if(head == NULL) {
+                        break;
+                }
+
+                if ((head->key_vals[head->nr_active - 1].key 
+                                >= first_index) && (ret < max_items)) {
+                   for (i = 0; i < head->nr_active; i++ ) {
+                     
+                     if (tag != -1 && !tag_get(head,tag,i)) 
+                        continue;
+                     
+                     if ((key = head->key_vals[i].key) 
+                                                >= first_index) {
+                                // Insertion sort                
+                                for (j = (ret - 1);j>=0;j--) {
+                                      if (btree->key(results[j]) > key) {
+                                                results[j + 1] =
+                                                    results[j];    
+                                      } else {
+                                                break;
+                                      }
+                                }
+                               
+                                results[++j] = head->key_vals[i].val; 
+                                ret++;
+
+                                if(ret == max_items) {
+                                        break;
+                                }
+                     }           
+                   }     
+                }
+
+                if(head->leaf == false) {
+                        for(i = 0 ; i < head->nr_active + 1; i++) {
+                                child = head->children[i];
+                                tail->next = child;
+                                tail = child;
+                                child->next = NULL;
+                        }
+                }
+                head = head->next;
+        }
+        return ret;
+}
+
+/**
+*       Perform multiple lookup on the B-Tree
+*       @param btree The btree
+*       @param results where the results will be placed
+*       @param first_index start the lookup from this tree
+*       @param max_items Place upto these many items in results
+*       @return The number of items found
+*/
+unsigned int btree_gang_lookup(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items) {
+        return __lookup(btree,results,first_index,max_items, -1);
+}
+
+
+EXPORT_SYMBOL(btree_gang_lookup);
+
+/**
+*       Perform multiple lookup on the B-Tree
+*       @param btree The btree
+*       @param results where the results will be placed
+*       @param first_index start the lookup from this tree
+*       @param max_items Place upto these many items in results
+*       @return The number of items found
+*/
+unsigned int btree_gang_lookup_tag(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items,int tag) {
+        return __lookup(btree,results,first_index,max_items, tag);
+}
+
+
+EXPORT_SYMBOL(btree_gang_lookup_tag);
+
+
+/**
+*       Function used to search a node in a B-Tree
+*       @param btree The B-tree to be searched
+*       @param key Key of the node to be search
+*       @return The key-value pair
+*/
+void * btree_lookup(btree * btree,unsigned long key) {
+
+	bt_key_val key_val;
+	node_pos kp;
+	
+	if (btree->root == NULL) {
+		return NULL;
+	}
+
+	kp = get_btree_node(btree,key);
+	key_val.val = NULL;
+
+	if(kp.node) {
+		key_val = kp.node->key_vals[kp.index];
+	} 
+	return key_val.val; 
+}
+EXPORT_SYMBOL(btree_lookup);
+
+/**
+*       Function used to get the slot in a node of a B-Tree
+*       @param btree The B-tree to be searched
+*       @param key Key of the node to be search
+*       @return The key-value pair
+*/
+bt_key_val * btree_lookup_slot(btree * btree,unsigned long key) {
+
+	bt_key_val * key_val = NULL;
+	node_pos kp;
+
+	if (btree->root == NULL) {
+		return NULL;
+	}
+
+	kp = get_btree_node(btree,key);
+
+	if(kp.node) {
+		key_val = &kp.node->key_vals[kp.index];
+	}
+	return key_val; 
+}
+EXPORT_SYMBOL(btree_lookup_slot);
+
+/**
+*       Used to set the tag for a node
+*       @param btree The btree
+*       @param key The key of the page for which the tag is to be set
+*       @param tag The tag to be set
+*       @return The btree key and val for which the tag is set  
+*/
+
+bt_key_val * btree_tag_set(btree * btree,unsigned long key, int tag) {
+	bt_key_val * key_val = NULL;
+	node_pos kp = get_btree_node(btree,key);
+
+	if(kp.node) {
+		key_val = &kp.node->key_vals[kp.index];
+                if(!tag_get(kp.node,tag,kp.index))
+                        tag_set(kp.node,tag,kp.index);
+	}
+        BUG_ON(key_val == NULL);
+	return key_val; 
+}
+EXPORT_SYMBOL(btree_tag_set);
+
+/**
+*       Used to clear the tag for a node
+*       @param btree The btree
+*       @param key The key of the page for which the tag is to be set
+*       @param tag The tag to be set
+*       @return The btree key and val for which the tag is cleared  
+*/
+
+bt_key_val * btree_tag_clear(btree * btree,unsigned long key, int tag) {
+         bt_key_val * key_val = NULL;
+	node_pos kp = get_btree_node(btree,key);
+
+	if(kp.node) {
+		key_val = &kp.node->key_vals[kp.index];
+                if(tag_get(kp.node,tag,kp.index))
+                        tag_clear(kp.node,tag,kp.index);
+	}
+        BUG_ON(key_val == NULL);
+	return key_val; 
+}
+EXPORT_SYMBOL(btree_tag_clear);
+
+/**
+*       Used to verify whether a tag is set
+*       @param btree The btree
+*       @param key The key
+*       @param tag The tag
+*       @return
+*       0: tag not present
+*       1: tag present, set
+*       -1: tag present, unset 
+*/
+int btree_tag_get(btree * btree,unsigned long key, int tag) {
+        int ret = 0;
+        bt_key_val * key_val = NULL;
+	node_pos kp = get_btree_node(btree,key);
+
+	if(kp.node) {
+		key_val = &kp.node->key_vals[kp.index];
+                ret = tag_get(kp.node,tag,kp.index);
+                return ret? 1 : -1;
+	}
+        BUG_ON(key_val == NULL);
+	return ret; 
+}
+EXPORT_SYMBOL(btree_tag_get);
+
+
+/**
+*	Get the max key in the btree
+*	@param btree The btree
+*	@return The max key 
+*/
+unsigned long btree_get_max_key(btree * btree) {
+	node_pos node_pos;
+	node_pos = get_max_key_pos(btree,btree->root);
+	return node_pos.node->key_vals[node_pos.index].key;
+}
+
+/**
+*	Get the min key in the btree
+*	@param btree The btree
+*	@return The max key 
+*/
+unsigned long btree_get_min_key(btree * btree) {
+	node_pos node_pos;
+	node_pos = get_min_key_pos(btree,btree->root);
+	return node_pos.node->key_vals[node_pos.index].key;
+}
+
+/**
+*	Used to print the keys of the btree_node
+*	@param node The node whose keys are to be printed
+*	@return none
+*/
+
+static void print_single_node(btree *btree, btree_node * node) {
+	
+	int i = 0;
+	
+	printk(" { ");	
+	while(i < node->nr_active) {
+		printk("%d(0x%x) ", node->key_vals[i].key,
+			node->key_vals[i].val);
+		i++;
+	}
+	printk("} (0x%x,%d,%d) ", node,node->nr_active,node->level);	
+}
+
+/**
+*       Function used to print the B-tree
+*       @param root Root of the B-Tree
+*       @param print_key Function used to print the key value
+*       @return none
+*/
+
+void print_subtree(btree *btree,btree_node * node) {
+	
+	int i = 0;
+	unsigned int current_level;
+
+	btree_node * head, * tail;
+
+	current_level = node->level;
+	head = node;
+	tail = node;
+	
+	printk("DEBUG: Printing subtree\n");
+	while(true) {
+		if(head == NULL) {
+			break;
+		}
+		if (head->level < current_level) {
+			current_level = head->level;
+			printk("\n");
+		}
+		print_single_node(btree,head);
+
+		if(head->leaf == false) {	
+			for(i = 0 ; i < head->nr_active + 1; i++) {
+				tail->next = head->children[i];
+				tail = head->children[i];
+				head->children[i]->next = NULL;
+			}
+		}
+		head = head->next;	
+	}
+	printk("\n");
+}
+EXPORT_SYMBOL(print_subtree);
+
diff -urN linux-2.6.16.2/lib/Makefile linux-2.6.16.2-btree/lib/Makefile
--- linux-2.6.16.2/lib/Makefile	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/lib/Makefile	2006-08-17 21:11:21.000000000 -0400
@@ -3,7 +3,7 @@
 #
 
 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
-	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
+	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o btree.o\
 	 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o
 
diff -urN linux-2.6.16.2/init/main.c linux-2.6.16.2-btree/init/main.c
--- linux-2.6.16.2/init/main.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/init/main.c	2006-08-17 21:11:21.000000000 -0400
@@ -84,6 +84,7 @@
 extern void pidmap_init(void);
 extern void prio_tree_init(void);
 extern void radix_tree_init(void);
+extern void btree_init(void);
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
@@ -530,6 +531,7 @@
 	security_init();
 	vfs_caches_init(num_physpages);
 	radix_tree_init();
+        btree_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff -urN linux-2.6.16.2/fs/buffer.c linux-2.6.16.2-btree/fs/buffer.c
--- linux-2.6.16.2/fs/buffer.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/fs/buffer.c	2006-08-17 21:11:21.000000000 -0400
@@ -859,9 +859,16 @@
 		if (page->mapping) {	/* Race with truncate? */
 			if (mapping_cap_account_dirty(mapping))
 				inc_page_state(nr_dirty);
-			radix_tree_tag_set(&mapping->page_tree,
+			btree_tag_set(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+
+			if (btree_tag_get(&mapping->btree,
+			  page_index(page),PAGECACHE_TAG_DIRTY) != 1) {
+				panic("__set_page_dirty_buffers" 
+					" Problem setting the tag\n");
+			}				
+
 		}
 		write_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff -urN linux-2.6.16.2/fs/inode.c linux-2.6.16.2-btree/fs/inode.c
--- linux-2.6.16.2/fs/inode.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/fs/inode.c	2006-08-17 21:11:21.000000000 -0400
@@ -16,6 +16,7 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/hash.h>
+#include <linux/btree.h>
 #include <linux/swap.h>
 #include <linux/security.h>
 #include <linux/pagemap.h>
@@ -196,6 +197,7 @@
 	mutex_init(&inode->i_mutex);
 	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+        INIT_BTREE(&inode->i_data.btree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
diff -urN linux-2.6.16.2/include/linux/btree.h linux-2.6.16.2-btree/include/linux/btree.h
--- linux-2.6.16.2/include/linux/btree.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.16.2-btree/include/linux/btree.h	2006-08-17 21:11:21.000000000 -0400
@@ -0,0 +1,89 @@
+/*
+ *  btree.h
+ *
+ *  Copyright (C) 2006 Vishal Patil (vishpat AT gmail DOT com) 
+ *
+*/
+
+#ifndef _BTREE_H_
+#define _BTREE_H_
+
+#include <linux/sched.h>
+#include <linux/preempt.h>
+#include <linux/types.h>
+
+#define BTREE_ORDER 8 
+#define BTREE_TAGS 2
+#define BTREE_TAG_LONGS    \
+        ((BTREE_ORDER + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#define BTREE_MAX_PATH 8
+
+
+typedef enum  {false,true} bool;
+typedef struct btree_node btree_node;
+
+typedef struct {
+        unsigned long key;
+	void * val;
+} bt_key_val;
+
+
+typedef struct btree {
+	unsigned int order;			// B-Tree order
+	btree_node * root;		        // Root of the B-Tree
+       	unsigned long (*value)(unsigned long key);	// Generate uint value for the key
+        unsigned long (*key)(void * data);      // Get the value of the key from
+                                                // from the data
+	void (*print_key)(unsigned long key);		// Print the key
+        gfp_t                   gfp_mask;
+} btree; 
+
+extern int btree_insert(btree * btree, unsigned long key, void * val);
+extern bt_key_val btree_delete(btree * btree,btree_node * subtree,
+                                unsigned long key);
+extern void * btree_lookup(btree * btree,  unsigned long key);
+extern unsigned int btree_gang_lookup(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items);
+extern unsigned int btree_gang_lookup_tag(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items,int tag);
+extern bt_key_val * btree_lookup_slot(btree * btree, unsigned long key);
+extern bt_key_val * btree_tag_set(btree * btree,unsigned long key, int tag);
+extern int btree_tag_get(btree * btree,unsigned long key, int tag);
+extern bt_key_val * btree_tag_clear(btree * btree,unsigned long key, int tag);
+extern void btree_destroy(btree * btree);
+extern unsigned long btree_get_max_key(btree * btree);
+extern unsigned long btree_get_min_key(btree * btree);
+extern unsigned long btree_value_fn(unsigned long key);
+extern unsigned long btree_key_fn(void * page);
+extern void btree_print_fn(unsigned long key);
+extern int btree_preload(gfp_t gfp_mask);
+extern int btree_tagged(btree * btree, int tag);
+
+static inline void btree_preload_end(void)
+{
+        preempt_enable();
+}
+
+#define BTREE_INIT(mask) { \
+        .order          = BTREE_ORDER,                            \
+        .root           = NULL,                                   \
+        .value          = btree_value_fn,                         \
+        .key            = btree_key_fn,                           \
+        .print_key      = btree_print_fn,                         \
+        .gfp_mask       = (mask)                                  \
+}
+
+#define INIT_BTREE(btree,mask)                            \
+do {                                                      \
+        (btree)->order = BTREE_ORDER;                     \
+        (btree)->root = NULL;                             \
+        (btree)->value            = btree_value_fn;       \
+        (btree)->key              = btree_key_fn;         \
+        (btree)->print_key        = btree_print_fn;       \
+        (btree)->gfp_mask         = (mask);               \
+} while (0)
+
+
+extern void print_subtree(btree * btree,btree_node * node);
+
+#endif
diff -urN linux-2.6.16.2/include/linux/fs.h linux-2.6.16.2-btree/include/linux/fs.h
--- linux-2.6.16.2/include/linux/fs.h	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/include/linux/fs.h	2006-08-17 21:11:21.000000000 -0400
@@ -214,6 +214,7 @@
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/radix-tree.h>
+#include <linux/btree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
 #include <linux/sched.h>
@@ -372,6 +373,7 @@
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
+        struct btree            btree;          /* BTree of all pages*/
 	rwlock_t		tree_lock;	/* and rwlock protecting it */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */

[-- Attachment #3: btree.txt --]
[-- Type: text/plain, Size: 1894 bytes --]

                    File  Blk   Num                   Avg      Maximum      Lat%     Lat%    CPU
Identifier          Size  Size  Thr   Rate  (CPU%)  Latency    Latency      >2s      >10s    Eff
------------------- ------ ----- ---  ------ ------ --------- -----------  -------- -------- -----
Sequential Reads
2.6.16.2-btree      1004  4096    1   23.89 5.196%     0.163    298.47   0.00000  0.00000   460
2.6.16.2-btree      1004  4096    2   41.37 22.74%     0.186    137.93   0.00000  0.00000   182
2.6.16.2-btree      1004  4096    4   39.82 29.40%     0.356    336.10   0.00000  0.00000   135
2.6.16.2-btree      1004  4096    8   36.22 45.19%     0.767    943.55   0.00000  0.00000    80

Random Reads
2.6.16.2-btree      1004  4096    1    0.51 0.679%     7.646     40.24   0.00000  0.00000    75
2.6.16.2-btree      1004  4096    2    0.43 0.475%    18.071    111.75   0.00000  0.00000    91
2.6.16.2-btree      1004  4096    4    0.46 1.239%    31.134    226.30   0.00000  0.00000    37
2.6.16.2-btree      1004  4096    8    0.45 2.915%    60.470    378.33   0.00000  0.00000    15

Sequential Writes
2.6.16.2-btree      1004  4096    1   16.14 65.46%     0.239   2163.35   0.00039  0.00000    25
2.6.16.2-btree      1004  4096    2   17.24 75.47%     0.430   7099.42   0.00195  0.00000    23
2.6.16.2-btree      1004  4096    4   15.64 66.73%     0.857   9904.39   0.00856  0.00000    23
2.6.16.2-btree      1004  4096    8   13.73 80.91%     1.858  13682.20   0.03398  0.00078    17

Random Writes
2.6.16.2-btree      1004  4096    1    7.66 35.28%     0.500    361.49   0.00000  0.00000    22
2.6.16.2-btree      1004  4096    2    0.86 1.654%     0.668    125.78   0.00000  0.00000    52
2.6.16.2-btree      1004  4096    4    0.96 2.904%     0.620    113.66   0.00000  0.00000    33
2.6.16.2-btree      1004  4096    8    1.04 4.632%     0.461    175.49   0.00000  0.00000    22

[-- Attachment #4: radixtree.txt --]
[-- Type: text/plain, Size: 2083 bytes --]

                              File  Blk   Num                   Avg      Maximum      Lat%     Lat%    CPU
Identifier                    Size  Size  Thr   Rate  (CPU%)  Latency    Latency      >2s      >10s    Eff
---------------------------- ------ ----- ---  ------ ------ --------- -----------  -------- -------- -----
Sequential Reads
2.6.15-1.2054_FC5smp          1004  4096    1   54.31 9.370%     0.071     29.98   0.00000  0.00000   580
2.6.15-1.2054_FC5smp          1004  4096    2   39.66 11.51%     0.195    158.49   0.00000  0.00000   344
2.6.15-1.2054_FC5smp          1004  4096    4   42.39 27.79%     0.360    371.22   0.00000  0.00000   152
2.6.15-1.2054_FC5smp          1004  4096    8   36.02 37.36%     0.750    870.62   0.00000  0.00000    96

Random Reads
2.6.15-1.2054_FC5smp          1004  4096    1    0.46 0.516%     8.523     39.20   0.00000  0.00000    89
2.6.15-1.2054_FC5smp          1004  4096    2    0.41 0.500%    18.946    136.92   0.00000  0.00000    81
2.6.15-1.2054_FC5smp          1004  4096    4    0.50 1.235%    30.916    226.78   0.00000  0.00000    40
2.6.15-1.2054_FC5smp          1004  4096    8    0.46 2.755%    61.054    590.53   0.00000  0.00000    17

Sequential Writes
2.6.15-1.2054_FC5smp          1004  4096    1   47.80 24.17%     0.071   1608.05   0.00000  0.00000   198
2.6.15-1.2054_FC5smp          1004  4096    2   45.82 51.59%     0.135   2742.27   0.00078  0.00000    89
2.6.15-1.2054_FC5smp          1004  4096    4   44.34 99.81%     0.291   3190.65   0.00272  0.00000    44
2.6.15-1.2054_FC5smp          1004  4096    8   42.57 181.1%     0.455   4463.81   0.00234  0.00000    24

Random Writes
2.6.15-1.2054_FC5smp          1004  4096    1    1.46 0.561%     0.086     29.08   0.00000  0.00000   260
2.6.15-1.2054_FC5smp          1004  4096    2    1.44 1.547%     0.072     42.84   0.00000  0.00000    93
2.6.15-1.2054_FC5smp          1004  4096    4    1.47 2.707%     0.015      0.55   0.00000  0.00000    54
2.6.15-1.2054_FC5smp          1004  4096    8    1.43 7.313%     0.015      0.60   0.00000  0.00000    20

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Page cache using B-trees benchmark results
  2006-08-18  1:43 Page cache using B-trees benchmark results Vishal Patil
@ 2006-08-18 13:52 ` Bill Davidsen
  2006-08-18 16:25 ` Andi Kleen
  1 sibling, 0 replies; 4+ messages in thread
From: Bill Davidsen @ 2006-08-18 13:52 UTC (permalink / raw)
  To: linux-kernel; +Cc: Andrea Arcangeli

Vishal Patil wrote:
> Folks
> 
> I am attaching the benchmark results for Page Cache Implementation
> using B-trees. I basically ran the tio (threaded i/o) benchmark
> against my kernel (with the B-tree implementation) and the Linux
> kernel shipped with FC5. Radix tree implementation is definately
> better however the B-tree implementation did not suck that bad :)
> 
> Also I attaching a new patch which was used for measuring the
> benchmarks. Also henceforth changes to the page will be tracked using
> the projected hosted at http://code.google.com/p/btreepc
> 
Thanks for this. I guess a purist would say that you need to run against 
the base kernel and base kernel plus your patches, but these numbers are 
certainly enough to support your conclusion.

What's next?

-- 
Bill Davidsen <davidsen@tmr.com>
   Obscure bug of 2004: BASH BUFFER OVERFLOW - if bash is being run by a
normal user and is setuid root, with the "vi" line edit mode selected,
and the character set is "big5," an off-by-one errors occurs during
wildcard (glob) expansion.


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Page cache using B-trees benchmark results
  2006-08-18  1:43 Page cache using B-trees benchmark results Vishal Patil
  2006-08-18 13:52 ` Bill Davidsen
@ 2006-08-18 16:25 ` Andi Kleen
  2006-08-18 16:37   ` Vishal Patil
  1 sibling, 1 reply; 4+ messages in thread
From: Andi Kleen @ 2006-08-18 16:25 UTC (permalink / raw)
  To: Vishal Patil; +Cc: Andrea Arcangeli, linux-kernel

"Vishal Patil" <vishpat@gmail.com> writes:

> I am attaching the benchmark results for Page Cache Implementation
> using B-trees. I basically ran the tio (threaded i/o) benchmark
> against my kernel (with the B-tree implementation) and the Linux

I suppose you'll need some more varied benchmarks to get
more solid data.

> kernel shipped with FC5. Radix tree implementation is definately
> better however the B-tree implementation did not suck that bad :)

Have you considered trying it again instead of radix tree with
another data structure? There are still plenty of other big
hash tables in the kernel that might benefit from trying
a different approach:

> dmesg | grep -i hash
PID hash table entries: 4096 (order: 12, 131072 bytes)
Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes)
Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes)
Mount-cache hash table entries: 256
Dquot-cache hash table entries: 512 (order 0, 4096 bytes)
IP route cache hash table entries: 65536 (order: 7, 524288 bytes)
TCP established hash table entries: 262144 (order: 9, 2097152 bytes)
TCP bind hash table entries: 65536 (order: 7, 524288 bytes)
TCP: Hash tables configured (established 262144 bind 65536)

e.g. the dentry/inode hashes are an obvious attack point.

Of course you'll need benchmarks that actually stress them.

-Andi

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: Page cache using B-trees benchmark results
  2006-08-18 16:25 ` Andi Kleen
@ 2006-08-18 16:37   ` Vishal Patil
  0 siblings, 0 replies; 4+ messages in thread
From: Vishal Patil @ 2006-08-18 16:37 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andrea Arcangeli, linux-kernel

Hey Andi....This is useful information....I will look into it and let
you know....
Many thanks.

- Vishal

On 18 Aug 2006 18:25:54 +0200, Andi Kleen <ak@suse.de> wrote:
> "Vishal Patil" <vishpat@gmail.com> writes:
>
> > I am attaching the benchmark results for Page Cache Implementation
> > using B-trees. I basically ran the tio (threaded i/o) benchmark
> > against my kernel (with the B-tree implementation) and the Linux
>
> I suppose you'll need some more varied benchmarks to get
> more solid data.
>
> > kernel shipped with FC5. Radix tree implementation is definately
> > better however the B-tree implementation did not suck that bad :)
>
> Have you considered trying it again instead of radix tree with
> another data structure? There are still plenty of other big
> hash tables in the kernel that might benefit from trying
> a different approach:
>
> > dmesg | grep -i hash
> PID hash table entries: 4096 (order: 12, 131072 bytes)
> Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes)
> Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes)
> Mount-cache hash table entries: 256
> Dquot-cache hash table entries: 512 (order 0, 4096 bytes)
> IP route cache hash table entries: 65536 (order: 7, 524288 bytes)
> TCP established hash table entries: 262144 (order: 9, 2097152 bytes)
> TCP bind hash table entries: 65536 (order: 7, 524288 bytes)
> TCP: Hash tables configured (established 262144 bind 65536)
>
> e.g. the dentry/inode hashes are an obvious attack point.
>
> Of course you'll need benchmarks that actually stress them.
>
> -Andi
>


-- 
Motivation will almost always beat mere talent.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2006-08-18 16:37 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-08-18  1:43 Page cache using B-trees benchmark results Vishal Patil
2006-08-18 13:52 ` Bill Davidsen
2006-08-18 16:25 ` Andi Kleen
2006-08-18 16:37   ` Vishal Patil

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox