public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* Generic B-tree implementation
@ 2006-07-18  2:02 Vishal Patil
  2006-07-18  2:58 ` Horst von Brand
  2006-07-18  4:27 ` Gary Funck
  0 siblings, 2 replies; 14+ messages in thread
From: Vishal Patil @ 2006-07-18  2:02 UTC (permalink / raw)
  To: Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 661 bytes --]

Hello folks

I am attaching source files containing a very generic implementation
of B-trees in C. The implementation corresponds to in memory B-Tree
data structure. The B-tree library consists of two files, btree.h and
btree.c. I am also attaching a sample program main.c which should
hopefully make the use of the library clear.

I would be happy to receive inputs from the community for changes
(enchancements) to the library. All the more I would like to help
someone with a project which would take advantage of the B-Tree
implementation.

- Vishal

PS: This code is being released under GPL version 2.

-- 
Motivation will almost always beat mere talent.

[-- Attachment #2: btree.h --]
[-- Type: application/octet-stream, Size: 1557 bytes --]

#ifndef _BTREE_H_
#define _BTREE_H_

// Platform dependent headers
#include <stdlib.h>
#include <stdio.h>
#include <strings.h>

#define mem_alloc malloc
#define mem_free free
#define bcopy bcopy
#define print printf

typedef enum  {false,true} bool;

typedef struct {
        void * key;
        void * val;
} bt_key_val;

typedef struct bt_node {
	struct bt_node * next;		// Pointer used for linked list 
	bool leaf;			// Used to indicate whether leaf or not
        unsigned int nr_active;		// Number of active keys
	unsigned int level;		// Level in the B-Tree
        bt_key_val ** key_vals; 	// Array of keys and values
        struct bt_node ** children;	// Array of pointers to child nodes
}bt_node;

typedef struct {
	unsigned int order;			// B-Tree order
	bt_node * root;				// Root of the B-Tree
	unsigned int (*value)(void * key);	// Generate uint value for the key
        unsigned int (*key_size)(void * key);    // Return the key size
        unsigned int (*data_size)(void * data);  // Return the data size
	void (*print_key)(void * key);		// Print the key
}btree; 

extern btree * btree_create(unsigned int order);
extern int btree_insert_key(btree * btree, bt_key_val * key_val);
extern int btree_delete_key(btree * btree,bt_node * subtree ,void * key);
extern bt_key_val * btree_search(btree * btree,  void * key);
extern void btree_destroy(btree * btree);
extern void * btree_get_max_key(btree * btree);
extern void * btree_get_min_key(btree * btree);

#ifdef DEBUG
extern void print_subtree(btree * btree,bt_node * node);
#endif


#endif

[-- Attachment #3: btree.c --]
[-- Type: application/octet-stream, Size: 23027 bytes --]

#include "btree.h"

typedef enum {left = -1,right = 1} position_t;

typedef struct {
	bt_node * node;
	unsigned int index;
}node_pos;

static void print_single_node(btree *btree, bt_node * node);
static bt_node * allocate_btree_node (unsigned int order);
static int free_btree_node (bt_node * node);

static node_pos get_btree_node(btree * btree,void * key);

static int delete_key_from_node(btree * btree, node_pos * node_pos);
static bt_node * merge_nodes(btree * btree, bt_node * n1, bt_key_val * kv ,bt_node * n2);
static void move_key(btree * btree, bt_node * node, unsigned int index, position_t pos);
static node_pos get_max_key_pos(btree * btree, bt_node * subtree);
static node_pos get_min_key_pos(btree * btree, bt_node * subtree);
static bt_node * merge_siblings(btree * btree, bt_node * parent,unsigned int index,
					position_t pos);
static void copy_key_val(btree * btree,bt_key_val * src, bt_key_val * dst);

/**
*	Used to create a btree with just the root node
*	@param order The order of the B-tree
*	@return The an empty B-tree
*/
btree * btree_create(unsigned int order) {
	btree * btree;
	btree = mem_alloc(sizeof(*btree));
	btree->order = order;
	btree->root = allocate_btree_node(order);
	btree->root->leaf = true;
	btree->root->nr_active = 0;
	btree->root->next = NULL;
	btree->root->level = 0;
	return btree;
}

/**
*       Function used to allocate memory for the btree node
*       @param order Order of the B-Tree
*	@param leaf boolean set true for a leaf node
*       @return The allocated B-tree node
*/
static bt_node * allocate_btree_node (unsigned int order) {
        bt_node * node;

        // Allocate memory for the node
        node = (bt_node *)mem_alloc(sizeof(bt_node));
       
        // Initialize the number of active nodes
        node->nr_active = 0;

        // Initialize the keys
        node->key_vals = (bt_key_val **)mem_alloc(2*order*sizeof(bt_key_val*) - 1);
        
        // Initialize the child pointers
        node->children = (bt_node **)mem_alloc(2*order*sizeof(bt_node*));

	// Use to determine whether it is a leaf
	node->leaf = true;

	// Use to determine the level in the tree
	node->level = 0;

	//Initialize the linked list pointer to NULL
	node->next = NULL;

        return node;
}

/**
*       Function used to free the memory allocated to the b-tree 
*       @param node The node to be freed
*       @param order Order of the B-Tree
*       @return The allocated B-tree node
*/
static int free_btree_node (bt_node * node) {

        mem_free(node->children);
        mem_free(node->key_vals);
        mem_free(node);

        return 0;
}

/**
*	Used to split the child node and adjust the parent so that
*	it has two children
*	@param parent Parent Node
*	@param index Index of the child node
*	@param child  Full child node
*	
*/
static void btree_split_child(btree * btree, bt_node * parent, 
				unsigned int index,
				bt_node * child) {
	int i = 0;	
	unsigned int order = btree->order;

	bt_node * new_child = allocate_btree_node(btree->order); 
	new_child->leaf = child->leaf;
	new_child->level = child->level;
	new_child->nr_active = btree->order - 1;

	// Copy the higher order keys to the new child	
	for(i=0;i<order - 1;i++) {
		new_child->key_vals[i] = child->key_vals[i + order];
		if(!child->leaf) {
			new_child->children[i] = 
				child->children[i + order];
		} 
	}
	
	// Copy the last child pointer
	if(!child->leaf) {
		new_child->children[i] = 
		child->children[i + order];
	}

	child->nr_active = order - 1;
	
	for(i = parent->nr_active + 1;i > index + 1;i--) {
		parent->children[i] = parent->children[i - 1];
	}
	parent->children[index + 1] = new_child;

	for(i = parent->nr_active;i > index;i--) {
		parent->key_vals[i] = parent->key_vals[i - 1];
	}

	parent->key_vals[index] = child->key_vals[order - 1];	
	parent->nr_active++;
}

/**
*	Used to insert a key in the non-full node
*	@param btree The btree
*	@param node The node to which the key will be added
*	@param the key value pair
*	@return void
*/

static void btree_insert_nonfull (btree * btree, bt_node * parent_node,
				bt_key_val * key_val) {
	
	unsigned int key = btree->value(key_val->key);
	int i ;
	bt_node * child;	
	bt_node * node = parent_node;

insert:	i = node->nr_active - 1;
	if(node->leaf) {
		while(i >= 0 && key < btree->value(node->key_vals[i]->key)) {
			node->key_vals[i + 1] = node->key_vals[i];
			i--;
		}
		node->key_vals[i + 1] = key_val;
		node->nr_active++; 
	} else {
		while (i >= 0 && key < btree->value(node->key_vals[i]->key)) {
			i--;
		}
		i++;
		child = node->children[i]; 
		
		if(child->nr_active == 2*btree->order - 1) {
			btree_split_child(btree,node,i,child);	
			if(btree->value(key_val->key) > 
				btree->value(node->key_vals[i]->key)) {
				i++;	
			}	
		}
		node = node->children[i];
		goto insert;	
	} 
}


/**
*       Function used to insert node into a B-Tree
*       @param root Root of the B-Tree
*       @param node The node to be inserted
*       @param compare Function used to compare the two nodes of the tree
*       @return success or failure
*/
int btree_insert_key(btree * btree, bt_key_val * key_val) {
	bt_node * rnode;

	rnode = btree->root;
	if(rnode->nr_active == (2*btree->order - 1)) {
		bt_node * new_root;
		new_root = allocate_btree_node(btree->order);
		new_root->level = btree->root->level + 1;
		btree->root = new_root;	
		new_root->leaf = false;
		new_root->nr_active = 0;
		new_root->children[0]  = rnode;
		btree_split_child(btree,new_root,0,rnode);
		btree_insert_nonfull(btree,new_root,key_val);	
	} else
		btree_insert_nonfull(btree,rnode,key_val);		

        return 0;
}

/**
*	Used to get the position of the MAX key within the subtree
*	@param btree The btree
*	@param subtree The subtree to be searched
*	@return The node containing the key and position of the key
*/
static node_pos get_max_key_pos(btree * btree, bt_node * subtree) {
	node_pos node_pos;
	bt_node * node = subtree; 

	while(true) {
		if(node == NULL) {
			break;
		}

		if(node->leaf) {
			node_pos.node 	= node;
			node_pos.index 	= node->nr_active - 1;
			return node_pos;	
		} else {
			node_pos.node 	= node;
			node_pos.index 	= node->nr_active - 1;
			node = node->children[node->nr_active];	
		}
	}
	return node_pos;
}

/**
*	Used to get the position of the MAX key within the subtree
*	@param btree The btree
*	@param subtree The subtree to be searched
*	@return The node containing the key and position of the key
*/
static node_pos get_min_key_pos(btree * btree, bt_node * subtree) {
	node_pos node_pos;
	bt_node * node = subtree; 

	while(true) {
		if(node == NULL) {
			break;
		}

		if(node->leaf) {
			node_pos.node 	= node;
			node_pos.index 	= 0;
			return node_pos;	
		} else {
			node_pos.node 	= node;
			node_pos.index 	= 0;
			node = node->children[0];	
		}
	}
	return node_pos;
}

/**
*	Merge nodes n1 and n2 (case 3b from Cormen)
*	@param btree The btree 
*	@param node The parent node 
*	@param index of the child
*	@param pos left or right
*	@return none 
*/
static bt_node * merge_siblings(btree * btree, bt_node * parent, unsigned int index , 
					position_t pos) {
	unsigned int i,j;
	bt_node * new_node;
	bt_node * n1, * n2;
        
        if (index == (parent->nr_active)) {   
               index--;
	       n1 = parent->children[parent->nr_active - 1];
	       n2 = parent->children[parent->nr_active];
        } else {
               n1 = parent->children[index];
	       n2 = parent->children[index + 1];
        }

	//Merge the current node with the left node
	new_node = allocate_btree_node(btree->order);
	new_node->level = n1->level;
	new_node->leaf = n1->leaf;

	for(j=0;j<btree->order - 1; j++) {
		new_node->key_vals[j] =	n1->key_vals[j];
		new_node->children[j] =	n1->children[j];
	}
	
	new_node->key_vals[btree->order - 1] =	parent->key_vals[index];
	new_node->children[btree->order - 1] =	n1->children[btree->order - 1];

	for(j=0;j<btree->order - 1; j++) {
		new_node->key_vals[j + btree->order] = 	n2->key_vals[j];
		new_node->children[j + btree->order] = 	n2->children[j];
	}
	new_node->children[2*btree->order - 1] = n2->children[btree->order - 1];

	parent->children[index] = new_node;

	for(j = index;j<parent->nr_active;j++) {
		parent->key_vals[j] = parent->key_vals[j + 1];
		parent->children[j + 1] = parent->children[j + 2];
	}

	new_node->nr_active = n1->nr_active + n2->nr_active + 1;
	parent->nr_active--;

	for(i=parent->nr_active;i < 2*btree->order - 1; i++) {
		parent->key_vals[i] = NULL; 
	}

	free_btree_node(n1);
	free_btree_node(n2);

	if (parent->nr_active == 0 && btree->root == parent) {
		free_btree_node(parent);
		btree->root = new_node;
		if(new_node->level)
			new_node->leaf = false;
		else
			new_node->leaf = true; 
	}

	return new_node;
}

/**
*	Move the key from node to another	
*	@param btree The B-Tree
*	@param node The parent node
*	@param index of the key to be moved done 
*	@param pos the position of the child to receive the key 
*	@return none
*/
static void move_key(btree * btree, bt_node * node, unsigned int index, position_t pos) {
	bt_node * lchild;
	bt_node * rchild;
	unsigned int i;

	if(pos == right) {
		index--;
	}
	lchild = node->children[index];
	rchild = node->children[index + 1];
	
	// Move the key from the parent to the left child
	if(pos == left) {
		lchild->key_vals[lchild->nr_active] = node->key_vals[index];
		lchild->children[lchild->nr_active + 1] = rchild->children[0];
		rchild->children[0] = NULL;
		lchild->nr_active++;

		node->key_vals[index] = rchild->key_vals[0];
		rchild->key_vals[0] = NULL;

		for(i=0;i<rchild->nr_active - 1;i++) {
			rchild->key_vals[i] = rchild->key_vals[i + 1];
			rchild->children[i] = rchild->children[i + 1];
		}
		rchild->children[rchild->nr_active - 1] = 
				rchild->children[rchild->nr_active];
		rchild->nr_active--;
	} else {
		// Move the key from the parent to the right child
		for(i=rchild->nr_active;i > 0 ; i--) {
			rchild->key_vals[i] = rchild->key_vals[i - 1];
			rchild->children[i + 1] = rchild->children[i];		
		}
		rchild->children[1] = rchild->children[0];		
		rchild->children[0] = NULL;

		rchild->key_vals[0] = node->key_vals[index];

		rchild->children[0] = lchild->children[lchild->nr_active];	
		lchild->children[lchild->nr_active] = NULL;

		node->key_vals[index] = lchild->key_vals[lchild->nr_active - 1];
		lchild->key_vals[lchild->nr_active - 1] = NULL;
			
		lchild->nr_active--;
		rchild->nr_active++;		
	}				
}

/**
*	Merge nodes n1 and n2
*	@param n1 First node
*	@param n2 Second node
*	@return combined node
*/
static bt_node * merge_nodes(btree * btree, bt_node * n1, bt_key_val * kv,
                                                bt_node * n2) {
	bt_node * new_node;
	unsigned int i;	

	new_node = allocate_btree_node(btree->order);
	new_node->leaf = true;
	
	for(i=0;i<n1->nr_active;i++) {
		new_node->key_vals[i]   = n1->key_vals[i];	
                new_node->children[i]   = n1->children[i];
	}
        new_node->children[n1->nr_active] = n1->children[n1->nr_active];
        new_node->key_vals[n1->nr_active] = kv;

	for(i=0;i<n2->nr_active;i++) {
		new_node->key_vals[i + n1->nr_active + 1] = n2->key_vals[i];
                new_node->children[i + n1->nr_active + 1] = n2->children[i];
	}
        new_node->children[2*btree->order - 1] = n2->children[n2->nr_active];
	
        new_node->nr_active = n1->nr_active + n2->nr_active + 1;
        new_node->leaf = n1->leaf;
        new_node->level = n1->level;

	free_btree_node(n1);
	free_btree_node(n2);

	return new_node;
}

/**
*	Used to delete a key from the B-tree node
*	@param btree The btree
*	@param node The node from which the key is to be deleted 	
*	@param key The key to be deleted
*	@return 0 on success -1 on error 
*/

int delete_key_from_node(btree * btree, node_pos * node_pos) {
	unsigned int keys_max = 2*btree->order - 1;
	unsigned int i;
	bt_key_val * key_val;
	bt_node * node = node_pos->node;

	if(node->leaf == false) {
		return -1;
	}	
	
	key_val = node->key_vals[node_pos->index];

	for(i=node_pos->index;i< keys_max - 1;i++) {
		node->key_vals[i] = node->key_vals[i + 1];	
	}
	
	if(key_val->key) {
		mem_free(key_val->key);
                key_val->key = NULL;
	}

	if(key_val->val) {
		mem_free(key_val->val);
                key_val->val = NULL;
	}
	
	node->nr_active--;

	if(node->nr_active == 0 ) {
		free_btree_node(node);
	}
	return 0;
}

/**
*       Function used to delete a node from a  B-Tree
*       @param btree The B-Tree
*       @param key Key of the node to be deleted
*       @param value function to map the key to an unique integer value        
*       @param compare Function used to compare the two nodes of the tree
*       @return success or failure
*/

int btree_delete_key(btree * btree,bt_node * subtree,void * key) {
	unsigned int i,index;
	bt_node * node = NULL, * rsibling, *lsibling;
	bt_node * comb_node, * parent;
	node_pos sub_node_pos;
	node_pos node_pos;
	bt_key_val * key_val, * new_key_val;
	unsigned int kv = btree->value(key);	

	node = subtree;
	parent = NULL;	

del_loop:for (i = 0;;i = 0) {
             
            //If there are no keys simply return
            if(!node->nr_active)
                return -1;

	    // Fix the index of the key greater than or equal
	    // to the key that we would like to search
		
	    while (i < node->nr_active && kv > 
			    btree->value(node->key_vals[i]->key) ) {
		    i++;
	    }
	    index = i;

	    // If we find such key break		    
	    if(i < node->nr_active && 
			kv == btree->value(node->key_vals[i]->key)) {
			break;
	    }
            if(node->leaf)
                return -1;
                
    	    //Store the parent node
	    parent = node;

	    // To get a child node 
	    node = node->children[i];

            //If NULL not found
            if (node == NULL)
                return -1;

            if (index == (parent->nr_active)) {   
                lsibling =  parent->children[parent->nr_active - 1];
	    	rsibling = NULL; 
            } else if (index == 0) {
                lsibling = NULL;
                rsibling = parent->children[1];
            } else {
        	lsibling = parent->children[i - 1];
	    	rsibling = parent->children[i + 1];
            }

	    if (node->nr_active == btree->order - 1 && parent) {
		// The current node has (t - 1) keys but the right sibling has > (t - 1)
		// keys
		if (rsibling && (rsibling->nr_active > btree->order - 1)) {
			move_key(btree,parent,i,left);
		} else
		// The current node has (t - 1) keys but the left sibling has (t - 1)
		// keys
		if (lsibling && (lsibling->nr_active > btree->order - 1)) {
			move_key(btree,parent,i,right);
		} else
		// Left sibling has (t - 1) keys
	        if(lsibling && (lsibling->nr_active == btree->order - 1)) {
		        node = merge_siblings(btree,parent,i,left);
		} else
                // Right sibling has (t - 1) keys
	        if(rsibling && (rsibling->nr_active == btree->order - 1)) {
		        node = merge_siblings(btree,parent,i,right);
		}
	    }
        }            
	
	//Case 1 : The node containing the key is found and is the leaf node.
	//Also the leaf node has keys greater than the minimum required.
	//Simply remove the key
	if(node->leaf && (node->nr_active > btree->order - 1)) {
		node_pos.node = node;
		node_pos.index = index;
		delete_key_from_node(btree,&node_pos);
		return 0;
	}

	//If the leaf node is the root permit deletion even if the number of keys is
	//less than (t - 1)
	if(node->leaf && (node == btree->root)) {
		node_pos.node = node;
		node_pos.index = index;
		delete_key_from_node(btree,&node_pos);
		return 0;
	}


	//Case 2: The node containing the key is found and is an internal node
	if(node->leaf == false) {
		if(node->children[index]->nr_active > btree->order - 1 ) {
			sub_node_pos = get_max_key_pos(btree,node->children[index]);
                        key_val = sub_node_pos.node->key_vals[sub_node_pos.index];

                        new_key_val = (bt_key_val *)mem_alloc(sizeof(bt_key_val));
                        copy_key_val(btree,key_val,new_key_val);
        		node->key_vals[index] = new_key_val;	
               
                        btree_delete_key(btree,node->children[index],key_val->key);
			if(sub_node_pos.node->leaf == false) {
                                print("Not leaf\n");
                        }
		} else if ((node->children[index + 1]->nr_active > btree->order - 1) ) {
			sub_node_pos = 
                                get_min_key_pos(btree,node->children[index + 1]);
                        key_val = sub_node_pos.node->key_vals[sub_node_pos.index];

                        new_key_val = (bt_key_val *)mem_alloc(sizeof(bt_key_val));
                        copy_key_val(btree,key_val,new_key_val);
        		node->key_vals[index] = new_key_val;	
               
                        btree_delete_key(btree,node->children[index + 1],key_val->key);
			if(sub_node_pos.node->leaf == false) {
                                print("Not leaf\n");
                        }

		} else if ( 
			node->children[index]->nr_active == btree->order - 1 &&
			node->children[index + 1]->nr_active == btree->order - 1) {

			comb_node = merge_nodes(btree,node->children[index],
                                node->key_vals[index],
				node->children[index + 1]);
			node->children[index] = comb_node;
			
			for(i=index + 1;i<node->nr_active;i++) {
				node->children[i] = node->children[i + 1];
				node->key_vals[i - 1] = node->key_vals[i];
			}
			node->nr_active--;
                        if (node->nr_active == 0 && btree->root == node) {
                                free_btree_node(node);
                                btree->root = comb_node;        
                        } 
                        node = comb_node;
                        goto del_loop;
		}		
          }
	
	// Case 3:
	// In this case start from the top of the tree and continue
	// moving to the leaf node making sure that each node that
	// we encounter on the way has atleast 't' (order of the tree)
	// keys 
	if(node->leaf && (node->nr_active > btree->order - 1)) {
	      node_pos.node = node;
	      node_pos.index = index;
	      delete_key_from_node(btree,&node_pos);
	}

        return 0;
}

/**
*	Function used to get the node containing the given key
*	@param btree The btree to be searched
*	@param key The the key to be searched
*	@return The node and position of the key within the node 
*/
node_pos  get_btree_node(btree * btree,void * key) {
	node_pos kp;
	unsigned int key_val = btree->value(key);	
	bt_node * node;
	unsigned int i = 0;

	node = btree->root;

		
	for (;;i = 0) {	

	    // Fix the index of the key greater than or equal
	    // to the key that we would like to search
		
	    while (i < node->nr_active && key_val > 
			    btree->value(node->key_vals[i]->key) ) {
		    i++;
	    }

	    // If we find such key return the key-value pair		    
	    if(i < node->nr_active && 
			key_val == btree->value(node->key_vals[i]->key)) {
		    kp.node = node;
		    kp.index = i;	
		    return kp;
	    }
	
	    // If the node is leaf and if we did not find the key 
	    // return NULL
	    if(node->leaf) {
		    return kp;
	    }

	    // To got a child node 
	    node = node->children[i];
	}
      	return kp;

}

/**
*       Used to destory btree
*       @param btree The B-tree
*       @return none
*/
void btree_destroy(btree * btree) {
       int i = 0;
       unsigned int current_level;

       bt_node * head, * tail, * node;
       bt_node * child, * del_node;

       node = btree->root;
       current_level = node->level;
       head = node;
       tail = node;

       while(true) {
               if(head == NULL) {
                       break;
               }
               if (head->level < current_level) {
                       current_level = head->level;
               }

               if(head->leaf == false) {
                       for(i = 0 ; i < head->nr_active + 1; i++) {
                               child = head->children[i];
                               tail->next = child;
                               tail = child;
                               child->next = NULL;
                       }
               }
               del_node = head;
               head = head->next;
               free_btree_node(del_node);
       }

}

/**
*       Function used to search a node in a B-Tree
*       @param btree The B-tree to be searched
*       @param key Key of the node to be search
*       @return The key-value pair
*/
bt_key_val * btree_search(btree * btree,void * key) {

	bt_key_val * key_val = NULL;
	node_pos kp = get_btree_node(btree,key);

	if(kp.node) {
		key_val = kp.node->key_vals[kp.index];
	}
	return key_val; 
}

/**
*       Used to copy key value from source to destination
*       @param src The source key value
*       @param dst The dest key value
*       @return none
*/
static void copy_key_val(btree * btree, bt_key_val * src, bt_key_val * dst) {
        unsigned int keysize;
        unsigned int datasize;

        keysize    = btree->key_size(src->key);
        dst->key        = (void *)mem_alloc(keysize);
        bcopy(src->key,dst->key,keysize);
        
        if(src->val) {
                datasize   = btree->data_size(src->val);
                dst->val       = (void *)mem_alloc(datasize);
                bcopy(src->val,dst->val,datasize);
        }                
        
}

/**
*	Get the max key in the btree
*	@param btree The btree
*	@return The max key 
*/
void * btree_get_max_key(btree * btree) {
	node_pos node_pos;
	node_pos = get_max_key_pos(btree,btree->root);
	return node_pos.node->key_vals[node_pos.index]->key;
}

/**
*	Get the min key in the btree
*	@param btree The btree
*	@return The max key 
*/
void * btree_get_min_key(btree * btree) {
	node_pos node_pos;
	node_pos = get_min_key_pos(btree,btree->root);
	return node_pos.node->key_vals[node_pos.index]->key;
}

#ifdef DEBUG

/**
*	Used to print the keys of the bt_node
*	@param node The node whose keys are to be printed
*	@return none
*/

static void print_single_node(btree *btree, bt_node * node) {
	
	int i = 0;
	
	print(" { ");	
	while(i < node->nr_active) {
		print("0x%x(%d) ", btree->value(node->key_vals[i]->key),
			node->level);
		i++;
	}
	print("} (0x%x,%d) ", node,node->leaf);	
}

/**
*       Function used to print the B-tree
*       @param root Root of the B-Tree
*       @param print_key Function used to print the key value
*       @return none
*/

void print_subtree(btree *btree,bt_node * node) {
	
	int i = 0;
	unsigned int current_level;

	bt_node * head, * tail;
	bt_node * child;

	current_level = node->level;
	head = node;
	tail = node;

	while(true) {
		if(head == NULL) {
			break;
		}
		if (head->level < current_level) {
			current_level = head->level;
			print("\n");
		}
		print_single_node(btree,head);

		if(head->leaf == false) {	
			for(i = 0 ; i < head->nr_active + 1; i++) {
				child = head->children[i];
				tail->next = child;
				tail = child;
				child->next = NULL;
			}
		}
		head = head->next;	
	}
	print("\n");
}


#endif

[-- Attachment #4: main.c --]
[-- Type: application/octet-stream, Size: 1325 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "btree.h"

unsigned int value(void * key) {
	return *((int *)key);
}

unsigned int keysize(void * key) {
        return sizeof(int);
}

unsigned int datasize(void * data) {
        return sizeof(int);
}

int main(int argc,char * argv[])
{
	int i = 0;
	btree * tree;
	bt_key_val * kv;
	int item = 0x43;
	int count;
	int order;
        int * values;
	
	srandom(time(NULL));
	
	for(order=2;order<60;order++) {
	    count = random()%512;		
            values = (int *)malloc(count*sizeof(int));

            for(i = 0;i<count;i++) {
                values[i] = random()%1024;
            }
            
	    tree = btree_create(order);	
            tree->value = value;
            tree->key_size = keysize;
            tree->data_size = datasize;
	    
	    for (i=0;i<count;i++) {	
		    kv = (bt_key_val*)malloc(sizeof(*kv));
		    kv->key = malloc(sizeof(int));		
		    *(int *)kv->key = values[i];
		    kv->val = malloc(sizeof(int));
		    *(int *)kv->val = values[i];
		    btree_insert_key(tree,kv);
	    }		
    	    print_subtree(tree,tree->root);
	    
	    for (i= count - 1; i > -1; i-= (random()%5)) {	
		    item = values[i];
		    btree_delete_key(tree,tree->root,&item);
	    }
	    free(values);
            btree_destroy(tree);
	}
	return 0;
}

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-18  2:02 Generic B-tree implementation Vishal Patil
@ 2006-07-18  2:58 ` Horst von Brand
  2006-07-18  3:08   ` Vishal Patil
  2006-07-18  4:27 ` Gary Funck
  1 sibling, 1 reply; 14+ messages in thread
From: Horst von Brand @ 2006-07-18  2:58 UTC (permalink / raw)
  To: Vishal Patil; +Cc: Linux Kernel Mailing List

Vishal Patil <vishpat@gmail.com> wrote:
> I am attaching source files containing a very generic implementation
> of B-trees in C. The implementation corresponds to in memory B-Tree
> data structure. The B-tree library consists of two files, btree.h and
> btree.c. I am also attaching a sample program main.c which should
> hopefully make the use of the library clear.

B-trees are useful mainly when you can get a bunch of pointers in one
swoop, i.e., by reading nodes from disk.

> I would be happy to receive inputs from the community for changes
> (enchancements) to the library. All the more I would like to help
> someone with a project which would take advantage of the B-Tree
> implementation.

Build infrastructure (== library) without clear users won't go very far on
LKML.
-- 
Dr. Horst H. von Brand                   User #22616 counter.li.org
Departamento de Informatica                     Fono: +56 32 654431
Universidad Tecnica Federico Santa Maria              +56 32 654239
Casilla 110-V, Valparaiso, Chile                Fax:  +56 32 797513

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-18  2:58 ` Horst von Brand
@ 2006-07-18  3:08   ` Vishal Patil
  2006-07-18  3:20     ` Valdis.Kletnieks
  0 siblings, 1 reply; 14+ messages in thread
From: Vishal Patil @ 2006-07-18  3:08 UTC (permalink / raw)
  To: Horst von Brand; +Cc: Linux Kernel Mailing List

Agreed, however if I am not mistaken B-trees are useful even for
virtual memory implementation, for example HP-UX uses B-trees for
managing virtual memory pages.

Also I did not get the statement
"Build infrastructure (== library) without clear users won't go very
far on LKML"

- Vishal


On 7/17/06, Horst von Brand <vonbrand@inf.utfsm.cl> wrote:
> Vishal Patil <vishpat@gmail.com> wrote:
> > I am attaching source files containing a very generic implementation
> > of B-trees in C. The implementation corresponds to in memory B-Tree
> > data structure. The B-tree library consists of two files, btree.h and
> > btree.c. I am also attaching a sample program main.c which should
> > hopefully make the use of the library clear.
>
> B-trees are useful mainly when you can get a bunch of pointers in one
> swoop, i.e., by reading nodes from disk.
>
> > I would be happy to receive inputs from the community for changes
> > (enchancements) to the library. All the more I would like to help
> > someone with a project which would take advantage of the B-Tree
> > implementation.
>
> Build infrastructure (== library) without clear users won't go very far on
> LKML.
> --
> Dr. Horst H. von Brand                   User #22616 counter.li.org
> Departamento de Informatica                     Fono: +56 32 654431
> Universidad Tecnica Federico Santa Maria              +56 32 654239
> Casilla 110-V, Valparaiso, Chile                Fax:  +56 32 797513
>


-- 
Motivation will almost always beat mere talent.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-18  3:08   ` Vishal Patil
@ 2006-07-18  3:20     ` Valdis.Kletnieks
  0 siblings, 0 replies; 14+ messages in thread
From: Valdis.Kletnieks @ 2006-07-18  3:20 UTC (permalink / raw)
  To: Vishal Patil; +Cc: Horst von Brand, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 686 bytes --]

On Mon, 17 Jul 2006 23:08:53 EDT, Vishal Patil said:
> Agreed, however if I am not mistaken B-trees are useful even for
> virtual memory implementation, for example HP-UX uses B-trees for
> managing virtual memory pages.

OK, sounds at least somewhat plausible..

> Also I did not get the statement
> "Build infrastructure (== library) without clear users won't go very
> far on LKML"

Your patch would go a lot further if it came as 2 parts:

PATCH 1/2: Add Generic B-tree implementation
PATCH 2/2: Convert mm/foobar.c to track VM pages using B-trees.

Barring an actual patch 2/2, a *clear* explanation of why it would benefit
a *specific* piece of code so somebody else can do it...

[-- Attachment #2: Type: application/pgp-signature, Size: 226 bytes --]

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: Generic B-tree implementation
  2006-07-18  2:02 Generic B-tree implementation Vishal Patil
  2006-07-18  2:58 ` Horst von Brand
@ 2006-07-18  4:27 ` Gary Funck
  2006-07-18 13:30   ` Vishal Patil
  1 sibling, 1 reply; 14+ messages in thread
From: Gary Funck @ 2006-07-18  4:27 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: Vishal Patil


Vishal Patil wrote:
> 
> I am attaching source files containing a very generic implementation
> of B-trees in C. The implementation corresponds to in memory B-Tree
> data structure. The B-tree library consists of two files, btree.h and
> btree.c. I am also attaching a sample program main.c which should
> hopefully make the use of the library clear.

Couple of thoughts:

1. red/black b-trees have superior worst case performance as it
relates to rebalancing, and the implementation doesn't add a
lot of complexity:
http://www.nist.gov/dads/HTML/redblack.html

2. Paul Vixie's b-tree implementation has been around since the mid-80's
or so, and simply from an historical perspective is worth a look
(comp.sources.unix anyone?):
http://www.isc.org/index.pl?/sources/devel/func/avl-subs-2.php

3. GCC uses 'splay trees' to good advantage:
http://www.nist.gov/dads/HTML/splaytree.html
which have the property that most-recently referenced nodes
tend to be higher up in the tree.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-18  4:27 ` Gary Funck
@ 2006-07-18 13:30   ` Vishal Patil
  2006-07-18 15:00     ` Gary Funck
  0 siblings, 1 reply; 14+ messages in thread
From: Vishal Patil @ 2006-07-18 13:30 UTC (permalink / raw)
  To: Gary Funck; +Cc: Linux Kernel Mailing List

Gary

I said B-Tree and not binary tree, please read the explaination about
B-tree at http://en.wikipedia.org/wiki/B-tree. Also I am aware of AVL
trees.

I never claimed that my implementation is better or anything like
that. I posted the code so that someone in need of the data structure
might use it. Also I would be willing them to help with their project.

- Vishal

On 7/18/06, Gary Funck <gary@intrepid.com> wrote:
>
> Vishal Patil wrote:
> >
> > I am attaching source files containing a very generic implementation
> > of B-trees in C. The implementation corresponds to in memory B-Tree
> > data structure. The B-tree library consists of two files, btree.h and
> > btree.c. I am also attaching a sample program main.c which should
> > hopefully make the use of the library clear.
>
> Couple of thoughts:
>
> 1. red/black b-trees have superior worst case performance as it
> relates to rebalancing, and the implementation doesn't add a
> lot of complexity:
> http://www.nist.gov/dads/HTML/redblack.html
>
> 2. Paul Vixie's b-tree implementation has been around since the mid-80's
> or so, and simply from an historical perspective is worth a look
> (comp.sources.unix anyone?):
> http://www.isc.org/index.pl?/sources/devel/func/avl-subs-2.php
>
> 3. GCC uses 'splay trees' to good advantage:
> http://www.nist.gov/dads/HTML/splaytree.html
> which have the property that most-recently referenced nodes
> tend to be higher up in the tree.
>
>


-- 
Motivation will almost always beat mere talent.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: Generic B-tree implementation
  2006-07-18 13:30   ` Vishal Patil
@ 2006-07-18 15:00     ` Gary Funck
  2006-07-18 15:13       ` Bob Copeland
  2006-07-18 15:22       ` Vishal Patil
  0 siblings, 2 replies; 14+ messages in thread
From: Gary Funck @ 2006-07-18 15:00 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: Vishal Patil


Vishal Patil wrote:
> I said B-Tree and not binary tree, please read the explaination about
> B-tree at http://en.wikipedia.org/wiki/B-tree. Also I am aware of AVL
> trees.
> 
> I never claimed that my implementation is better or anything like
> that. I posted the code so that someone in need of the data structure
> might use it. Also I would be willing them to help with their project.

My reason for pointing out the other data strucutres is to note that there
might be search tree representations that are more appropriate for
implementation inside the kernel, and to perhaps encourage you to have
a look at implementing them as well.  Red-black trees in particular have
the property that they're reasonably well-balanced, and that the balancing
algorithm makes use of local information.  That means that the kernel might
be able to limit the level of locking required to update the tree.

I liked your B-tree implementation, and have saved a copy.  Too bad there
isn't the C/C++ equivalent of CPAN (comp.unix.sources is so passe`).  Your
B-tree implementation would make a nice addition to an archive of
handy C algorithm implementations.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-18 15:00     ` Gary Funck
@ 2006-07-18 15:13       ` Bob Copeland
  2006-07-18 15:22       ` Vishal Patil
  1 sibling, 0 replies; 14+ messages in thread
From: Bob Copeland @ 2006-07-18 15:13 UTC (permalink / raw)
  To: Gary Funck; +Cc: Linux Kernel Mailing List, Vishal Patil

On 7/18/06, Gary Funck <gary@intrepid.com> wrote:
> I liked your B-tree implementation, and have saved a copy.  Too bad there
> isn't the C/C++ equivalent of CPAN (comp.unix.sources is so passe`).  Your
> B-tree implementation would make a nice addition to an archive of
> handy C algorithm implementations.

FWIW C++ has http://boost.org/... and C has /usr/lib/ :)

-Bob

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-18 15:00     ` Gary Funck
  2006-07-18 15:13       ` Bob Copeland
@ 2006-07-18 15:22       ` Vishal Patil
  2006-07-19  7:33         ` Anton Altaparmakov
  1 sibling, 1 reply; 14+ messages in thread
From: Vishal Patil @ 2006-07-18 15:22 UTC (permalink / raw)
  To: Gary Funck; +Cc: Linux Kernel Mailing List

B-trees are good for parellel updates as well. Anyway it would be
great to have inputs from other folks about how B-trees could help
inside the kernel (if at all)

- Vishal

On 7/18/06, Gary Funck <gary@intrepid.com> wrote:
>
> Vishal Patil wrote:
> > I said B-Tree and not binary tree, please read the explaination about
> > B-tree at http://en.wikipedia.org/wiki/B-tree. Also I am aware of AVL
> > trees.
> >
> > I never claimed that my implementation is better or anything like
> > that. I posted the code so that someone in need of the data structure
> > might use it. Also I would be willing them to help with their project.
>
> My reason for pointing out the other data strucutres is to note that there
> might be search tree representations that are more appropriate for
> implementation inside the kernel, and to perhaps encourage you to have
> a look at implementing them as well.  Red-black trees in particular have
> the property that they're reasonably well-balanced, and that the balancing
> algorithm makes use of local information.  That means that the kernel might
> be able to limit the level of locking required to update the tree.
>
> I liked your B-tree implementation, and have saved a copy.  Too bad there
> isn't the C/C++ equivalent of CPAN (comp.unix.sources is so passe`).  Your
> B-tree implementation would make a nice addition to an archive of
> handy C algorithm implementations.
>


-- 
Motivation will almost always beat mere talent.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-18 15:22       ` Vishal Patil
@ 2006-07-19  7:33         ` Anton Altaparmakov
  2006-07-19 13:34           ` Vishal Patil
  0 siblings, 1 reply; 14+ messages in thread
From: Anton Altaparmakov @ 2006-07-19  7:33 UTC (permalink / raw)
  To: Vishal Patil; +Cc: Gary Funck, Linux Kernel Mailing List

On Tue, 2006-07-18 at 11:22 -0400, Vishal Patil wrote:
> B-trees are good for parellel updates as well. Anyway it would be
> great to have inputs from other folks about how B-trees could help
> inside the kernel (if at all)

B-trees are mostly used in file systems in the kernel.  For example NTFS
and HFS (and I think HPFS) use B-trees for various metadata like
directory indexes for example.

But of course your implementation is purely userspace and cannot be used
in the kernel (you use recursion for example...) so I am not sure how
you envisage to help the kernel with your code...

Best regards,

        Anton
-- 
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-19  7:33         ` Anton Altaparmakov
@ 2006-07-19 13:34           ` Vishal Patil
  2006-07-19 16:14             ` Andrea Arcangeli
  0 siblings, 1 reply; 14+ messages in thread
From: Vishal Patil @ 2006-07-19 13:34 UTC (permalink / raw)
  To: Anton Altaparmakov; +Cc: Gary Funck, Linux Kernel Mailing List

I can get rid of recursions using loops, will need to work a little more on it.

Also I will be working on developing a patch for VM management using
B-trees instead of RB-trees.

- Vishal

On 7/19/06, Anton Altaparmakov <aia21@cam.ac.uk> wrote:
> On Tue, 2006-07-18 at 11:22 -0400, Vishal Patil wrote:
> > B-trees are good for parellel updates as well. Anyway it would be
> > great to have inputs from other folks about how B-trees could help
> > inside the kernel (if at all)
>
> B-trees are mostly used in file systems in the kernel.  For example NTFS
> and HFS (and I think HPFS) use B-trees for various metadata like
> directory indexes for example.
>
> But of course your implementation is purely userspace and cannot be used
> in the kernel (you use recursion for example...) so I am not sure how
> you envisage to help the kernel with your code...
>
> Best regards,
>
>         Anton
> --
> Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
> Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
> Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
> WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/
>
>


-- 
Motivation will almost always beat mere talent.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-19 13:34           ` Vishal Patil
@ 2006-07-19 16:14             ` Andrea Arcangeli
  2006-07-19 16:26               ` Vishal Patil
  0 siblings, 1 reply; 14+ messages in thread
From: Andrea Arcangeli @ 2006-07-19 16:14 UTC (permalink / raw)
  To: Vishal Patil; +Cc: Anton Altaparmakov, Gary Funck, Linux Kernel Mailing List

On Wed, Jul 19, 2006 at 09:34:43AM -0400, Vishal Patil wrote:
> I can get rid of recursions using loops, will need to work a little more on 
> it.

Before doing the above you may want to learn about all possible malloc
retvals too and to make sure the interface has all needed oom failure
paths that you're obviously missing.

One of the advantages of rbtree vs b-trees (and vs radixtrees too) is
the fact they require zero dynamic metadata allocations of ram. They
use the same trick of list.h to avoid it while still being mostly
generic and sharable library code. Imagine rbtrees like scalable
lists. The kernel usage is quite optimized too, the mmap path for
example does a single lookup and it stores the last "lookup" point
before restarting with an insertion while keeping the mmap_sem (or
mutex renaming of the day) on hold so to avoid the insertion operation
to start over with a second (wasteful) lookup (again very similar to
what you could do if you had list, and the rebalancing is a very
immediate operation too involving only a limited number of pointers).

> Also I will be working on developing a patch for VM management using
> B-trees instead of RB-trees.

Once you start changing those bits, you'll notice the further
requirement of the btrees due to the oom failures in code paths that
are already reasonably complex with vma oom failures.

As speed of cache raises faster than speed of ram, memory seeks tends
to cost more than they did in the past, but I doubt it worth it, most
important especially in the common case of very few vmas. I like the
common case of only a few dozen vmas to be so fast and low
overhead. The corner cases like uml and oracle already use nonlinear,
to also avoid the ram overhead of the vmas, with btree the lowmem
overhead would be even higher (the only 4/8 bytes of overhead of the
rbtrees would even be fixable with David's patch, but nobody
considered it very important so far to eliminate those 4/8 bytes
32bit/64bit per vma, though we can do that in the future). So even if
btree would be faster for those extreme corner cases, it would still
not be a replacement for the nonlinear (I wish there was a decent
replacement for nonlinear, whose only reason to exist seems to be uml
on 64bit archs).

If I would be in you, as a slightly more likely to succeed experiment,
I would be looking into replacing the pagecache radix-tree with a
btree, as long as you can leave intact the tagging properties we have
in the radix-tree needed for finding only dirty elements in the tree
etc... (we use that to avoid separate dirty lists for the pages). You
should also size the order to automatically match the cache size of
the arch (dunno if it's better at compile or run time). I'm no a
radix-tree guru but the btree may save some ram if you've all
pagecache pages scattered all over the place with random access. It
also won't require all levels to be allocated. However it will require
rebalancing, something the radix tree doesn't require, it seems a bit
of a tradeoff, and I suspect the radix-tree will still win in all
common cases. But at least all oom failure paths should already exists
for you, so that should avoid you having to touch much code externally
to your own btree files.

I wish you to have fun with the btrees, I remember I had fun back then
when I was playing with the rbtrees ;).

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-19 16:14             ` Andrea Arcangeli
@ 2006-07-19 16:26               ` Vishal Patil
  2006-08-07  2:18                 ` Vishal Patil
  0 siblings, 1 reply; 14+ messages in thread
From: Vishal Patil @ 2006-07-19 16:26 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Anton Altaparmakov, Gary Funck, Linux Kernel Mailing List

Andrea

Thank you for your time and a very valuable input. I was thinking of
implementing the VM management using B-trees because I wanted to play
with something interesting in the kernel. However I will definately
look into your idea of page cache as well.

Will keep everyone informed about my progress.

- Vishal


On 7/19/06, Andrea Arcangeli <andrea@suse.de> wrote:
> On Wed, Jul 19, 2006 at 09:34:43AM -0400, Vishal Patil wrote:
> > I can get rid of recursions using loops, will need to work a little more on
> > it.
>
> Before doing the above you may want to learn about all possible malloc
> retvals too and to make sure the interface has all needed oom failure
> paths that you're obviously missing.
>
> One of the advantages of rbtree vs b-trees (and vs radixtrees too) is
> the fact they require zero dynamic metadata allocations of ram. They
> use the same trick of list.h to avoid it while still being mostly
> generic and sharable library code. Imagine rbtrees like scalable
> lists. The kernel usage is quite optimized too, the mmap path for
> example does a single lookup and it stores the last "lookup" point
> before restarting with an insertion while keeping the mmap_sem (or
> mutex renaming of the day) on hold so to avoid the insertion operation
> to start over with a second (wasteful) lookup (again very similar to
> what you could do if you had list, and the rebalancing is a very
> immediate operation too involving only a limited number of pointers).
>
> > Also I will be working on developing a patch for VM management using
> > B-trees instead of RB-trees.
>
> Once you start changing those bits, you'll notice the further
> requirement of the btrees due to the oom failures in code paths that
> are already reasonably complex with vma oom failures.
>
> As speed of cache raises faster than speed of ram, memory seeks tends
> to cost more than they did in the past, but I doubt it worth it, most
> important especially in the common case of very few vmas. I like the
> common case of only a few dozen vmas to be so fast and low
> overhead. The corner cases like uml and oracle already use nonlinear,
> to also avoid the ram overhead of the vmas, with btree the lowmem
> overhead would be even higher (the only 4/8 bytes of overhead of the
> rbtrees would even be fixable with David's patch, but nobody
> considered it very important so far to eliminate those 4/8 bytes
> 32bit/64bit per vma, though we can do that in the future). So even if
> btree would be faster for those extreme corner cases, it would still
> not be a replacement for the nonlinear (I wish there was a decent
> replacement for nonlinear, whose only reason to exist seems to be uml
> on 64bit archs).
>
> If I would be in you, as a slightly more likely to succeed experiment,
> I would be looking into replacing the pagecache radix-tree with a
> btree, as long as you can leave intact the tagging properties we have
> in the radix-tree needed for finding only dirty elements in the tree
> etc... (we use that to avoid separate dirty lists for the pages). You
> should also size the order to automatically match the cache size of
> the arch (dunno if it's better at compile or run time). I'm no a
> radix-tree guru but the btree may save some ram if you've all
> pagecache pages scattered all over the place with random access. It
> also won't require all levels to be allocated. However it will require
> rebalancing, something the radix tree doesn't require, it seems a bit
> of a tradeoff, and I suspect the radix-tree will still win in all
> common cases. But at least all oom failure paths should already exists
> for you, so that should avoid you having to touch much code externally
> to your own btree files.
>
> I wish you to have fun with the btrees, I remember I had fun back then
> when I was playing with the rbtrees ;).
>


-- 
Motivation will almost always beat mere talent.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: Generic B-tree implementation
  2006-07-19 16:26               ` Vishal Patil
@ 2006-08-07  2:18                 ` Vishal Patil
  0 siblings, 0 replies; 14+ messages in thread
From: Vishal Patil @ 2006-08-07  2:18 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Anton Altaparmakov, Gary Funck, Linux Kernel Mailing List

[-- Attachment #1: Type: text/plain, Size: 4589 bytes --]

Folks

Following Andrea's suggestions I have implemented the Linux kernel
page cache using B-Trees. I am attaching the patch along with this
email. Note that the patch was developed against 2.6.16.2. Also the
patch offers the B-tree data structure as a library (like the radix
tree)

Also I haven't made any performance measurements to compare it with
the radix tree implementation. However ideas to do this are most
welcome.

Thanks

- Vishal

On 7/19/06, Vishal Patil <vishpat@gmail.com> wrote:
> Andrea
>
> Thank you for your time and a very valuable input. I was thinking of
> implementing the VM management using B-trees because I wanted to play
> with something interesting in the kernel. However I will definately
> look into your idea of page cache as well.
>
> Will keep everyone informed about my progress.
>
> - Vishal
>
>
> On 7/19/06, Andrea Arcangeli <andrea@suse.de> wrote:
> > On Wed, Jul 19, 2006 at 09:34:43AM -0400, Vishal Patil wrote:
> > > I can get rid of recursions using loops, will need to work a little more on
> > > it.
> >
> > Before doing the above you may want to learn about all possible malloc
> > retvals too and to make sure the interface has all needed oom failure
> > paths that you're obviously missing.
> >
> > One of the advantages of rbtree vs b-trees (and vs radixtrees too) is
> > the fact they require zero dynamic metadata allocations of ram. They
> > use the same trick of list.h to avoid it while still being mostly
> > generic and sharable library code. Imagine rbtrees like scalable
> > lists. The kernel usage is quite optimized too, the mmap path for
> > example does a single lookup and it stores the last "lookup" point
> > before restarting with an insertion while keeping the mmap_sem (or
> > mutex renaming of the day) on hold so to avoid the insertion operation
> > to start over with a second (wasteful) lookup (again very similar to
> > what you could do if you had list, and the rebalancing is a very
> > immediate operation too involving only a limited number of pointers).
> >
> > > Also I will be working on developing a patch for VM management using
> > > B-trees instead of RB-trees.
> >
> > Once you start changing those bits, you'll notice the further
> > requirement of the btrees due to the oom failures in code paths that
> > are already reasonably complex with vma oom failures.
> >
> > As speed of cache raises faster than speed of ram, memory seeks tends
> > to cost more than they did in the past, but I doubt it worth it, most
> > important especially in the common case of very few vmas. I like the
> > common case of only a few dozen vmas to be so fast and low
> > overhead. The corner cases like uml and oracle already use nonlinear,
> > to also avoid the ram overhead of the vmas, with btree the lowmem
> > overhead would be even higher (the only 4/8 bytes of overhead of the
> > rbtrees would even be fixable with David's patch, but nobody
> > considered it very important so far to eliminate those 4/8 bytes
> > 32bit/64bit per vma, though we can do that in the future). So even if
> > btree would be faster for those extreme corner cases, it would still
> > not be a replacement for the nonlinear (I wish there was a decent
> > replacement for nonlinear, whose only reason to exist seems to be uml
> > on 64bit archs).
> >
> > If I would be in you, as a slightly more likely to succeed experiment,
> > I would be looking into replacing the pagecache radix-tree with a
> > btree, as long as you can leave intact the tagging properties we have
> > in the radix-tree needed for finding only dirty elements in the tree
> > etc... (we use that to avoid separate dirty lists for the pages). You
> > should also size the order to automatically match the cache size of
> > the arch (dunno if it's better at compile or run time). I'm no a
> > radix-tree guru but the btree may save some ram if you've all
> > pagecache pages scattered all over the place with random access. It
> > also won't require all levels to be allocated. However it will require
> > rebalancing, something the radix tree doesn't require, it seems a bit
> > of a tradeoff, and I suspect the radix-tree will still win in all
> > common cases. But at least all oom failure paths should already exists
> > for you, so that should avoid you having to touch much code externally
> > to your own btree files.
> >
> > I wish you to have fun with the btrees, I remember I had fun back then
> > when I was playing with the rbtrees ;).
> >
>
>
> --
> Motivation will almost always beat mere talent.
>


-- 
Motivation will almost always beat mere talent.

[-- Attachment #2: btree.rc1.patch --]
[-- Type: text/x-patch, Size: 54694 bytes --]

diff -urN linux-2.6.16.2/mm/filemap.c linux-2.6.16.2-btree/mm/filemap.c
--- linux-2.6.16.2/mm/filemap.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/filemap.c	2006-08-06 21:37:55.000000000 -0400
@@ -112,10 +112,16 @@
  */
 void __remove_from_page_cache(struct page *page)
 {
+	struct page * bpage;
 	struct address_space *mapping = page->mapping;
+	unsigned long index = page->index;	
 
-	radix_tree_delete(&mapping->page_tree, page->index);
-	page->mapping = NULL;
+	bpage = btree_delete(&mapping->btree,mapping->btree.root,index).val;
+	if ( bpage != page) {
+		panic("DEBUG: Deleting wrong page in remove from page" 
+			"cache %d,0x%x\n",index,bpage);
+	} 
+     	page->mapping = NULL;
 	mapping->nrpages--;
 	pagecache_acct(-1);
 }
@@ -395,12 +401,14 @@
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
-
-	if (error == 0) {
+        int btree_error = btree_preload(gfp_mask & ~__GFP_HIGHMEM);
+        
+	if (btree_error == 0) {
 		write_lock_irq(&mapping->tree_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (!error) {
+                btree_error = 
+			btree_insert(&mapping->btree,offset,page);
+                
+		if (!btree_error) {
 			page_cache_get(page);
 			SetPageLocked(page);
 			page->mapping = mapping;
@@ -409,9 +417,11 @@
 			pagecache_acct(1);
 		}
 		write_unlock_irq(&mapping->tree_lock);
-		radix_tree_preload_end();
+                if(btree_error == 0) {
+                        btree_preload_end();
+                }                        
 	}
-	return error;
+	return btree_error;
 }
 
 EXPORT_SYMBOL(add_to_page_cache);
@@ -522,7 +532,7 @@
 	struct page *page;
 
 	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = btree_lookup(&mapping->btree,offset);
 	if (page)
 		page_cache_get(page);
 	read_unlock_irq(&mapping->tree_lock);
@@ -539,7 +549,7 @@
 	struct page *page;
 
 	read_lock_irq(&mapping->tree_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = btree_lookup(&mapping->btree,offset);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
 	read_unlock_irq(&mapping->tree_lock);
@@ -563,10 +573,9 @@
 				unsigned long offset)
 {
 	struct page *page;
-
 	read_lock_irq(&mapping->tree_lock);
 repeat:
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = btree_lookup(&mapping->btree,offset);
 	if (page) {
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
@@ -658,8 +667,10 @@
 	unsigned int ret;
 
 	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup(&mapping->page_tree,
-				(void **)pages, start, nr_pages);
+
+	ret = btree_gang_lookup(&mapping->btree,(void **)pages,
+				start,nr_pages);
+
 	for (i = 0; i < ret; i++)
 		page_cache_get(pages[i]);
 	read_unlock_irq(&mapping->tree_lock);
@@ -677,8 +688,9 @@
 	unsigned int ret;
 
 	read_lock_irq(&mapping->tree_lock);
-	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
+	ret = btree_gang_lookup_tag(&mapping->btree,
 				(void **)pages, *index, nr_pages, tag);
+		
 	for (i = 0; i < ret; i++)
 		page_cache_get(pages[i]);
 	if (ret)
diff -urN linux-2.6.16.2/mm/page-writeback.c linux-2.6.16.2-btree/mm/page-writeback.c
--- linux-2.6.16.2/mm/page-writeback.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/page-writeback.c	2006-08-06 22:08:50.000000000 -0400
@@ -634,8 +634,15 @@
 				BUG_ON(mapping2 != mapping);
 				if (mapping_cap_account_dirty(mapping))
 					inc_page_state(nr_dirty);
-				radix_tree_tag_set(&mapping->page_tree,
-					page_index(page), PAGECACHE_TAG_DIRTY);
+				btree_tag_set(&mapping->btree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
+
+				if (btree_tag_set(&mapping->btree,
+				page_index(page), PAGECACHE_TAG_DIRTY) !=0 ) {
+					panic("Problem setting the tag\n");
+				}					
+
+				
 			}
 			write_unlock_irq(&mapping->tree_lock);
 			if (mapping->host) {
@@ -714,9 +721,16 @@
 	if (mapping) {
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		if (TestClearPageDirty(page)) {
-			radix_tree_tag_clear(&mapping->page_tree,
+			btree_tag_clear(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+
+			if (btree_tag_get(&mapping->btree,
+					page_index(page),
+					PAGECACHE_TAG_DIRTY)  == 1) {
+				panic("Problem clearing the tag\n");
+			}				
+
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
 			if (mapping_cap_account_dirty(mapping))
 				dec_page_state(nr_dirty);
@@ -769,10 +783,17 @@
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
-		if (ret)
-			radix_tree_tag_clear(&mapping->page_tree,
+		if (ret) {
+			btree_tag_clear(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (btree_tag_get(&mapping->btree,
+				page_index(page),
+				PAGECACHE_TAG_WRITEBACK) == 1) {
+				panic("Problem clearing the tag\n");
+			}				
+
+		}			
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
@@ -790,14 +811,28 @@
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
-		if (!ret)
-			radix_tree_tag_set(&mapping->page_tree,
+		if (!ret) {
+			btree_tag_set(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-		if (!PageDirty(page))
-			radix_tree_tag_clear(&mapping->page_tree,
+			if (btree_tag_get(&mapping->btree,
+				page_index(page),
+				PAGECACHE_TAG_WRITEBACK) != 1) {
+				panic("Problem setting the tag\n");
+			}				
+
+		}			
+		if (!PageDirty(page)) {
+			btree_tag_clear(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+			if (btree_tag_get(&mapping->btree,
+				page_index(page),
+				PAGECACHE_TAG_DIRTY) == 1) {
+				panic("Problem clearing the tag");
+			}				
+
+		}			
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
@@ -817,7 +852,7 @@
 	int ret;
 
 	read_lock_irqsave(&mapping->tree_lock, flags);
-	ret = radix_tree_tagged(&mapping->page_tree, tag);
+	ret = btree_tagged(&mapping->btree,tag);
 	read_unlock_irqrestore(&mapping->tree_lock, flags);
 	return ret;
 }
diff -urN linux-2.6.16.2/mm/readahead.c linux-2.6.16.2-btree/mm/readahead.c
--- linux-2.6.16.2/mm/readahead.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/readahead.c	2006-08-06 22:04:38.000000000 -0400
@@ -282,7 +282,8 @@
 		if (page_offset > end_index)
 			break;
 
-		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		page = btree_lookup(&mapping->btree,page_offset);
+	
 		if (page)
 			continue;
 
diff -urN linux-2.6.16.2/mm/swap_state.c linux-2.6.16.2-btree/mm/swap_state.c
--- linux-2.6.16.2/mm/swap_state.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/swap_state.c	2006-08-06 21:38:25.000000000 -0400
@@ -37,6 +37,7 @@
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
+        .btree          = BTREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 
 	.tree_lock	= RW_LOCK_UNLOCKED,
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
@@ -71,16 +72,21 @@
 static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 			       gfp_t gfp_mask)
 {
-	int error;
+	int btree_error;
 
 	BUG_ON(PageSwapCache(page));
 	BUG_ON(PagePrivate(page));
-	error = radix_tree_preload(gfp_mask);
-	if (!error) {
+
+        btree_error = btree_preload(gfp_mask);
+	
+	if (!btree_error) {
 		write_lock_irq(&swapper_space.tree_lock);
-		error = radix_tree_insert(&swapper_space.page_tree,
+                if (!btree_error) {
+                       btree_error = btree_insert(&swapper_space.btree,
 						entry.val, page);
-		if (!error) {
+                }
+                                               
+		if (!btree_error) {
 			page_cache_get(page);
 			SetPageLocked(page);
 			SetPageSwapCache(page);
@@ -89,9 +95,12 @@
 			pagecache_acct(1);
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
-		radix_tree_preload_end();
+
+                if (!btree_error) {
+                        btree_preload_end();
+                }
 	}
-	return error;
+	return btree_error;
 }
 
 static int add_to_swap_cache(struct page *page, swp_entry_t entry)
@@ -127,7 +136,10 @@
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	radix_tree_delete(&swapper_space.page_tree, page_private(page));
+        if (btree_delete(&swapper_space.btree,swapper_space.btree.root,
+                                page_private(page)).val != page) {
+		panic("DEBUG: Deleting wrong page from swap cache\n");			
+	}
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	total_swapcache_pages--;
diff -urN linux-2.6.16.2/mm/vmscan.c linux-2.6.16.2-btree/mm/vmscan.c
--- linux-2.6.16.2/mm/vmscan.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/mm/vmscan.c	2006-08-06 15:08:34.000000000 -0400
@@ -692,7 +692,7 @@
 				struct page *page, int nr_refs)
 {
 	struct address_space *mapping = page_mapping(page);
-	struct page **radix_pointer;
+	struct page * btree_pointer;
 
 	/*
 	 * Avoid doing any of the following work if the page count
@@ -733,12 +733,12 @@
 
 	write_lock_irq(&mapping->tree_lock);
 
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
+	btree_pointer = (struct page *)btree_lookup_slot(
+						&mapping->btree,
 						page_index(page));
 
 	if (!page_mapping(page) || page_count(page) != nr_refs ||
-			*radix_pointer != page) {
+			btree_pointer != page) {
 		write_unlock_irq(&mapping->tree_lock);
 		return -EAGAIN;
 	}
@@ -759,7 +759,7 @@
 		set_page_private(newpage, page_private(page));
 	}
 
-	*radix_pointer = newpage;
+	btree_pointer = newpage;
 	__put_page(page);
 	write_unlock_irq(&mapping->tree_lock);
 
diff -urN linux-2.6.16.2/lib/btree.c linux-2.6.16.2-btree/lib/btree.c
--- linux-2.6.16.2/lib/btree.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.16.2-btree/lib/btree.c	2006-08-06 21:57:32.000000000 -0400
@@ -0,0 +1,1273 @@
+/*
+ *  btree.c
+ *
+ *  Copyright (C) 2006 Vishal Patil (vishpat AT gmail DOT com) 
+ *
+*/
+
+#include <linux/btree.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/radix-tree.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>  
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+#include <linux/mm.h>
+
+typedef enum {left = -1,right = 1} position_t;
+
+typedef struct {
+	btree_node * node;
+	unsigned int index;
+}node_pos;
+
+struct btree_node {
+	struct btree_node * next;		        // Pointer used for linked list 
+	bool leaf;			        // Used to indicate whether leaf or not
+        unsigned int nr_active;		        // Number of active keys
+	unsigned int level;		        // Level in the B-Tree
+        bt_key_val key_vals[2*BTREE_ORDER - 1];	// Array of keys and values
+        struct btree_node * children[2*BTREE_ORDER];	// Array of pointers to child nodes
+        unsigned long tags[BTREE_TAGS][BTREE_TAG_LONGS];
+                                                // Bitmap required by page
+                                                // cache
+	unsigned long padding[6];
+} ;
+
+/*
+*       B-Tree node and key val cache
+*/
+static kmem_cache_t *btree_node_cachep;
+
+/*
+ * Per-cpu pool of preloaded nodes
+ */
+struct btree_preload {
+        int nr;
+        struct btree_node *nodes[BTREE_MAX_PATH];
+};
+DEFINE_PER_CPU(struct btree_preload, btree_preloads) = { 0, };
+
+static btree_node * allocate_btree_node (btree * btree,unsigned int order);
+static node_pos get_btree_node(btree * btree,unsigned long key);
+static bt_key_val remove_key_from_leaf(btree * btree, node_pos * node_pos);
+static btree_node * merge_nodes(btree * btree, btree_node * n1, bt_key_val kv ,
+        btree_node * n2);
+static void move_key(btree * btree, btree_node * node, unsigned int index, 
+                                        position_t pos);
+static node_pos get_max_key_pos(btree * btree, btree_node * subtree);
+static node_pos get_min_key_pos(btree * btree, btree_node * subtree);
+static btree_node * merge_siblings(btree * btree, btree_node * parent,
+                      unsigned int index,position_t pos);
+static unsigned int __lookup(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items, int tag);
+static void print_single_node(btree *btree, btree_node * node); 
+static void transfer_tags(btree_node * source_node, int source_index,
+                btree_node * dest_node,int dest_index);
+static void clear_tags(btree_node * node, int index);
+
+static void
+btree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) {
+        memset(node, 0, sizeof(struct btree_node));
+}
+
+void __init btree_init(void) {
+        btree_node_cachep = kmem_cache_create("btree_node",
+                sizeof(btree_node), 0,
+                SLAB_PANIC, btree_node_ctor, NULL);
+}
+
+unsigned long btree_key_fn (void * page) {
+        return (((struct page*)page)->index);
+}
+
+unsigned long btree_value_fn (unsigned long key) {
+        return *((unsigned long*)key);
+}
+
+void btree_print_fn (unsigned long key) {
+        printk("%lu ", *((unsigned long*)key));
+}
+
+static inline void tag_set(struct btree_node *node, int tag, int offset)
+{
+        __set_bit(offset, node->tags[tag]);
+}
+
+static inline void tag_clear(struct btree_node *node, int tag, int offset)
+{
+        __clear_bit(offset, node->tags[tag]);
+}
+
+static inline int tag_get(struct btree_node *node, int tag, int offset)
+{
+        return test_bit(offset, node->tags[tag]);
+}
+
+/*
+ * Load up this CPU's btree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail.  On
+ * success, return zero, with preemption disabled.  On error, return -ENOMEM
+ * with preemption not disabled.
+ */
+int btree_preload(gfp_t gfp_mask) {
+        struct btree_preload *btp;
+        struct btree_node *node;
+        int ret = -ENOMEM;
+
+        preempt_disable();
+        btp = &__get_cpu_var(btree_preloads);
+        while (btp->nr < ARRAY_SIZE(btp->nodes)) {
+                preempt_enable();
+                node = kmem_cache_alloc(btree_node_cachep, gfp_mask);
+                if (node == NULL)
+                        goto out;
+                preempt_disable();
+                btp = &__get_cpu_var(btree_preloads);
+                if (btp->nr < ARRAY_SIZE(btp->nodes))
+                        btp->nodes[btp->nr++] = node;
+                else
+                        kmem_cache_free(btree_node_cachep, node);
+        }
+        ret = 0;
+out:
+        return ret;
+}
+
+/**
+*       Function used to allocate memory for the btree node
+*       @param btree The btree
+*       @param order Order of the B-Tree
+*       @return The allocated B-tree node
+*/
+static btree_node * allocate_btree_node (btree * btree,unsigned int order) {
+        btree_node * node;
+
+        // Allocate memory for the node
+        node = kmem_cache_alloc(btree_node_cachep,btree->gfp_mask);
+
+        // If this is NULL get preloaded node
+        if (node  == NULL && !(btree->gfp_mask & __GFP_WAIT)) {
+                struct btree_preload *btp;
+
+                btp = &__get_cpu_var(btree_preloads);
+                if (btp->nr) {
+                        node = btp->nodes[btp->nr - 1];
+                        btp->nodes[btp->nr - 1] = NULL;
+                        btp->nr--;
+                }
+        }
+       
+        // Initialize the number of active nodes
+        node->nr_active = 0;
+
+	// Use to determine whether it is a leaf
+	node->leaf = true;
+
+	// Use to determine the level in the tree
+	node->level = 0;
+
+	//Initialize the linked list pointer to NULL
+	node->next = NULL;
+
+        return node;
+}
+
+/**
+*       Function used to free the memory allocated to the b-tree 
+*       @param node The node to be freed
+*       @param order Order of the B-Tree
+*       @return The allocated B-tree node
+*/
+static int free_btree_node (btree_node * node) {
+        kmem_cache_free(btree_node_cachep,node);
+        return 0;
+}
+
+/**
+*	Used to split the child node and adjust the parent so that
+*	it has two children
+*	@param parent Parent Node
+*	@param index Index of the child node
+*	@param child  Full child node
+*	
+*/
+static void btree_split_child(btree * btree, btree_node * parent, 
+				unsigned int index,
+				btree_node * child) {
+	int i = 0;	
+	unsigned int order = btree->order;
+
+	btree_node * new_child = allocate_btree_node(btree,btree->order); 
+	new_child->leaf = child->leaf;
+	new_child->level = child->level;
+	new_child->nr_active = btree->order - 1;
+
+	// Copy the higher order keys to the new child	
+	for(i=0;i<order - 1;i++) {
+		new_child->key_vals[i] = child->key_vals[i + order];
+                transfer_tags(child,i + order,new_child,i); 
+                
+                if(!child->leaf) {
+			new_child->children[i] = 
+				child->children[i + order];
+		} 
+	}
+	
+	// Copy the last child pointer
+	if(!child->leaf) {
+		new_child->children[i] = 
+		child->children[i + order];
+	}
+
+	child->nr_active = order - 1;
+	
+	for(i = parent->nr_active + 1;i > index + 1;i--) {
+		parent->children[i] = parent->children[i - 1];
+	}
+	parent->children[index + 1] = new_child;
+
+	for(i = parent->nr_active;i > index;i--) {
+		parent->key_vals[i] = parent->key_vals[i - 1];
+                transfer_tags(parent,i - 1,parent,i);                
+	}
+
+	parent->key_vals[index] = child->key_vals[order - 1];	
+        transfer_tags(child,order - 1,parent,index);
+	parent->nr_active++;
+
+}
+
+/**
+*	Used to insert a key in the non-full node
+*	@param btree The btree
+*	@param node The node to which the key will be added
+*	@param the key value pair
+*	@return void
+*/
+
+static int btree_insert_nonfull (btree * btree, btree_node * parent_node,
+				bt_key_val * key_val) {
+	
+	unsigned long key = key_val->key;
+	int i,j;
+	btree_node * child;	
+	btree_node * node = parent_node;
+	
+insert:	i = node->nr_active - 1;
+	if(node->leaf) {
+		while(i >= 0 && key < node->key_vals[i].key) {
+			node->key_vals[i + 1] = node->key_vals[i];
+                        transfer_tags(node,i,node,i + 1);
+			i--;
+		}
+
+		node->key_vals[i + 1].key = key;
+                node->key_vals[i + 1].val = key_val->val;
+
+		// Identical key found
+          	if (i >= 0 && 
+			key == node->key_vals[i].key) {
+                        for (j = i + 1; j <node->nr_active; j++ ) {
+                                node->key_vals[j] =
+                                        node->key_vals[j + 1];
+				transfer_tags(node,j,node,j + 1);
+                        }
+			return -EEXIST;
+                } else
+		node->nr_active++; 
+	} else {
+		while (i >= 0 && key < node->key_vals[i].key) {
+			i--;
+		}
+
+                // Identical key found  
+                if (i >= 0 && key == node->key_vals[i].key) {
+                        return -EEXIST;
+                }
+
+		i++;
+		child = node->children[i]; 
+		
+		if(child->nr_active == 2*btree->order - 1) {
+
+ 			// Check for identical key here
+                        for (j=0;j<child->nr_active;j++) {
+                                if (key == child->key_vals[j].key) {
+                                        return -EEXIST;
+                                }
+
+                                if (key < child->key_vals[j].key) {
+                                        break;
+                                }
+                        }
+
+			btree_split_child(btree,node,i,child);
+			if(key > 
+				node->key_vals[i].key) {
+				i++;	
+			}	
+		}
+		node = node->children[i];
+		goto insert;	
+	}
+       return 0;	
+}
+
+
+/**
+*       Function used to insert node into a B-Tree
+*       @param root Root of the B-Tree
+*       @param node The node to be inserted
+*       @param compare Function used to compare the two nodes of the tree
+*       @return success or failure
+*/
+int btree_insert(btree * btree, unsigned long key, void * val) {
+	btree_node * rnode = NULL;
+        bt_key_val key_val;
+        int ret = -1;	
+       	
+        key_val.key = key;
+        key_val.val = val;
+
+        // During the B-tree initialization
+        if(unlikely(btree->root == NULL)) {
+                btree->root = allocate_btree_node(btree,btree->order);
+        }
+        rnode = btree->root;
+
+	if(rnode->nr_active == (2*btree->order - 1)) {
+		btree_node * new_root;
+		new_root = allocate_btree_node(btree,btree->order);
+		new_root->level = btree->root->level + 1;
+		btree->root = new_root;	
+		new_root->leaf = false;
+		new_root->nr_active = 0;
+		new_root->children[0]  = rnode;
+		btree_split_child(btree,new_root,0,rnode);
+		ret = btree_insert_nonfull(btree,new_root,&key_val);	
+	} else
+		ret = btree_insert_nonfull(btree,rnode,&key_val);		
+        return ret;
+}
+EXPORT_SYMBOL(btree_insert);        
+
+/**
+*	Used to get the position of the MAX key within the subtree
+*	@param btree The btree
+*	@param subtree The subtree to be searched
+*	@return The node containing the key and position of the key
+*/
+static node_pos get_max_key_pos(btree * btree, btree_node * subtree) {
+	node_pos node_pos;
+	btree_node * node = subtree; 
+	
+	node_pos.node = NULL;
+	node_pos.index = 0;
+
+	while(true) {
+		if(node == NULL) {
+			break;
+		}
+
+		if(node->leaf) {
+			node_pos.node 	= node;
+			node_pos.index 	= node->nr_active - 1;
+			return node_pos;	
+		} else {
+			node_pos.node 	= node;
+			node_pos.index 	= node->nr_active - 1;
+			node = node->children[node->nr_active];	
+		}
+	}
+	return node_pos;
+}
+
+/**
+*	Used to get the position of the MAX key within the subtree
+*	@param btree The btree
+*	@param subtree The subtree to be searched
+*	@return The node containing the key and position of the key
+*/
+static node_pos get_min_key_pos(btree * btree, btree_node * subtree) {
+	node_pos node_pos;
+	btree_node * node = subtree; 
+
+	node_pos.node = NULL;
+	node_pos.index = 0;
+
+	while(true) {
+		if(node == NULL) {
+			break;
+		}
+
+		if(node->leaf) {
+			node_pos.node 	= node;
+			node_pos.index 	= 0;
+			return node_pos;	
+		} else {
+			node_pos.node 	= node;
+			node_pos.index 	= 0;
+			node = node->children[0];	
+		}
+	}
+	return node_pos;
+}
+
+/**
+*	Merge nodes n1 and n2 (case 3b from Cormen)
+*	@param btree The btree 
+*	@param node The parent node 
+*	@param index of the child
+*	@param pos left or right
+*	@return none 
+*/
+static btree_node * merge_siblings(btree * btree, btree_node * parent, unsigned int index , 
+					position_t pos) {
+	unsigned int j;
+	btree_node * new_node;
+	btree_node * n1, * n2;
+        
+        if (index == (parent->nr_active)) {   
+               index--;
+	       n1 = parent->children[parent->nr_active - 1];
+	       n2 = parent->children[parent->nr_active];
+        } else {
+               n1 = parent->children[index];
+	       n2 = parent->children[index + 1];
+        }
+
+	//Merge the current node with the left node
+	new_node = allocate_btree_node(btree,btree->order);
+	new_node->level = n1->level;
+	new_node->leaf = n1->leaf;
+
+	for(j=0;j<btree->order - 1; j++) {
+		new_node->key_vals[j] =	n1->key_vals[j];
+                transfer_tags(n1,j,new_node,j);
+		new_node->children[j] =	n1->children[j];
+	}
+	
+	new_node->key_vals[btree->order - 1] =	parent->key_vals[index];
+        transfer_tags(parent,index,new_node,btree->order - 1);
+	new_node->children[btree->order - 1] =	n1->children[btree->order - 1];
+
+	for(j=0;j<btree->order - 1; j++) {
+		new_node->key_vals[j + btree->order] = 	n2->key_vals[j];
+                transfer_tags(n2,j,new_node,j + btree->order);
+		new_node->children[j + btree->order] = 	n2->children[j];
+	}
+	new_node->children[2*btree->order - 1] = n2->children[btree->order - 1];
+
+	parent->children[index] = new_node;
+
+	for(j = index;j<parent->nr_active;j++) {
+		parent->key_vals[j] = parent->key_vals[j + 1];
+                transfer_tags(parent,j + 1,parent,j);
+		parent->children[j + 1] = parent->children[j + 2];
+	}
+
+	new_node->nr_active = n1->nr_active + n2->nr_active + 1;
+	parent->nr_active--;
+
+	free_btree_node(n1);
+	free_btree_node(n2);
+
+	if (parent->nr_active == 0 && btree->root == parent) {
+		free_btree_node(parent);
+		btree->root = new_node;
+		if(new_node->level)
+			new_node->leaf = false;
+		else
+			new_node->leaf = true; 
+	}
+
+	return new_node;
+}
+
+/**
+*	Move the key from node to another	
+*	@param btree The B-Tree
+*	@param node The parent node
+*	@param index of the key to be moved done 
+*	@param pos the position of the child to receive the key 
+*	@return none
+*/
+static void move_key(btree * btree, btree_node * node, unsigned int index, position_t pos) {
+	btree_node * lchild;
+	btree_node * rchild;
+	unsigned int i;
+
+	if(pos == right) {
+		index--;
+	}
+	lchild = node->children[index];
+	rchild = node->children[index + 1];
+	
+	// Move the key from the parent to the left child
+	if(pos == left) {
+		lchild->key_vals[lchild->nr_active] = node->key_vals[index];
+                transfer_tags(node,index,lchild,lchild->nr_active);
+
+		lchild->children[lchild->nr_active + 1] = rchild->children[0];
+		rchild->children[0] = NULL;
+		lchild->nr_active++;
+
+		node->key_vals[index] = rchild->key_vals[0];
+                transfer_tags(rchild,0,node,index);
+                
+		for(i=0;i<rchild->nr_active - 1;i++) {
+			rchild->key_vals[i] = rchild->key_vals[i + 1];
+                        transfer_tags(rchild,i + 1,rchild,i);
+			rchild->children[i] = rchild->children[i + 1];
+		}
+		rchild->children[rchild->nr_active - 1] = 
+				rchild->children[rchild->nr_active];
+		rchild->nr_active--;
+	} else {
+		// Move the key from the parent to the right child
+		for(i=rchild->nr_active;i > 0 ; i--) {
+			rchild->key_vals[i] = rchild->key_vals[i - 1];
+                        transfer_tags(rchild,i - 1,rchild,i);
+			rchild->children[i + 1] = rchild->children[i];		
+		}
+		rchild->children[1] = rchild->children[0];		
+		rchild->children[0] = NULL;
+
+		rchild->key_vals[0] = node->key_vals[index];
+                transfer_tags(node,index,rchild,0);
+
+		rchild->children[0] = lchild->children[lchild->nr_active];	
+		lchild->children[lchild->nr_active] = NULL;
+
+		node->key_vals[index] = lchild->key_vals[lchild->nr_active - 1];
+		transfer_tags(lchild,lchild->nr_active - 1,node,index);
+                
+		lchild->nr_active--;
+		rchild->nr_active++;		
+	}				
+}
+
+/**
+*	Merge nodes n1 and n2
+*	@param n1 First node
+*	@param n2 Second node
+*	@return combined node
+*/
+static btree_node * merge_nodes(btree * btree, btree_node * n1, bt_key_val kv,
+                                                btree_node * n2) {
+	btree_node * new_node;
+	unsigned int i;	
+
+	new_node = allocate_btree_node(btree,btree->order);
+	new_node->leaf = true;
+	
+	for(i=0;i<n1->nr_active;i++) {
+		new_node->key_vals[i]   = n1->key_vals[i];
+                transfer_tags(n1,i,new_node,i);
+                new_node->children[i]   = n1->children[i];
+	}
+        new_node->children[n1->nr_active] = n1->children[n1->nr_active];
+        new_node->key_vals[n1->nr_active] = kv;
+
+	for(i=0;i<n2->nr_active;i++) {
+		new_node->key_vals[i + n1->nr_active + 1] = n2->key_vals[i];
+                transfer_tags(n2,i,new_node,i + n1->nr_active + 1);
+                new_node->children[i + n1->nr_active + 1] = n2->children[i];
+	}
+        new_node->children[2*btree->order - 1] = n2->children[n2->nr_active];
+	
+        new_node->nr_active = n1->nr_active + n2->nr_active + 1;
+        new_node->leaf = n1->leaf;
+        new_node->level = n1->level;
+
+	free_btree_node(n1);
+	free_btree_node(n2);
+
+	return new_node;
+}
+
+/**
+*	Used to remove a key from the B-tree node
+*	@param btree The btree
+*	@param node The node from which the key is to be removed 	
+*	@param key The key to be removed
+*	@return 0 on success -1 on error 
+*/
+
+bt_key_val remove_key_from_leaf(btree * btree, node_pos * node_pos) {
+	unsigned int keys_max = 2*btree->order - 1;
+	unsigned int i;
+	bt_key_val key_val;
+	btree_node * node = node_pos->node;
+
+        key_val.key = -1;
+        key_val.val = NULL;
+
+	if(node->leaf == false) {
+		return key_val;
+	}	
+	
+	key_val = node->key_vals[node_pos->index];
+
+	for(i=node_pos->index;i< keys_max - 1;i++) {
+		node->key_vals[i] = node->key_vals[i + 1];
+                transfer_tags(node,i + 1,node,i);
+	}
+	
+	node->nr_active--;
+
+	if(node->nr_active == 0 ) {
+                if (btree->root == node) {
+                        btree->root = NULL;
+                }
+		free_btree_node(node);
+	}
+	return key_val;
+}
+
+/**
+*       Function used to remove a node from a  B-Tree
+*       @param btree The B-Tree
+*       @param key Key of the node to be removed
+*       @param value function to map the key to an unique integer value        
+*       @param compare Function used to compare the two nodes of the tree
+*       @return The removed key value pair 
+*/
+
+bt_key_val btree_delete(btree * btree,btree_node * subtree,unsigned long key) {
+	unsigned int i,index;
+	btree_node * node = NULL, * rsibling, *lsibling;
+	btree_node * comb_node, * parent;
+	node_pos sub_node_pos;
+	node_pos node_pos;
+	bt_key_val key_val, removed_kv,to_remove_kv;
+	unsigned long kv = key;	
+
+	node = subtree;
+	parent = NULL;
+	removed_kv.key = -1;
+	removed_kv.val = NULL;	
+
+	// Empty subtree
+	if (node == NULL) {
+		return removed_kv;
+	}
+
+del_loop:for (i = 0;;i = 0) {
+             
+            //If there are no keys simply return
+            if(!node->nr_active)
+                return removed_kv;
+
+	    // Fix the index of the key greater than or equal
+	    // to the key that we would like to search
+		
+	    while (i < node->nr_active && kv > 
+			    node->key_vals[i].key) {
+		    i++;
+	    }
+	    index = i;
+
+	    // If we find such key break		    
+	    if(i < node->nr_active && 
+			kv == node->key_vals[i].key) {
+			break;
+	    }
+            if(node->leaf)
+                return removed_kv;
+                
+    	    //Store the parent node
+	    parent = node;
+
+	    // To get a child node 
+	    node = node->children[i];
+
+            //If NULL not found
+            if (node == NULL)
+                return removed_kv;
+
+            if (index == (parent->nr_active)) {   
+                lsibling =  parent->children[parent->nr_active - 1];
+	    	rsibling = NULL; 
+            } else if (index == 0) {
+                lsibling = NULL;
+                rsibling = parent->children[1];
+            } else {
+        	lsibling = parent->children[i - 1];
+	    	rsibling = parent->children[i + 1];
+            }
+
+	    if (node->nr_active == btree->order - 1 && parent) {
+		// The current node has (t - 1) keys but the right sibling has > (t - 1)
+		// keys
+		if (rsibling && (rsibling->nr_active > btree->order - 1)) {
+			move_key(btree,parent,i,left);
+		} else
+		// The current node has (t - 1) keys but the left sibling has (t - 1)
+		// keys
+		if (lsibling && (lsibling->nr_active > btree->order - 1)) {
+			move_key(btree,parent,i,right);
+		} else
+		// Left sibling has (t - 1) keys
+	        if(lsibling && (lsibling->nr_active == btree->order - 1)) {
+		        node = merge_siblings(btree,parent,i,left);
+		} else
+                // Right sibling has (t - 1) keys
+	        if(rsibling && (rsibling->nr_active == btree->order - 1)) {
+		        node = merge_siblings(btree,parent,i,right);
+		}
+	    }
+        }            
+	
+	//Case 1 : The node containing the key is found and is the leaf node.
+	//Also the leaf node has keys greater than the minimum required.
+	//Simply remove the key
+	if(node->leaf && (node->nr_active > btree->order - 1)) {
+		node_pos.node = node;
+		node_pos.index = index;
+		return remove_key_from_leaf(btree,&node_pos);
+	}
+
+	//If the leaf node is the root permit deletion even if the number of keys is
+	//less than (t - 1)
+	if(node->leaf && (node == btree->root)) {
+		node_pos.node = node;
+		node_pos.index = index;
+		return remove_key_from_leaf(btree,&node_pos);
+	}
+
+
+	//Case 2: The node containing the key is found and is an internal node
+	if(node->leaf == false) {
+		if(node->children[index]->nr_active > btree->order - 1 ) {
+			to_remove_kv = node->key_vals[index];
+
+			sub_node_pos = 
+				get_max_key_pos(btree,node->children[index]);
+                        key_val = 
+				sub_node_pos.node->key_vals[sub_node_pos.index];
+        		node->key_vals[index] = key_val;	
+                        transfer_tags(node,index,sub_node_pos.node,
+                                sub_node_pos.index);
+
+			sub_node_pos.node->key_vals[sub_node_pos.index] = 
+				to_remove_kv;
+                        node = node->children[index];
+                        goto del_loop;
+               
+		} else if ((node->children[index + 1]->nr_active > btree->order - 1) ) {
+			to_remove_kv = node->key_vals[index];
+
+			sub_node_pos = 
+                                get_min_key_pos(btree,node->children[index + 1]);
+                        key_val = sub_node_pos.node->key_vals[sub_node_pos.index];
+        		node->key_vals[index] = key_val;	
+                        transfer_tags(node,index,sub_node_pos.node,
+                                sub_node_pos.index);
+
+			sub_node_pos.node->key_vals[sub_node_pos.index] = 
+				to_remove_kv;
+
+                        node = node->children[index + 1];
+                        goto del_loop;
+              
+		} else if ( 
+			node->children[index]->nr_active == btree->order - 1 &&
+			node->children[index + 1]->nr_active == btree->order - 1) {
+
+			comb_node = merge_nodes(btree,node->children[index],
+                                node->key_vals[index],
+				node->children[index + 1]);
+			node->children[index] = comb_node;
+			
+			for(i=index + 1;i<node->nr_active;i++) {
+				node->children[i] = node->children[i + 1];
+				node->key_vals[i - 1] = node->key_vals[i];
+                                transfer_tags(node,i,node,i - 1);
+			}
+			node->nr_active--;
+                        if (node->nr_active == 0 && btree->root == node) {
+                                free_btree_node(node);
+                                btree->root = comb_node;        
+                        } 
+                        node = comb_node;
+                        goto del_loop;
+		}		
+          }
+	
+	// Case 3:
+	// In this case start from the top of the tree and continue
+	// moving to the leaf node making sure that each node that
+	// we encounter on the way has atleast 't' (order of the tree)
+	// keys 
+	if(node->leaf && (node->nr_active > btree->order - 1)) {
+	      node_pos.node = node;
+	      node_pos.index = index;
+	      return remove_key_from_leaf(btree,&node_pos);
+	}
+
+        return removed_kv;
+}
+
+/**
+*	Function used to get the node containing the given key
+*	@param btree The btree to be searched
+*	@param key The the key to be searched
+*	@return The node and position of the key within the node 
+*/
+node_pos  get_btree_node(btree * btree,unsigned long key) {
+	node_pos kp;
+	unsigned long key_val = key;	
+	btree_node * node;
+	unsigned int i = 0;
+
+	node = btree->root;
+	kp.node = NULL;
+	kp.index = 0;
+
+	// Empty tree
+	if (node == NULL) {
+		return kp;
+	}	
+	
+	for (;;i = 0) {	
+
+	    // Fix the index of the key greater than or equal
+	    // to the key that we would like to search
+		
+	    while (i < node->nr_active && key_val > 
+			    node->key_vals[i].key ) {
+		    i++;
+	    }
+
+	    // If we find such key return the key-value pair		    
+	    if(i < node->nr_active && 
+			key_val == node->key_vals[i].key) {
+		    kp.node = node;
+		    kp.index = i;	
+		    return kp;
+	    }
+	
+	    // If the node is leaf and if we did not find the key 
+	    // return NULL
+	    if(node->leaf) {
+		    return kp;
+	    }
+
+	    // To got a child node 
+	    node = node->children[i];
+	}
+      	return kp;
+
+}
+
+/**
+*       Used to destory btree
+*       @param btree The B-tree
+*       @return none
+*/
+void btree_destroy(btree * btree) {
+       int i = 0;
+       unsigned int current_level;
+
+       btree_node * head, * tail, * node;
+       btree_node * child, * del_node;
+
+       node = btree->root;
+       current_level = node->level;
+       head = node;
+       tail = node;
+
+       while(true) {
+               if(head == NULL) {
+                       break;
+               }
+               if (head->level < current_level) {
+                       current_level = head->level;
+               }
+
+               if(head->leaf == false) {
+                       for(i = 0 ; i < head->nr_active + 1; i++) {
+                               child = head->children[i];
+                               tail->next = child;
+                               tail = child;
+                               child->next = NULL;
+                       }
+               }
+               del_node = head;
+               head = head->next;
+               free_btree_node(del_node);
+       }
+
+}
+
+int btree_tagged(btree * btree, int tag) {
+        int i = 0;
+        btree_node * head, * tail;
+        btree_node * child;
+
+        head = btree->root;
+        tail = btree->root;
+
+        while(true) {
+                if(head == NULL) {
+                        break;
+                }
+
+                for (i = 0; i < head->nr_active; i++ ) {
+                  
+                  if (tag != -1 && tag_get(head,tag,i)) 
+                     return 1;
+                  
+                }     
+
+                if(head->leaf == false) {
+                        for(i = 0 ; i < head->nr_active + 1; i++) {
+                                child = head->children[i];
+                                tail->next = child;
+                                tail = child;
+                                child->next = NULL;
+                        }
+                }
+                head = head->next;
+        }
+        return 0;
+}
+EXPORT_SYMBOL(btree_tagged);
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct btree_node *node, int tag)
+{
+        int idx;
+        for (idx = 0; idx < BTREE_TAG_LONGS; idx++) {
+                if (node->tags[tag][idx])
+                        return 1;
+        }
+        return 0;
+}
+
+static void clear_tags(btree_node * node, int index) {
+	 int tag;
+        for (tag = 0;tag< BTREE_TAGS; tag++) {
+		tag_clear(node,tag,index);
+        }
+}
+
+static void transfer_tags(btree_node * source_node, int source_index,
+                btree_node * dest_node,int dest_index) {
+
+        int tag;
+        for (tag = 0;tag< BTREE_TAGS; tag++) {
+                if (tag_get(source_node,tag,source_index)) {
+                        tag_set(dest_node,tag,dest_index);
+                        tag_clear(source_node,tag,source_index);
+                } else {
+                        tag_clear(dest_node,tag,dest_index);
+                }
+        }
+}
+
+static unsigned int __lookup(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items, int tag) {
+
+        int i = 0, j = 0;
+        int ret = 0;
+        unsigned long key;
+        btree_node * head, * tail;
+        btree_node * child;
+
+        head = btree->root;
+        tail = btree->root;
+
+        while(true) {
+                if(head == NULL) {
+                        break;
+                }
+
+                if ((head->key_vals[head->nr_active - 1].key 
+                                >= first_index) && (ret < max_items)) {
+                   for (i = 0; i < head->nr_active; i++ ) {
+                     
+                     if (tag != -1 && !tag_get(head,tag,i)) 
+                        continue;
+                     
+                     if ((key = head->key_vals[i].key) 
+                                                >= first_index) {
+                                // Insertion sort                
+                                for (j = (ret - 1);j>=0;j--) {
+                                      if (btree->key(results[j]) > key) {
+                                                results[j + 1] =
+                                                    results[j];    
+                                      } else {
+                                                break;
+                                      }
+                                }
+                               
+                                results[++j] = head->key_vals[i].val; 
+                                ret++;
+
+                                if(ret == max_items) {
+                                        break;
+                                }
+                     }           
+                   }     
+                }
+
+                if(head->leaf == false) {
+                        for(i = 0 ; i < head->nr_active + 1; i++) {
+                                child = head->children[i];
+                                tail->next = child;
+                                tail = child;
+                                child->next = NULL;
+                        }
+                }
+                head = head->next;
+        }
+        return ret;
+}
+
+/**
+*       Perform multiple lookup on the B-Tree
+*       @param btree The btree
+*       @param results where the results will be placed
+*       @param first_index start the lookup from this tree
+*       @param max_items Place upto these many items in results
+*       @return The number of items found
+*/
+unsigned int btree_gang_lookup(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items) {
+        return __lookup(btree,results,first_index,max_items, -1);
+}
+
+
+EXPORT_SYMBOL(btree_gang_lookup);
+
+/**
+*       Perform multiple lookup on the B-Tree
+*       @param btree The btree
+*       @param results where the results will be placed
+*       @param first_index start the lookup from this tree
+*       @param max_items Place upto these many items in results
+*       @return The number of items found
+*/
+unsigned int btree_gang_lookup_tag(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items,int tag) {
+        return __lookup(btree,results,first_index,max_items, tag);
+}
+
+
+EXPORT_SYMBOL(btree_gang_lookup_tag);
+
+
+/**
+*       Function used to search a node in a B-Tree
+*       @param btree The B-tree to be searched
+*       @param key Key of the node to be search
+*       @return The key-value pair
+*/
+void * btree_lookup(btree * btree,unsigned long key) {
+
+	bt_key_val key_val;
+	node_pos kp;
+	
+	if (btree->root == NULL) {
+		return NULL;
+	}
+
+	kp = get_btree_node(btree,key);
+	key_val.val = NULL;
+
+	if(kp.node) {
+		key_val = kp.node->key_vals[kp.index];
+	} 
+	return key_val.val; 
+}
+EXPORT_SYMBOL(btree_lookup);
+
+/**
+*       Function used to get the slot in a node of a B-Tree
+*       @param btree The B-tree to be searched
+*       @param key Key of the node to be search
+*       @return The key-value pair
+*/
+bt_key_val * btree_lookup_slot(btree * btree,unsigned long key) {
+
+	bt_key_val * key_val = NULL;
+	node_pos kp;
+
+	if (btree->root == NULL) {
+		return NULL;
+	}
+
+	kp = get_btree_node(btree,key);
+
+	if(kp.node) {
+		key_val = &kp.node->key_vals[kp.index];
+	}
+	return key_val; 
+}
+EXPORT_SYMBOL(btree_lookup_slot);
+
+/**
+*       Used to set the tag for a node
+*       @param btree The btree
+*       @param key The key of the page for which the tag is to be set
+*       @param tag The tag to be set
+*       @return The btree key and val for which the tag is set  
+*/
+
+bt_key_val * btree_tag_set(btree * btree,unsigned long key, int tag) {
+	bt_key_val * key_val = NULL;
+	node_pos kp = get_btree_node(btree,key);
+
+	if(kp.node) {
+		key_val = &kp.node->key_vals[kp.index];
+                if(!tag_get(kp.node,tag,kp.index))
+                        tag_set(kp.node,tag,kp.index);
+	}
+        BUG_ON(key_val == NULL);
+	return key_val; 
+}
+EXPORT_SYMBOL(btree_tag_set);
+
+/**
+*       Used to clear the tag for a node
+*       @param btree The btree
+*       @param key The key of the page for which the tag is to be set
+*       @param tag The tag to be set
+*       @return The btree key and val for which the tag is cleared  
+*/
+
+bt_key_val * btree_tag_clear(btree * btree,unsigned long key, int tag) {
+         bt_key_val * key_val = NULL;
+	node_pos kp = get_btree_node(btree,key);
+
+	if(kp.node) {
+		key_val = &kp.node->key_vals[kp.index];
+                if(tag_get(kp.node,tag,kp.index))
+                        tag_clear(kp.node,tag,kp.index);
+	}
+        BUG_ON(key_val == NULL);
+	return key_val; 
+}
+EXPORT_SYMBOL(btree_tag_clear);
+
+/**
+*       Used to verify whether a tag is set
+*       @param btree The btree
+*       @param key The key
+*       @param tag The tag
+*       @return
+*       0: tag not present
+*       1: tag present, set
+*       -1: tag present, unset 
+*/
+int btree_tag_get(btree * btree,unsigned long key, int tag) {
+        int ret = 0;
+        bt_key_val * key_val = NULL;
+	node_pos kp = get_btree_node(btree,key);
+
+	if(kp.node) {
+		key_val = &kp.node->key_vals[kp.index];
+                ret = tag_get(kp.node,tag,kp.index);
+                return ret? 1 : -1;
+	}
+        BUG_ON(key_val == NULL);
+	return ret; 
+}
+EXPORT_SYMBOL(btree_tag_get);
+
+
+/**
+*	Get the max key in the btree
+*	@param btree The btree
+*	@return The max key 
+*/
+unsigned long btree_get_max_key(btree * btree) {
+	node_pos node_pos;
+	node_pos = get_max_key_pos(btree,btree->root);
+	return node_pos.node->key_vals[node_pos.index].key;
+}
+
+/**
+*	Get the min key in the btree
+*	@param btree The btree
+*	@return The max key 
+*/
+unsigned long btree_get_min_key(btree * btree) {
+	node_pos node_pos;
+	node_pos = get_min_key_pos(btree,btree->root);
+	return node_pos.node->key_vals[node_pos.index].key;
+}
+
+/**
+*	Used to print the keys of the btree_node
+*	@param node The node whose keys are to be printed
+*	@return none
+*/
+
+static void print_single_node(btree *btree, btree_node * node) {
+	
+	int i = 0;
+	
+	printk(" { ");	
+	while(i < node->nr_active) {
+		printk("%d(0x%x) ", node->key_vals[i].key,
+			node->key_vals[i].val);
+		i++;
+	}
+	printk("} (0x%x,%d,%d) ", node,node->nr_active,node->level);	
+}
+
+/**
+*       Function used to print the B-tree
+*       @param root Root of the B-Tree
+*       @param print_key Function used to print the key value
+*       @return none
+*/
+
+void print_subtree(btree *btree,btree_node * node) {
+	
+	int i = 0;
+	unsigned int current_level;
+
+	btree_node * head, * tail;
+
+	current_level = node->level;
+	head = node;
+	tail = node;
+	
+	printk("DEBUG: Printing subtree\n");
+	while(true) {
+		if(head == NULL) {
+			break;
+		}
+		if (head->level < current_level) {
+			current_level = head->level;
+			printk("\n");
+		}
+		print_single_node(btree,head);
+
+		if(head->leaf == false) {	
+			for(i = 0 ; i < head->nr_active + 1; i++) {
+				tail->next = head->children[i];
+				tail = head->children[i];
+				head->children[i]->next = NULL;
+			}
+		}
+		head = head->next;	
+	}
+	printk("\n");
+}
+EXPORT_SYMBOL(print_subtree);
+
diff -urN linux-2.6.16.2/lib/Makefile linux-2.6.16.2-btree/lib/Makefile
--- linux-2.6.16.2/lib/Makefile	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/lib/Makefile	2006-08-05 10:57:12.000000000 -0400
@@ -3,7 +3,7 @@
 #
 
 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
-	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
+	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o btree.o\
 	 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
 	 sha1.o
 
diff -urN linux-2.6.16.2/init/main.c linux-2.6.16.2-btree/init/main.c
--- linux-2.6.16.2/init/main.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/init/main.c	2006-08-06 15:20:06.000000000 -0400
@@ -84,6 +84,7 @@
 extern void pidmap_init(void);
 extern void prio_tree_init(void);
 extern void radix_tree_init(void);
+extern void btree_init(void);
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
@@ -529,7 +530,8 @@
 	key_init();
 	security_init();
 	vfs_caches_init(num_physpages);
-	radix_tree_init();
+//	radix_tree_init();
+        btree_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff -urN linux-2.6.16.2/fs/buffer.c linux-2.6.16.2-btree/fs/buffer.c
--- linux-2.6.16.2/fs/buffer.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/fs/buffer.c	2006-08-06 15:10:49.000000000 -0400
@@ -859,9 +859,15 @@
 		if (page->mapping) {	/* Race with truncate? */
 			if (mapping_cap_account_dirty(mapping))
 				inc_page_state(nr_dirty);
-			radix_tree_tag_set(&mapping->page_tree,
+			btree_tag_set(&mapping->btree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+
+			if (btree_tag_get(&mapping->btree,
+			  page_index(page),PAGECACHE_TAG_DIRTY) != 1) {
+				panic("Problem setting the tag\n");
+			}				
+
 		}
 		write_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff -urN linux-2.6.16.2/fs/inode.c linux-2.6.16.2-btree/fs/inode.c
--- linux-2.6.16.2/fs/inode.c	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/fs/inode.c	2006-08-06 21:36:22.000000000 -0400
@@ -16,6 +16,7 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/hash.h>
+#include <linux/btree.h>
 #include <linux/swap.h>
 #include <linux/security.h>
 #include <linux/pagemap.h>
@@ -196,6 +197,7 @@
 	mutex_init(&inode->i_mutex);
 	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+        INIT_BTREE(&inode->i_data.btree, GFP_ATOMIC);
 	rwlock_init(&inode->i_data.tree_lock);
 	spin_lock_init(&inode->i_data.i_mmap_lock);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
diff -urN linux-2.6.16.2/include/linux/btree.h linux-2.6.16.2-btree/include/linux/btree.h
--- linux-2.6.16.2/include/linux/btree.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.6.16.2-btree/include/linux/btree.h	2006-08-06 21:41:02.000000000 -0400
@@ -0,0 +1,89 @@
+/*
+ *  btree.h
+ *
+ *  Copyright (C) 2006 Vishal Patil (vishpat AT gmail DOT com) 
+ *
+*/
+
+#ifndef _BTREE_H_
+#define _BTREE_H_
+
+#include <linux/sched.h>
+#include <linux/preempt.h>
+#include <linux/types.h>
+
+#define BTREE_ORDER 8 
+#define BTREE_TAGS 2
+#define BTREE_TAG_LONGS    \
+        ((BTREE_ORDER + BITS_PER_LONG - 1) / BITS_PER_LONG)
+#define BTREE_MAX_PATH 8
+
+
+typedef enum  {false,true} bool;
+typedef struct btree_node btree_node;
+
+typedef struct {
+        unsigned long key;
+	void * val;
+} bt_key_val;
+
+
+typedef struct btree {
+	unsigned int order;			// B-Tree order
+	btree_node * root;		        // Root of the B-Tree
+       	unsigned long (*value)(unsigned long key);	// Generate uint value for the key
+        unsigned long (*key)(void * data);      // Get the value of the key from
+                                                // from the data
+	void (*print_key)(unsigned long key);		// Print the key
+        gfp_t                   gfp_mask;
+} btree; 
+
+extern int btree_insert(btree * btree, unsigned long key, void * val);
+extern bt_key_val btree_delete(btree * btree,btree_node * subtree,
+                                unsigned long key);
+extern void * btree_lookup(btree * btree,  unsigned long key);
+extern unsigned int btree_gang_lookup(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items);
+extern unsigned int btree_gang_lookup_tag(btree * btree, void ** results,
+        unsigned long first_index,unsigned int max_items,int tag);
+extern bt_key_val * btree_lookup_slot(btree * btree, unsigned long key);
+extern bt_key_val * btree_tag_set(btree * btree,unsigned long key, int tag);
+extern int btree_tag_get(btree * btree,unsigned long key, int tag);
+extern bt_key_val * btree_tag_clear(btree * btree,unsigned long key, int tag);
+extern void btree_destroy(btree * btree);
+extern unsigned long btree_get_max_key(btree * btree);
+extern unsigned long btree_get_min_key(btree * btree);
+extern unsigned long btree_value_fn(unsigned long key);
+extern unsigned long btree_key_fn(void * page);
+extern void btree_print_fn(unsigned long key);
+extern int btree_preload(gfp_t gfp_mask);
+extern int btree_tagged(btree * btree, int tag);
+
+static inline void btree_preload_end(void)
+{
+        preempt_enable();
+}
+
+#define BTREE_INIT(mask) { \
+        .order          = BTREE_ORDER,                            \
+        .root           = NULL,                                   \
+        .value          = btree_value_fn,                         \
+        .key            = btree_key_fn,                           \
+        .print_key      = btree_print_fn,                         \
+        .gfp_mask       = (mask)                                  \
+}
+
+#define INIT_BTREE(btree,mask)                            \
+do {                                                      \
+        (btree)->order = BTREE_ORDER;                     \
+        (btree)->root = NULL;                             \
+        (btree)->value            = btree_value_fn;       \
+        (btree)->key              = btree_key_fn;         \
+        (btree)->print_key        = btree_print_fn;       \
+        (btree)->gfp_mask         = (mask);               \
+} while (0)
+
+
+extern void print_subtree(btree * btree,btree_node * node);
+
+#endif
diff -urN linux-2.6.16.2/include/linux/fs.h linux-2.6.16.2-btree/include/linux/fs.h
--- linux-2.6.16.2/include/linux/fs.h	2006-04-07 12:56:47.000000000 -0400
+++ linux-2.6.16.2-btree/include/linux/fs.h	2006-08-05 10:57:12.000000000 -0400
@@ -214,6 +214,7 @@
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/radix-tree.h>
+#include <linux/btree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
 #include <linux/sched.h>
@@ -372,6 +373,7 @@
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
+        struct btree            btree;          /* BTree of all pages*/
 	rwlock_t		tree_lock;	/* and rwlock protecting it */
 	unsigned int		i_mmap_writable;/* count VM_SHARED mappings */
 	struct prio_tree_root	i_mmap;		/* tree of private and shared mappings */

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2006-08-07  2:18 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-07-18  2:02 Generic B-tree implementation Vishal Patil
2006-07-18  2:58 ` Horst von Brand
2006-07-18  3:08   ` Vishal Patil
2006-07-18  3:20     ` Valdis.Kletnieks
2006-07-18  4:27 ` Gary Funck
2006-07-18 13:30   ` Vishal Patil
2006-07-18 15:00     ` Gary Funck
2006-07-18 15:13       ` Bob Copeland
2006-07-18 15:22       ` Vishal Patil
2006-07-19  7:33         ` Anton Altaparmakov
2006-07-19 13:34           ` Vishal Patil
2006-07-19 16:14             ` Andrea Arcangeli
2006-07-19 16:26               ` Vishal Patil
2006-08-07  2:18                 ` Vishal Patil

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox