public inbox for linux-btrfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] initial version of reference cache
@ 2008-07-25 19:29 Yan Zheng
  2008-07-25 23:38 ` Chris Mason
  2008-07-28 15:09 ` Chris Mason
  0 siblings, 2 replies; 9+ messages in thread
From: Yan Zheng @ 2008-07-25 19:29 UTC (permalink / raw)
  To: linux-btrfs

Hello,

This is the initial version of the leaf reference cache. The cache stores a leaf node's extent references in memory, which can improve the performance of snapshot dropping. The outline of this patch is: (1) allocate struct dirty_root when starting a transaction, (2) put the reference cache in struct dirty_root, (3) cache extent references when tree leaves are cow'ed, (4) when dropping a snapshot, use the cached references directly to avoid reading the tree leaf.

I can only access a notebook currently, so benchmarking isn't enough. I appreciate any help and comments.

Regards
YZ

---
diff -r eb4767aa190e Makefile
--- a/Makefile	Thu Jul 24 12:25:50 2008 -0400
+++ b/Makefile	Sat Jul 26 01:07:24 2008 +0800
@@ -6,7 +6,8 @@ btrfs-y := super.o ctree.o extent-tree.o
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+	   ref-cache.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff -r eb4767aa190e ctree.c
--- a/ctree.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/ctree.c	Sat Jul 26 00:46:09 2008 +0800
@@ -165,7 +165,7 @@ int btrfs_copy_root(struct btrfs_trans_h
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf);
+	ret = btrfs_inc_ref(trans, new_root, buf, 0);
 	kfree(new_root);
 
 	if (ret)
@@ -232,7 +232,7 @@ int __btrfs_cow_block(struct btrfs_trans
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		different_trans = 1;
-		ret = btrfs_inc_ref(trans, root, buf);
+		ret = btrfs_inc_ref(trans, root, buf, 1);
 		if (ret)
 			return ret;
 	} else {
diff -r eb4767aa190e ctree.h
--- a/ctree.h	Thu Jul 24 12:25:50 2008 -0400
+++ b/ctree.h	Sat Jul 26 00:46:09 2008 +0800
@@ -592,6 +592,10 @@ struct btrfs_fs_info {
 	u64 last_alloc;
 	u64 last_data_alloc;
 
+	spinlock_t ref_cache_lock;
+	u64 total_ref_cache_size;
+	u64 running_ref_cache_size;
+
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
 	u64 avail_system_alloc_bits;
@@ -613,6 +617,8 @@ struct btrfs_root {
 	spinlock_t node_lock;
 
 	struct extent_buffer *commit_root;
+	struct btrfs_leaf_ref_tree *ref_tree;
+
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
@@ -1430,7 +1436,7 @@ int btrfs_reserve_extent(struct btrfs_tr
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf);
+		  struct extent_buffer *buf, int cache_ref);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 bytenr, u64 num_bytes,
 		      u64 root_objectid, u64 ref_generation,
diff -r eb4767aa190e disk-io.c
--- a/disk-io.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/disk-io.c	Sat Jul 26 00:46:09 2008 +0800
@@ -716,6 +716,7 @@ static int __setup_root(u32 nodesize, u3
 	root->node = NULL;
 	root->inode = NULL;
 	root->commit_root = NULL;
+	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -1165,12 +1166,19 @@ static int transaction_kthread(void *arg
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
+		printk("btrfs: total reference cache size %Lu\n",
+			root->fs_info->total_ref_cache_size);
+
 		mutex_lock(&root->fs_info->trans_mutex);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
 			mutex_unlock(&root->fs_info->trans_mutex);
 			goto sleep;
 		}
+
+		printk("btrfs: running reference cache size %Lu\n",
+			root->fs_info->running_ref_cache_size);
+
 		now = get_seconds();
 		if (now < cur->start_time || now - cur->start_time < 30) {
 			mutex_unlock(&root->fs_info->trans_mutex);
@@ -1233,6 +1241,7 @@ struct btrfs_root *open_ctree(struct sup
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
+	spin_lock_init(&fs_info->ref_cache_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1699,6 +1708,11 @@ int close_ctree(struct btrfs_root *root)
 		printk("btrfs: at unmount delalloc count %Lu\n",
 		       fs_info->delalloc_bytes);
 	}
+	if (fs_info->total_ref_cache_size) {
+		printk("btrfs: at umount reference cache size %Lu\n",
+			fs_info->total_ref_cache_size);
+	}
+	
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
 
diff -r eb4767aa190e extent-tree.c
--- a/extent-tree.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/extent-tree.c	Sat Jul 26 02:01:27 2008 +0800
@@ -26,6 +26,7 @@
 #include "transaction.h"
 #include "volumes.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
@@ -927,7 +928,7 @@ out:
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf)
+		  struct extent_buffer *buf, int cache_ref)
 {
 	u64 bytenr;
 	u32 nritems;
@@ -937,6 +938,7 @@ int btrfs_inc_ref(struct btrfs_trans_han
 	int level;
 	int ret;
 	int faili;
+	int nr_file_extents = 0;
 
 	if (!root->ref_cows)
 		return 0;
@@ -959,6 +961,9 @@ int btrfs_inc_ref(struct btrfs_trans_han
 			if (disk_bytenr == 0)
 				continue;
 
+			if (buf != root->commit_root)
+				nr_file_extents++;
+
 			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
 				    btrfs_file_extent_disk_num_bytes(buf, fi),
@@ -988,6 +993,53 @@ int btrfs_inc_ref(struct btrfs_trans_han
 			}
 		}
 	}
+	/* cache orignal leaf block's references */
+	if (cache_ref && nr_file_extents > 0) {
+		struct btrfs_leaf_ref *ref;
+		struct btrfs_extent_info *info;
+
+		ref = btrfs_alloc_leaf_ref(nr_file_extents);
+		if (!ref) {
+			WARN_ON(1);
+			goto out;
+		}
+
+		btrfs_item_key_to_cpu(buf, &ref->key, 0);
+
+		ref->bytenr = buf->start;
+		ref->owner = btrfs_header_owner(buf);
+		ref->generation = btrfs_header_generation(buf);
+		ref->nritems = nr_file_extents;
+		info = ref->extents;
+		
+		for (i = 0; i < nritems; i++) {
+			u64 disk_bytenr;
+			btrfs_item_key_to_cpu(buf, &key, i);
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf, i,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(buf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
+			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (disk_bytenr == 0)
+				continue;
+
+			info->bytenr = disk_bytenr;
+			info->num_bytes =
+				btrfs_file_extent_disk_num_bytes(buf, fi);
+			info->objectid = key.objectid;
+			info->offset = key.offset;
+			info++;
+		}
+
+		BUG_ON(!root->ref_tree);
+		ret = btrfs_add_leaf_ref(root, ref);
+		WARN_ON(ret);
+		btrfs_free_leaf_ref(ref);
+	}
+out:
 	return 0;
 fail:
 	WARN_ON(1);
@@ -2215,9 +2267,9 @@ struct extent_buffer *btrfs_alloc_free_b
 	return buf;
 }
 
-static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  struct extent_buffer *leaf)
+static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
+				  	   struct btrfs_root *root,
+					   struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
@@ -2266,6 +2318,30 @@ static int noinline drop_leaf_ref(struct
 	return 0;
 }
 
+static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
+				  	 struct btrfs_root *root,
+					 struct btrfs_leaf_ref *ref)
+{
+	int i;
+	int ret;
+	struct btrfs_extent_info *info = ref->extents;
+
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	for (i = 0; i < ref->nritems; i++) {
+		mutex_lock(&root->fs_info->alloc_mutex);
+		ret = __btrfs_free_extent(trans, root,
+					info->bytenr, info->num_bytes,
+					ref->owner, ref->generation,
+					info->objectid, info->offset, 0);
+		mutex_unlock(&root->fs_info->alloc_mutex);
+		BUG_ON(ret);
+		info++;
+	}
+	mutex_lock(&root->fs_info->alloc_mutex);
+
+	return 0;
+}
+
 static void noinline reada_walk_down(struct btrfs_root *root,
 				     struct extent_buffer *node,
 				     int slot)
@@ -2341,6 +2417,7 @@ static int noinline walk_down_tree(struc
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
+	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -2370,7 +2447,7 @@ static int noinline walk_down_tree(struc
 		    btrfs_header_nritems(cur))
 			break;
 		if (*level == 0) {
-			ret = drop_leaf_ref(trans, root, cur);
+			ret = drop_leaf_ref_no_cache(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
@@ -2391,6 +2468,21 @@ static int noinline walk_down_tree(struc
 			BUG_ON(ret);
 			continue;
 		}
+		
+		if (*level == 1) {
+			struct btrfs_key key;
+			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
+			ref = btrfs_lookup_leaf_ref(root, &key);
+			if (ref) {
+				ret = drop_leaf_ref(trans, root, ref);
+				BUG_ON(ret);
+				btrfs_remove_leaf_ref(root, ref);
+				btrfs_free_leaf_ref(ref);
+				*level = 0;
+				break;
+			}
+		}
+
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
@@ -2435,17 +2527,19 @@ out:
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
 	if (path->nodes[*level] == root->node) {
-		root_owner = root->root_key.objectid;
 		parent = path->nodes[*level];
+		bytenr = path->nodes[*level]->start;
 	} else {
 		parent = path->nodes[*level + 1];
-		root_owner = btrfs_header_owner(parent);
-	}
-
+		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
+	}
+
+	blocksize = btrfs_level_size(root, *level);
+	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
-	ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start,
-				path->nodes[*level]->len,
-				root_owner, root_gen, 0, 0, 1);
+
+	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+				  root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
diff -r eb4767aa190e transaction.c
--- a/transaction.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/transaction.c	Sat Jul 26 00:46:10 2008 +0800
@@ -24,12 +24,20 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
 #define BTRFS_ROOT_TRANS_TAG 0
+
+struct dirty_root {
+	struct list_head list;
+	struct btrfs_root *root;
+	struct btrfs_root *latest_root;
+	struct btrfs_leaf_ref_tree ref_tree;
+};
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -84,6 +92,7 @@ static noinline int join_transaction(str
 
 static noinline int record_root_in_trans(struct btrfs_root *root)
 {
+	struct dirty_root *dirty;
 	u64 running_trans_id = root->fs_info->running_transaction->transid;
 	if (root->ref_cows && root->last_trans < running_trans_id) {
 		WARN_ON(root == root->fs_info->extent_root);
@@ -91,7 +100,25 @@ static noinline int record_root_in_trans
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 				   (unsigned long)root->root_key.objectid,
 				   BTRFS_ROOT_TRANS_TAG);
+
+			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
+			BUG_ON(!dirty);
+			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
+			BUG_ON(!dirty->root);
+
+			dirty->latest_root = root;
+			INIT_LIST_HEAD(&dirty->list);
+			btrfs_leaf_ref_tree_init(&dirty->ref_tree);
+			dirty->ref_tree.generation = running_trans_id;
+
 			root->commit_root = btrfs_root_node(root);
+			root->ref_tree = &dirty->ref_tree;
+
+			memcpy(dirty->root, root, sizeof(*root));
+			spin_lock_init(&dirty->root->node_lock);
+			mutex_init(&dirty->root->objectid_mutex);
+			dirty->root->node = root->commit_root;
+			dirty->root->commit_root = NULL;
 		} else {
 			WARN_ON(1);
 		}
@@ -310,12 +337,6 @@ int btrfs_commit_tree_roots(struct btrfs
 	return 0;
 }
 
-struct dirty_root {
-	struct list_head list;
-	struct btrfs_root *root;
-	struct btrfs_root *latest_root;
-};
-
 int btrfs_add_dead_root(struct btrfs_root *root,
 			struct btrfs_root *latest,
 			struct list_head *dead_list)
@@ -325,8 +346,10 @@ int btrfs_add_dead_root(struct btrfs_roo
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
 		return -ENOMEM;
+	btrfs_leaf_ref_tree_init(&dirty->ref_tree);
 	dirty->root = root;
 	dirty->latest_root = latest;
+	root->ref_tree = NULL;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -354,11 +377,23 @@ static noinline int add_dirty_roots(stru
 			radix_tree_tag_clear(radix,
 				     (unsigned long)root->root_key.objectid,
 				     BTRFS_ROOT_TRANS_TAG);
+
+			BUG_ON(!root->ref_tree);
+			dirty = container_of(root->ref_tree, struct dirty_root,
+					     ref_tree);
+
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
 					btrfs_root_bytenr(&root->root_item));
+
+				BUG_ON(!btrfs_leaf_ref_tree_empty(
+							root->ref_tree));
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
+				root->ref_tree = NULL;
+				
+				kfree(dirty->root);
+				kfree(dirty);
 
 				/* make sure to update the root on disk
 				 * so we get any updates to the block used
@@ -370,23 +405,12 @@ static noinline int add_dirty_roots(stru
 						&root->root_item);
 				continue;
 			}
-			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
-			BUG_ON(!dirty);
-			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
-			BUG_ON(!dirty->root);
 
 			memset(&root->root_item.drop_progress, 0,
 			       sizeof(struct btrfs_disk_key));
 			root->root_item.drop_level = 0;
-
-			memcpy(dirty->root, root, sizeof(*root));
-			dirty->root->node = root->commit_root;
-			dirty->latest_root = root;
-			spin_lock_init(&dirty->root->node_lock);
-			mutex_init(&dirty->root->objectid_mutex);
-
 			root->commit_root = NULL;
-
+			root->ref_tree = NULL;
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_bytenr(&root->root_item,
 					      root->node->start);
@@ -409,6 +433,7 @@ static noinline int add_dirty_roots(stru
 				list_add(&dirty->list, list);
 			} else {
 				WARN_ON(1);
+				free_extent_buffer(dirty->root->node);
 				kfree(dirty->root);
 				kfree(dirty);
 			}
@@ -514,6 +539,9 @@ static noinline int drop_dirty_roots(str
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
+		if (dirty->root->ref_tree)
+			WARN_ON(!btrfs_leaf_ref_tree_empty(dirty->root->ref_tree));
+	
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
@@ -697,6 +725,10 @@ int btrfs_commit_transaction(struct btrf
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
+
+	spin_lock(&root->fs_info->ref_cache_lock);
+	root->fs_info->running_ref_cache_size = 0;
+	spin_unlock(&root->fs_info->ref_cache_lock);
 
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] initial version of reference cache
@ 2008-07-25 20:57 Yan Zheng
  0 siblings, 0 replies; 9+ messages in thread
From: Yan Zheng @ 2008-07-25 20:57 UTC (permalink / raw)
  To: linux-btrfs

I missed two newly created files in the previous patch; please use this one instead. Thanks

---
diff -r eb4767aa190e Makefile
--- a/Makefile	Thu Jul 24 12:25:50 2008 -0400
+++ b/Makefile	Sat Jul 26 03:47:26 2008 +0800
@@ -6,7 +6,8 @@ btrfs-y := super.o ctree.o extent-tree.o
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+	   ref-cache.o

 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff -r eb4767aa190e ctree.c
--- a/ctree.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/ctree.c	Sat Jul 26 03:47:26 2008 +0800
@@ -165,7 +165,7 @@ int btrfs_copy_root(struct btrfs_trans_h
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);

 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf);
+	ret = btrfs_inc_ref(trans, new_root, buf, 0);
 	kfree(new_root);

 	if (ret)
@@ -232,7 +232,7 @@ int __btrfs_cow_block(struct btrfs_trans
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		different_trans = 1;
-		ret = btrfs_inc_ref(trans, root, buf);
+		ret = btrfs_inc_ref(trans, root, buf, 1);
 		if (ret)
 			return ret;
 	} else {
diff -r eb4767aa190e ctree.h
--- a/ctree.h	Thu Jul 24 12:25:50 2008 -0400
+++ b/ctree.h	Sat Jul 26 03:47:26 2008 +0800
@@ -592,6 +592,10 @@ struct btrfs_fs_info {
 	u64 last_alloc;
 	u64 last_data_alloc;

+	spinlock_t ref_cache_lock;
+	u64 total_ref_cache_size;
+	u64 running_ref_cache_size;
+
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
 	u64 avail_system_alloc_bits;
@@ -613,6 +617,8 @@ struct btrfs_root {
 	spinlock_t node_lock;

 	struct extent_buffer *commit_root;
+	struct btrfs_leaf_ref_tree *ref_tree;
+
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
@@ -1430,7 +1436,7 @@ int btrfs_reserve_extent(struct btrfs_tr
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf);
+		  struct extent_buffer *buf, int cache_ref);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 bytenr, u64 num_bytes,
 		      u64 root_objectid, u64 ref_generation,
diff -r eb4767aa190e disk-io.c
--- a/disk-io.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/disk-io.c	Sat Jul 26 03:47:26 2008 +0800
@@ -716,6 +716,7 @@ static int __setup_root(u32 nodesize, u3
 	root->node = NULL;
 	root->inode = NULL;
 	root->commit_root = NULL;
+	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -1165,12 +1166,19 @@ static int transaction_kthread(void *arg
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);

+		printk("btrfs: total reference cache size %Lu\n",
+			root->fs_info->total_ref_cache_size);
+
 		mutex_lock(&root->fs_info->trans_mutex);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
 			mutex_unlock(&root->fs_info->trans_mutex);
 			goto sleep;
 		}
+
+		printk("btrfs: running reference cache size %Lu\n",
+			root->fs_info->running_ref_cache_size);
+
 		now = get_seconds();
 		if (now < cur->start_time || now - cur->start_time < 30) {
 			mutex_unlock(&root->fs_info->trans_mutex);
@@ -1233,6 +1241,7 @@ struct btrfs_root *open_ctree(struct sup
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
+	spin_lock_init(&fs_info->ref_cache_lock);

 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1699,6 +1708,11 @@ int close_ctree(struct btrfs_root *root)
 		printk("btrfs: at unmount delalloc count %Lu\n",
 		       fs_info->delalloc_bytes);
 	}
+	if (fs_info->total_ref_cache_size) {
+		printk("btrfs: at umount reference cache size %Lu\n",
+			fs_info->total_ref_cache_size);
+	}
+	
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);

diff -r eb4767aa190e extent-tree.c
--- a/extent-tree.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/extent-tree.c	Sat Jul 26 03:47:26 2008 +0800
@@ -26,6 +26,7 @@
 #include "transaction.h"
 #include "volumes.h"
 #include "locking.h"
+#include "ref-cache.h"

 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
@@ -927,7 +928,7 @@ out:
 }

 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf)
+		  struct extent_buffer *buf, int cache_ref)
 {
 	u64 bytenr;
 	u32 nritems;
@@ -937,6 +938,7 @@ int btrfs_inc_ref(struct btrfs_trans_han
 	int level;
 	int ret;
 	int faili;
+	int nr_file_extents = 0;

 	if (!root->ref_cows)
 		return 0;
@@ -959,6 +961,9 @@ int btrfs_inc_ref(struct btrfs_trans_han
 			if (disk_bytenr == 0)
 				continue;

+			if (buf != root->commit_root)
+				nr_file_extents++;
+
 			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
 				    btrfs_file_extent_disk_num_bytes(buf, fi),
@@ -988,6 +993,53 @@ int btrfs_inc_ref(struct btrfs_trans_han
 			}
 		}
 	}
+	/* cache orignal leaf block's references */
+	if (cache_ref && nr_file_extents > 0) {
+		struct btrfs_leaf_ref *ref;
+		struct btrfs_extent_info *info;
+
+		ref = btrfs_alloc_leaf_ref(nr_file_extents);
+		if (!ref) {
+			WARN_ON(1);
+			goto out;
+		}
+
+		btrfs_item_key_to_cpu(buf, &ref->key, 0);
+
+		ref->bytenr = buf->start;
+		ref->owner = btrfs_header_owner(buf);
+		ref->generation = btrfs_header_generation(buf);
+		ref->nritems = nr_file_extents;
+		info = ref->extents;
+		
+		for (i = 0; i < nritems; i++) {
+			u64 disk_bytenr;
+			btrfs_item_key_to_cpu(buf, &key, i);
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf, i,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(buf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
+			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (disk_bytenr == 0)
+				continue;
+
+			info->bytenr = disk_bytenr;
+			info->num_bytes =
+				btrfs_file_extent_disk_num_bytes(buf, fi);
+			info->objectid = key.objectid;
+			info->offset = key.offset;
+			info++;
+		}
+
+		BUG_ON(!root->ref_tree);
+		ret = btrfs_add_leaf_ref(root, ref);
+		WARN_ON(ret);
+		btrfs_free_leaf_ref(ref);
+	}
+out:
 	return 0;
 fail:
 	WARN_ON(1);
@@ -2215,9 +2267,9 @@ struct extent_buffer *btrfs_alloc_free_b
 	return buf;
 }

-static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  struct extent_buffer *leaf)
+static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
+				  	   struct btrfs_root *root,
+					   struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
@@ -2266,6 +2318,30 @@ static int noinline drop_leaf_ref(struct
 	return 0;
 }

+static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
+				  	 struct btrfs_root *root,
+					 struct btrfs_leaf_ref *ref)
+{
+	int i;
+	int ret;
+	struct btrfs_extent_info *info = ref->extents;
+
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	for (i = 0; i < ref->nritems; i++) {
+		mutex_lock(&root->fs_info->alloc_mutex);
+		ret = __btrfs_free_extent(trans, root,
+					info->bytenr, info->num_bytes,
+					ref->owner, ref->generation,
+					info->objectid, info->offset, 0);
+		mutex_unlock(&root->fs_info->alloc_mutex);
+		BUG_ON(ret);
+		info++;
+	}
+	mutex_lock(&root->fs_info->alloc_mutex);
+
+	return 0;
+}
+
 static void noinline reada_walk_down(struct btrfs_root *root,
 				     struct extent_buffer *node,
 				     int slot)
@@ -2341,6 +2417,7 @@ static int noinline walk_down_tree(struc
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
+	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -2370,7 +2447,7 @@ static int noinline walk_down_tree(struc
 		    btrfs_header_nritems(cur))
 			break;
 		if (*level == 0) {
-			ret = drop_leaf_ref(trans, root, cur);
+			ret = drop_leaf_ref_no_cache(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
@@ -2391,6 +2468,21 @@ static int noinline walk_down_tree(struc
 			BUG_ON(ret);
 			continue;
 		}
+		
+		if (*level == 1) {
+			struct btrfs_key key;
+			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
+			ref = btrfs_lookup_leaf_ref(root, &key);
+			if (ref) {
+				ret = drop_leaf_ref(trans, root, ref);
+				BUG_ON(ret);
+				btrfs_remove_leaf_ref(root, ref);
+				btrfs_free_leaf_ref(ref);
+				*level = 0;
+				break;
+			}
+		}
+
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
@@ -2435,17 +2527,19 @@ out:
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);

 	if (path->nodes[*level] == root->node) {
-		root_owner = root->root_key.objectid;
 		parent = path->nodes[*level];
+		bytenr = path->nodes[*level]->start;
 	} else {
 		parent = path->nodes[*level + 1];
-		root_owner = btrfs_header_owner(parent);
-	}
-
+		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
+	}
+
+	blocksize = btrfs_level_size(root, *level);
+	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
-	ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start,
-				path->nodes[*level]->len,
-				root_owner, root_gen, 0, 0, 1);
+
+	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+				  root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
diff -r eb4767aa190e ref-cache.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ref-cache.c	Fri Jul 25 21:56:56 2008 +0800
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "ref-cache.h"
+#include "transaction.h"
+
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
+{
+	struct btrfs_leaf_ref *ref;
+
+	ref = kmalloc(btrfs_leaf_ref_size(nr_extents), GFP_NOFS);
+	if (ref) {
+		memset(ref, 0, sizeof(*ref));
+		atomic_set(&ref->usage, 1);
+	}
+	return ref;
+}
+
+void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref)
+{
+	if (!ref)
+		return;
+	WARN_ON(atomic_read(&ref->usage) == 0);
+	if (atomic_dec_and_test(&ref->usage)) {
+		BUG_ON(ref->in_tree);
+		kfree(ref);
+	}
+}
+
+static int comp_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{
+	if (k1->objectid > k2->objectid)
+		return 1;
+	if (k1->objectid < k2->objectid)
+		return -1;
+	if (k1->type > k2->type)
+		return 1;
+	if (k1->type < k2->type)
+		return -1;
+	if (k1->offset > k2->offset)
+		return 1;
+	if (k1->offset < k2->offset)
+		return -1;
+	return 0;
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, struct btrfs_key *key,
+				   struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct btrfs_leaf_ref *entry;
+	int ret;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		ret = comp_keys(key, &entry->key);
+		if (ret < 0)
+			p = &(*p)->rb_left;
+		else if (ret > 0)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+	
+	entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
+	entry->in_tree = 1;
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, struct btrfs_key *key)
+{
+	struct rb_node * n = root->rb_node;
+	struct btrfs_leaf_ref *entry;
+	int ret;
+
+	while(n) {
+		entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		ret = comp_keys(key, &entry->key);
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	return NULL;
+}
+
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+					     struct btrfs_key *key)
+{
+	struct rb_node *rb;
+	struct btrfs_leaf_ref *ref = NULL;
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+	if (!tree)
+		return NULL;
+
+	spin_lock(&tree->lock);
+	if (tree->last && comp_keys(key, &tree->last->key) == 0) {
+		ref = tree->last;
+	} else {
+		rb = tree_search(&tree->root, key);
+		if (rb) {
+			ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+			tree->last = ref;
+		}
+	}
+	if (ref)
+		atomic_inc(&ref->usage);
+	spin_unlock(&tree->lock);
+	return ref;
+}
+
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+	int ret = 0;
+	struct rb_node *rb;
+	size_t size = btrfs_leaf_ref_size(ref->nritems);
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+	struct btrfs_transaction *trans = root->fs_info->running_transaction;
+
+	spin_lock(&tree->lock);
+	rb = tree_insert(&tree->root, &ref->key, &ref->rb_node);
+	if (rb) {
+		ret = -EEXIST;
+	} else {
+		spin_lock(&root->fs_info->ref_cache_lock);
+		root->fs_info->total_ref_cache_size += size;
+		if (trans && tree->generation == trans->transid)
+			root->fs_info->running_ref_cache_size += size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
+
+		tree->last = ref;
+		atomic_inc(&ref->usage);
+	}
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+	size_t size = btrfs_leaf_ref_size(ref->nritems);
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+	struct btrfs_transaction *trans = root->fs_info->running_transaction;
+
+	BUG_ON(!ref->in_tree);
+	spin_lock(&tree->lock);
+	rb_erase(&ref->rb_node, &tree->root);
+	ref->in_tree = 0;
+	
+	spin_lock(&root->fs_info->ref_cache_lock);
+	root->fs_info->total_ref_cache_size -= size;
+	if (trans && tree->generation == trans->transid)
+		root->fs_info->running_ref_cache_size -= size;
+	spin_unlock(&root->fs_info->ref_cache_lock);
+
+	if (tree->last == ref) {
+		struct rb_node *next = rb_next(&ref->rb_node);
+		if (next) {
+			tree->last = rb_entry(next, struct btrfs_leaf_ref,
+					      rb_node);
+		} else
+			tree->last = NULL;
+	}
+	spin_unlock(&tree->lock);
+
+	btrfs_free_leaf_ref(ref);
+	return 0;
+}
+
diff -r eb4767aa190e ref-cache.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ref-cache.h	Fri Jul 25 21:58:24 2008 +0800
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+struct btrfs_extent_info {
+	u64 bytenr;
+	u64 num_bytes;
+	u64 objectid;
+	u64 offset;
+};
+
+struct btrfs_leaf_ref {
+	struct rb_node rb_node;
+	struct btrfs_key key;
+	int in_tree;
+	atomic_t usage;
+
+	u64 bytenr;
+	u64 owner;
+	u64 generation;
+	int nritems;
+	struct btrfs_extent_info extents[];
+};
+
+struct btrfs_leaf_ref_tree {
+	struct rb_root root;
+	struct btrfs_leaf_ref *last;
+	u64 generation;
+	spinlock_t lock;
+};
+
+static inline size_t btrfs_leaf_ref_size(int nr_extents)
+{
+	return sizeof(struct btrfs_leaf_ref) +
+	       sizeof(struct btrfs_extent_info) * nr_extents;
+}
+
+static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
+{
+	tree->root.rb_node = NULL;
+	tree->last = NULL;
+	tree->generation = 0;
+	spin_lock_init(&tree->lock);
+}
+
+static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
+{
+	return RB_EMPTY_ROOT(&tree->root);
+}
+
+void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+					     struct btrfs_key *key);
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
diff -r eb4767aa190e transaction.c
--- a/transaction.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/transaction.c	Sat Jul 26 03:47:26 2008 +0800
@@ -24,12 +24,20 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "locking.h"
+#include "ref-cache.h"

 static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;

 #define BTRFS_ROOT_TRANS_TAG 0
+
+struct dirty_root {
+	struct list_head list;
+	struct btrfs_root *root;
+	struct btrfs_root *latest_root;
+	struct btrfs_leaf_ref_tree ref_tree;
+};

 static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -84,6 +92,7 @@ static noinline int join_transaction(str

 static noinline int record_root_in_trans(struct btrfs_root *root)
 {
+	struct dirty_root *dirty;
 	u64 running_trans_id = root->fs_info->running_transaction->transid;
 	if (root->ref_cows && root->last_trans < running_trans_id) {
 		WARN_ON(root == root->fs_info->extent_root);
@@ -91,7 +100,25 @@ static noinline int record_root_in_trans
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 				   (unsigned long)root->root_key.objectid,
 				   BTRFS_ROOT_TRANS_TAG);
+
+			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
+			BUG_ON(!dirty);
+			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
+			BUG_ON(!dirty->root);
+
+			dirty->latest_root = root;
+			INIT_LIST_HEAD(&dirty->list);
+			btrfs_leaf_ref_tree_init(&dirty->ref_tree);
+			dirty->ref_tree.generation = running_trans_id;
+
 			root->commit_root = btrfs_root_node(root);
+			root->ref_tree = &dirty->ref_tree;
+
+			memcpy(dirty->root, root, sizeof(*root));
+			spin_lock_init(&dirty->root->node_lock);
+			mutex_init(&dirty->root->objectid_mutex);
+			dirty->root->node = root->commit_root;
+			dirty->root->commit_root = NULL;
 		} else {
 			WARN_ON(1);
 		}
@@ -310,12 +337,6 @@ int btrfs_commit_tree_roots(struct btrfs
 	return 0;
 }

-struct dirty_root {
-	struct list_head list;
-	struct btrfs_root *root;
-	struct btrfs_root *latest_root;
-};
-
 int btrfs_add_dead_root(struct btrfs_root *root,
 			struct btrfs_root *latest,
 			struct list_head *dead_list)
@@ -325,8 +346,10 @@ int btrfs_add_dead_root(struct btrfs_roo
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
 		return -ENOMEM;
+	btrfs_leaf_ref_tree_init(&dirty->ref_tree);
 	dirty->root = root;
 	dirty->latest_root = latest;
+	root->ref_tree = NULL;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -354,11 +377,23 @@ static noinline int add_dirty_roots(stru
 			radix_tree_tag_clear(radix,
 				     (unsigned long)root->root_key.objectid,
 				     BTRFS_ROOT_TRANS_TAG);
+
+			BUG_ON(!root->ref_tree);
+			dirty = container_of(root->ref_tree, struct dirty_root,
+					     ref_tree);
+
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
 					btrfs_root_bytenr(&root->root_item));
+
+				BUG_ON(!btrfs_leaf_ref_tree_empty(
+							root->ref_tree));
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
+				root->ref_tree = NULL;
+				
+				kfree(dirty->root);
+				kfree(dirty);

 				/* make sure to update the root on disk
 				 * so we get any updates to the block used
@@ -370,23 +405,12 @@ static noinline int add_dirty_roots(stru
 						&root->root_item);
 				continue;
 			}
-			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
-			BUG_ON(!dirty);
-			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
-			BUG_ON(!dirty->root);

 			memset(&root->root_item.drop_progress, 0,
 			       sizeof(struct btrfs_disk_key));
 			root->root_item.drop_level = 0;
-
-			memcpy(dirty->root, root, sizeof(*root));
-			dirty->root->node = root->commit_root;
-			dirty->latest_root = root;
-			spin_lock_init(&dirty->root->node_lock);
-			mutex_init(&dirty->root->objectid_mutex);
-
 			root->commit_root = NULL;
-
+			root->ref_tree = NULL;
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_bytenr(&root->root_item,
 					      root->node->start);
@@ -409,6 +433,7 @@ static noinline int add_dirty_roots(stru
 				list_add(&dirty->list, list);
 			} else {
 				WARN_ON(1);
+				free_extent_buffer(dirty->root->node);
 				kfree(dirty->root);
 				kfree(dirty);
 			}
@@ -514,6 +539,9 @@ static noinline int drop_dirty_roots(str
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);

+		if (dirty->root->ref_tree)
+			WARN_ON(!btrfs_leaf_ref_tree_empty(dirty->root->ref_tree));
+	
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
@@ -697,6 +725,10 @@ int btrfs_commit_transaction(struct btrf
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
+
+	spin_lock(&root->fs_info->ref_cache_lock);
+	root->fs_info->running_ref_cache_size = 0;
+	spin_unlock(&root->fs_info->ref_cache_lock);

 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] initial version of reference cache
  2008-07-25 19:29 [PATCH] initial version of reference cache Yan Zheng
@ 2008-07-25 23:38 ` Chris Mason
  2008-07-26 14:26   ` Yan Zheng
  2008-07-28 15:09 ` Chris Mason
  1 sibling, 1 reply; 9+ messages in thread
From: Chris Mason @ 2008-07-25 23:38 UTC (permalink / raw)
  To: Yan Zheng; +Cc: linux-btrfs

On Fri, 2008-07-25 at 14:29 -0500, Yan Zheng wrote:
> Hello,
> 
> This is the initial version of leaf reference cache. The cache stores leaf node's extent references in memory, this can improve the performance of snapshot dropping. Outlines of this patch are (1) allocate struct dirty_root when starting transaction (2) put reference cache in struct dirty_root (3) cache extent references when tree leaves are cow'ed (4) when dropping snapshot, use cached references directly to avoid reading tree leaf. 
> 
> I only can access a notebook currenly, so benchmarking isn't enough. I appreciate any help and comment.
> 

I have modified this locally to always cache leaves, even when they
don't have file extents in them.  That way, walk_down_tree will find the
cache and won't have to read the leaf (that doesn't have any extents).

So far, it is working very well.  I did a run with fs_mark to create 58
million files and had very steady numbers.  The unmount took 4 seconds.
It used to take over an hour.

One question, why not use the block number (byte number) as the key to
the rbtree instead of the key?

-chris



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] initial version of reference cache
  2008-07-25 23:38 ` Chris Mason
@ 2008-07-26 14:26   ` Yan Zheng
  0 siblings, 0 replies; 9+ messages in thread
From: Yan Zheng @ 2008-07-26 14:26 UTC (permalink / raw)
  To: Chris Mason; +Cc: linux-btrfs

2008/7/26 Chris Mason <chris.mason@oracle.com>:
> I have modified this locally to always cache leaves, even when they
> don't have file extents in them.  That way, walk_down_tree will find the
> cache and won't have to read the leaf (that doesn't have any extents).
>
> So far, it is working very well.  I did a run with fs_mark to create 58
> million files and had very steady numbers.  The unmount took 4 seconds.
> It used to take over an hour.
>
> One question, why not use the block number (byte number) as the key to
> the rbtree instead of the key?
>

When dropping old snapshots, tree leaves are processed in ascending
order of btrfs_key. After a given tree leaf is processed, we remove
the corresponding cache entry and update tree->last to point to the
next entry in the tree. Therefore btrfs_lookup_leaf_ref can find the
wanted entry at tree->last in most cases.

Regards
YZ

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] initial version of reference cache
  2008-07-25 19:29 [PATCH] initial version of reference cache Yan Zheng
  2008-07-25 23:38 ` Chris Mason
@ 2008-07-28 15:09 ` Chris Mason
       [not found]   ` <f058a9c30807280952m2386aad4pa8a08ffaf930c370@mail.gmail.com>
       [not found]   ` <4891C236.3000604@redhat.com>
  1 sibling, 2 replies; 9+ messages in thread
From: Chris Mason @ 2008-07-28 15:09 UTC (permalink / raw)
  To: Yan Zheng; +Cc: linux-btrfs

[-- Attachment #1: Type: text/plain, Size: 356 bytes --]

Yan and I are hammering this out a little; I've attached my current
patches.

I was seeing cache misses after long stress runs, which I think are
coming from references on the higher levels of the tree making us skip
some leaves while dropping the transaction that added them.

My new version uses a single cache per root, and should avoid these
misses.



[-- Attachment #2: fixes --]
[-- Type: text/x-patch, Size: 3568 bytes --]

diff -r c038dde2ad20 extent-tree.c
--- a/extent-tree.c	Fri Jul 25 15:58:39 2008 -0400
+++ b/extent-tree.c	Sun Jul 27 06:39:00 2008 -0400
@@ -994,7 +994,7 @@ int btrfs_inc_ref(struct btrfs_trans_han
 		}
 	}
 	/* cache orignal leaf block's references */
-	if (cache_ref && nr_file_extents > 0) {
+	if (level == 0 && cache_ref && buf != root->commit_root) {
 		struct btrfs_leaf_ref *ref;
 		struct btrfs_extent_info *info;
 
@@ -1012,7 +1012,7 @@ int btrfs_inc_ref(struct btrfs_trans_han
 		ref->nritems = nr_file_extents;
 		info = ref->extents;
 		
-		for (i = 0; i < nritems; i++) {
+		for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
 			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -2490,7 +2490,6 @@ static int noinline walk_down_tree(struc
 
 			if (path->slots[*level] == 0)
 				reada_walk_down(root, cur, path->slots[*level]);
-
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
 			cond_resched();
diff -r c038dde2ad20 ref-cache.c
--- a/ref-cache.c	Fri Jul 25 15:58:39 2008 -0400
+++ b/ref-cache.c	Sun Jul 27 06:39:00 2008 -0400
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/sched.h>
 #include "ctree.h"
 #include "ref-cache.h"
 #include "transaction.h"
@@ -110,6 +111,34 @@ static struct rb_node *tree_search(struc
 	return NULL;
 }
 
+int btrfs_remove_leaf_refs(struct btrfs_root *root)
+{
+	struct rb_node *rb;
+	struct btrfs_leaf_ref *ref = NULL;
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+	if (!tree)
+		return 0;
+
+	spin_lock(&tree->lock);
+	while(!btrfs_leaf_ref_tree_empty(tree)) {
+		tree->last = NULL;
+		rb = rb_first(&tree->root);
+		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+		rb_erase(&ref->rb_node, &tree->root);
+		ref->in_tree = 0;
+
+		spin_unlock(&tree->lock);
+
+		btrfs_free_leaf_ref(ref);
+
+		cond_resched();
+		spin_lock(&tree->lock);
+	}
+	spin_unlock(&tree->lock);
+	return 0;
+}
+
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 					     struct btrfs_key *key)
 {
@@ -170,8 +199,6 @@ int btrfs_remove_leaf_ref(struct btrfs_r
 
 	BUG_ON(!ref->in_tree);
 	spin_lock(&tree->lock);
-	rb_erase(&ref->rb_node, &tree->root);
-	ref->in_tree = 0;
 	
 	spin_lock(&root->fs_info->ref_cache_lock);
 	root->fs_info->total_ref_cache_size -= size;
@@ -187,6 +214,10 @@ int btrfs_remove_leaf_ref(struct btrfs_r
 		} else
 			tree->last = NULL;
 	}
+
+	rb_erase(&ref->rb_node, &tree->root);
+	ref->in_tree = 0;
+
 	spin_unlock(&tree->lock);
 
 	btrfs_free_leaf_ref(ref);
diff -r c038dde2ad20 ref-cache.h
--- a/ref-cache.h	Fri Jul 25 15:58:39 2008 -0400
+++ b/ref-cache.h	Sun Jul 27 06:39:00 2008 -0400
@@ -68,4 +68,5 @@ struct btrfs_leaf_ref *btrfs_lookup_leaf
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 					     struct btrfs_key *key);
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+int btrfs_remove_leaf_refs(struct btrfs_root *root);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
diff -r c038dde2ad20 transaction.c
--- a/transaction.c	Fri Jul 25 15:58:39 2008 -0400
+++ b/transaction.c	Sun Jul 27 06:39:00 2008 -0400
@@ -539,9 +539,8 @@ static noinline int drop_dirty_roots(str
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
-		if (dirty->root->ref_tree)
-			WARN_ON(!btrfs_leaf_ref_tree_empty(dirty->root->ref_tree));
-	
+		btrfs_remove_leaf_refs(dirty->root);
+
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);

[-- Attachment #3: one-per-root --]
[-- Type: text/x-patch, Size: 11126 bytes --]

diff -r cf052b443059 ctree.h
--- a/ctree.h	Sun Jul 27 06:39:00 2008 -0400
+++ b/ctree.h	Mon Jul 28 11:03:41 2008 -0400
@@ -594,7 +594,6 @@ struct btrfs_fs_info {
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
-	u64 running_ref_cache_size;
 
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
@@ -606,10 +605,17 @@ struct btrfs_fs_info {
 	void *bdev_holder;
 };
 
+struct btrfs_leaf_ref_tree {
+	struct rb_root root;
+	struct btrfs_leaf_ref *last;
+	spinlock_t lock;
+};
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
  */
+struct dirty_root;
 struct btrfs_root {
 	struct extent_buffer *node;
 
@@ -618,6 +624,8 @@ struct btrfs_root {
 
 	struct extent_buffer *commit_root;
 	struct btrfs_leaf_ref_tree *ref_tree;
+	struct btrfs_leaf_ref_tree ref_tree_struct;
+	struct dirty_root *dirty_root;
 
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
diff -r cf052b443059 disk-io.c
--- a/disk-io.c	Sun Jul 27 06:39:00 2008 -0400
+++ b/disk-io.c	Mon Jul 28 11:03:41 2008 -0400
@@ -40,6 +40,7 @@
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -737,6 +738,10 @@ static int __setup_root(u32 nodesize, u3
 	spin_lock_init(&root->node_lock);
 	spin_lock_init(&root->orphan_lock);
 	mutex_init(&root->objectid_mutex);
+
+	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+	root->ref_tree = &root->ref_tree_struct;
+
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -1176,9 +1181,6 @@ static int transaction_kthread(void *arg
 			goto sleep;
 		}
 
-		printk("btrfs: running reference cache size %Lu\n",
-			root->fs_info->running_ref_cache_size);
-
 		now = get_seconds();
 		if (now < cur->start_time || now - cur->start_time < 30) {
 			mutex_unlock(&root->fs_info->trans_mutex);
diff -r cf052b443059 extent-tree.c
--- a/extent-tree.c	Sun Jul 27 06:39:00 2008 -0400
+++ b/extent-tree.c	Mon Jul 28 11:03:41 2008 -0400
@@ -2468,11 +2468,11 @@ static int noinline walk_down_tree(struc
 			BUG_ON(ret);
 			continue;
 		}
-		
+
 		if (*level == 1) {
 			struct btrfs_key key;
 			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
-			ref = btrfs_lookup_leaf_ref(root, &key);
+			ref = btrfs_lookup_leaf_ref(root, ptr_gen, &key);
 			if (ref) {
 				ret = drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
@@ -2481,8 +2481,8 @@ static int noinline walk_down_tree(struc
 				*level = 0;
 				break;
 			}
+printk("extent cache miss bytenr %Lu gen %Lu\n", bytenr, ptr_gen);
 		}
-
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
diff -r cf052b443059 ref-cache.c
--- a/ref-cache.c	Sun Jul 27 06:39:00 2008 -0400
+++ b/ref-cache.c	Mon Jul 28 11:03:42 2008 -0400
@@ -44,8 +44,13 @@ void btrfs_free_leaf_ref(struct btrfs_le
 	}
 }
 
-static int comp_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+static int comp_keys(u64 gen1, u64 gen2, struct btrfs_key *k1,
+		     struct btrfs_key *k2)
 {
+	if (gen1 > gen2)
+		return 1;
+	if (gen1 < gen2)
+		return -1;
 	if (k1->objectid > k2->objectid)
 		return 1;
 	if (k1->objectid < k2->objectid)
@@ -61,7 +66,8 @@ static int comp_keys(struct btrfs_key *k
 	return 0;
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, struct btrfs_key *key,
+static struct rb_node *tree_insert(struct rb_root *root, u64 gen,
+				   struct btrfs_key *key,
 				   struct rb_node *node)
 {
 	struct rb_node ** p = &root->rb_node;
@@ -74,7 +80,7 @@ static struct rb_node *tree_insert(struc
 		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
 		WARN_ON(!entry->in_tree);
 
-		ret = comp_keys(key, &entry->key);
+		ret = comp_keys(gen, entry->generation, key, &entry->key);
 		if (ret < 0)
 			p = &(*p)->rb_left;
 		else if (ret > 0)
@@ -90,7 +96,8 @@ static struct rb_node *tree_insert(struc
 	return NULL;
 }
 
-static struct rb_node *tree_search(struct rb_root *root, struct btrfs_key *key)
+static struct rb_node *tree_search(struct rb_root *root, u64 gen,
+				   struct btrfs_key *key)
 {
 	struct rb_node * n = root->rb_node;
 	struct btrfs_leaf_ref *entry;
@@ -100,7 +107,7 @@ static struct rb_node *tree_search(struc
 		entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
 		WARN_ON(!entry->in_tree);
 
-		ret = comp_keys(key, &entry->key);
+		ret = comp_keys(gen, entry->generation, key, &entry->key);
 		if (ret < 0)
 			n = n->rb_left;
 		else if (ret > 0)
@@ -140,6 +147,7 @@ int btrfs_remove_leaf_refs(struct btrfs_
 }
 
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+					     u64 generation,
 					     struct btrfs_key *key)
 {
 	struct rb_node *rb;
@@ -150,10 +158,11 @@ struct btrfs_leaf_ref *btrfs_lookup_leaf
 		return NULL;
 
 	spin_lock(&tree->lock);
-	if (tree->last && comp_keys(key, &tree->last->key) == 0) {
+	if (tree->last && comp_keys(generation, tree->last->generation,
+				    key, &tree->last->key) == 0) {
 		ref = tree->last;
 	} else {
-		rb = tree_search(&tree->root, key);
+		rb = tree_search(&tree->root, generation, key);
 		if (rb) {
 			ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
 			tree->last = ref;
@@ -171,17 +180,15 @@ int btrfs_add_leaf_ref(struct btrfs_root
 	struct rb_node *rb;
 	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-	struct btrfs_transaction *trans = root->fs_info->running_transaction;
 
 	spin_lock(&tree->lock);
-	rb = tree_insert(&tree->root, &ref->key, &ref->rb_node);
+	rb = tree_insert(&tree->root, ref->generation, &ref->key,
+			 &ref->rb_node);
 	if (rb) {
 		ret = -EEXIST;
 	} else {
 		spin_lock(&root->fs_info->ref_cache_lock);
 		root->fs_info->total_ref_cache_size += size;
-		if (trans && tree->generation == trans->transid)
-			root->fs_info->running_ref_cache_size += size;
 		spin_unlock(&root->fs_info->ref_cache_lock);
 
 		tree->last = ref;
@@ -195,15 +202,12 @@ int btrfs_remove_leaf_ref(struct btrfs_r
 {
 	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-	struct btrfs_transaction *trans = root->fs_info->running_transaction;
 
 	BUG_ON(!ref->in_tree);
 	spin_lock(&tree->lock);
 	
 	spin_lock(&root->fs_info->ref_cache_lock);
 	root->fs_info->total_ref_cache_size -= size;
-	if (trans && tree->generation == trans->transid)
-		root->fs_info->running_ref_cache_size -= size;
 	spin_unlock(&root->fs_info->ref_cache_lock);
 
 	if (tree->last == ref) {
diff -r cf052b443059 ref-cache.h
--- a/ref-cache.h	Sun Jul 27 06:39:00 2008 -0400
+++ b/ref-cache.h	Mon Jul 28 11:03:42 2008 -0400
@@ -15,6 +15,8 @@
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
+#ifndef __REFCACHE__
+#define __REFCACHE__
 
 struct btrfs_extent_info {
 	u64 bytenr;
@@ -36,13 +38,6 @@ struct btrfs_leaf_ref {
 	struct btrfs_extent_info extents[];
 };
 
-struct btrfs_leaf_ref_tree {
-	struct rb_root root;
-	struct btrfs_leaf_ref *last;
-	u64 generation;
-	spinlock_t lock;
-};
-
 static inline size_t btrfs_leaf_ref_size(int nr_extents)
 {
 	return sizeof(struct btrfs_leaf_ref) + 
@@ -53,7 +48,6 @@ static inline void btrfs_leaf_ref_tree_i
 {
 	tree->root.rb_node = NULL;
 	tree->last = NULL;
-	tree->generation = 0;
 	spin_lock_init(&tree->lock);
 }
 
@@ -66,7 +60,9 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_
 struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
 void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
-					     struct btrfs_key *key);
+					     u64 gen, struct btrfs_key *key);
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 int btrfs_remove_leaf_refs(struct btrfs_root *root);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+
+#endif
diff -r cf052b443059 transaction.c
--- a/transaction.c	Sun Jul 27 06:39:00 2008 -0400
+++ b/transaction.c	Mon Jul 28 11:03:42 2008 -0400
@@ -36,7 +36,6 @@ struct dirty_root {
 	struct list_head list;
 	struct btrfs_root *root;
 	struct btrfs_root *latest_root;
-	struct btrfs_leaf_ref_tree ref_tree;
 };
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
@@ -108,13 +107,13 @@ static noinline int record_root_in_trans
 
 			dirty->latest_root = root;
 			INIT_LIST_HEAD(&dirty->list);
-			btrfs_leaf_ref_tree_init(&dirty->ref_tree);
-			dirty->ref_tree.generation = running_trans_id;
 
 			root->commit_root = btrfs_root_node(root);
-			root->ref_tree = &dirty->ref_tree;
+			root->dirty_root = dirty;
 
 			memcpy(dirty->root, root, sizeof(*root));
+			dirty->root->ref_tree = &root->ref_tree_struct;
+
 			spin_lock_init(&dirty->root->node_lock);
 			mutex_init(&dirty->root->objectid_mutex);
 			dirty->root->node = root->commit_root;
@@ -333,6 +332,8 @@ int btrfs_commit_tree_roots(struct btrfs
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
 		update_cowonly_root(trans, root);
+		if (root->fs_info->closing)
+			btrfs_remove_leaf_refs(root);
 	}
 	return 0;
 }
@@ -346,10 +347,8 @@ int btrfs_add_dead_root(struct btrfs_roo
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
 		return -ENOMEM;
-	btrfs_leaf_ref_tree_init(&dirty->ref_tree);
 	dirty->root = root;
 	dirty->latest_root = latest;
-	root->ref_tree = NULL;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -379,18 +378,14 @@ static noinline int add_dirty_roots(stru
 				     BTRFS_ROOT_TRANS_TAG);
 
 			BUG_ON(!root->ref_tree);
-			dirty = container_of(root->ref_tree, struct dirty_root,
-					     ref_tree);
+			dirty = root->dirty_root;
 
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
 					btrfs_root_bytenr(&root->root_item));
 
-				BUG_ON(!btrfs_leaf_ref_tree_empty(
-							root->ref_tree));
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
-				root->ref_tree = NULL;
 				
 				kfree(dirty->root);
 				kfree(dirty);
@@ -410,7 +405,6 @@ static noinline int add_dirty_roots(stru
 			       sizeof(struct btrfs_disk_key));
 			root->root_item.drop_level = 0;
 			root->commit_root = NULL;
-			root->ref_tree = NULL;
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_bytenr(&root->root_item,
 					      root->node->start);
@@ -538,8 +532,6 @@ static noinline int drop_dirty_roots(str
 		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
-
-		btrfs_remove_leaf_refs(dirty->root);
 
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
@@ -725,10 +717,6 @@ int btrfs_commit_transaction(struct btrf
 			      &dirty_fs_roots);
 	BUG_ON(ret);
 
-	spin_lock(&root->fs_info->ref_cache_lock);
-	root->fs_info->running_ref_cache_size = 0;
-	spin_unlock(&root->fs_info->ref_cache_lock);
-
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 

[-- Attachment #4: ref-cache.patch --]
[-- Type: text/x-patch, Size: 14622 bytes --]

diff -r eb4767aa190e Makefile
--- a/Makefile	Thu Jul 24 12:25:50 2008 -0400
+++ b/Makefile	Sat Jul 26 01:07:24 2008 +0800
@@ -6,7 +6,8 @@ btrfs-y := super.o ctree.o extent-tree.o
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+	   ref-cache.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff -r eb4767aa190e ctree.c
--- a/ctree.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/ctree.c	Sat Jul 26 00:46:09 2008 +0800
@@ -165,7 +165,7 @@ int btrfs_copy_root(struct btrfs_trans_h
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf);
+	ret = btrfs_inc_ref(trans, new_root, buf, 0);
 	kfree(new_root);
 
 	if (ret)
@@ -232,7 +232,7 @@ int __btrfs_cow_block(struct btrfs_trans
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		different_trans = 1;
-		ret = btrfs_inc_ref(trans, root, buf);
+		ret = btrfs_inc_ref(trans, root, buf, 1);
 		if (ret)
 			return ret;
 	} else {
diff -r eb4767aa190e ctree.h
--- a/ctree.h	Thu Jul 24 12:25:50 2008 -0400
+++ b/ctree.h	Sat Jul 26 00:46:09 2008 +0800
@@ -592,6 +592,10 @@ struct btrfs_fs_info {
 	u64 last_alloc;
 	u64 last_data_alloc;
 
+	spinlock_t ref_cache_lock;
+	u64 total_ref_cache_size;
+	u64 running_ref_cache_size;
+
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
 	u64 avail_system_alloc_bits;
@@ -613,6 +617,8 @@ struct btrfs_root {
 	spinlock_t node_lock;
 
 	struct extent_buffer *commit_root;
+	struct btrfs_leaf_ref_tree *ref_tree;
+
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
@@ -1430,7 +1436,7 @@ int btrfs_reserve_extent(struct btrfs_tr
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf);
+		  struct extent_buffer *buf, int cache_ref);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 bytenr, u64 num_bytes,
 		      u64 root_objectid, u64 ref_generation,
diff -r eb4767aa190e disk-io.c
--- a/disk-io.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/disk-io.c	Sat Jul 26 00:46:09 2008 +0800
@@ -716,6 +716,7 @@ static int __setup_root(u32 nodesize, u3
 	root->node = NULL;
 	root->inode = NULL;
 	root->commit_root = NULL;
+	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -1165,12 +1166,19 @@ static int transaction_kthread(void *arg
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
+		printk("btrfs: total reference cache size %Lu\n",
+			root->fs_info->total_ref_cache_size);
+
 		mutex_lock(&root->fs_info->trans_mutex);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
 			mutex_unlock(&root->fs_info->trans_mutex);
 			goto sleep;
 		}
+
+		printk("btrfs: running reference cache size %Lu\n",
+			root->fs_info->running_ref_cache_size);
+
 		now = get_seconds();
 		if (now < cur->start_time || now - cur->start_time < 30) {
 			mutex_unlock(&root->fs_info->trans_mutex);
@@ -1233,6 +1241,7 @@ struct btrfs_root *open_ctree(struct sup
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
+	spin_lock_init(&fs_info->ref_cache_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1699,6 +1708,11 @@ int close_ctree(struct btrfs_root *root)
 		printk("btrfs: at unmount delalloc count %Lu\n",
 		       fs_info->delalloc_bytes);
 	}
+	if (fs_info->total_ref_cache_size) {
+		printk("btrfs: at umount reference cache size %Lu\n",
+			fs_info->total_ref_cache_size);
+	}
+	
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
 
diff -r eb4767aa190e extent-tree.c
--- a/extent-tree.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/extent-tree.c	Sat Jul 26 02:01:27 2008 +0800
@@ -26,6 +26,7 @@
 #include "transaction.h"
 #include "volumes.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
@@ -927,7 +928,7 @@ out:
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf)
+		  struct extent_buffer *buf, int cache_ref)
 {
 	u64 bytenr;
 	u32 nritems;
@@ -937,6 +938,7 @@ int btrfs_inc_ref(struct btrfs_trans_han
 	int level;
 	int ret;
 	int faili;
+	int nr_file_extents = 0;
 
 	if (!root->ref_cows)
 		return 0;
@@ -959,6 +961,9 @@ int btrfs_inc_ref(struct btrfs_trans_han
 			if (disk_bytenr == 0)
 				continue;
 
+			if (buf != root->commit_root)
+				nr_file_extents++;
+
 			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
 				    btrfs_file_extent_disk_num_bytes(buf, fi),
@@ -988,6 +993,53 @@ int btrfs_inc_ref(struct btrfs_trans_han
 			}
 		}
 	}
+	/* cache orignal leaf block's references */
+	if (cache_ref && nr_file_extents > 0) {
+		struct btrfs_leaf_ref *ref;
+		struct btrfs_extent_info *info;
+
+		ref = btrfs_alloc_leaf_ref(nr_file_extents);
+		if (!ref) {
+			WARN_ON(1);
+			goto out;
+		}
+
+		btrfs_item_key_to_cpu(buf, &ref->key, 0);
+
+		ref->bytenr = buf->start;
+		ref->owner = btrfs_header_owner(buf);
+		ref->generation = btrfs_header_generation(buf);
+		ref->nritems = nr_file_extents;
+		info = ref->extents;
+		
+		for (i = 0; i < nritems; i++) {
+			u64 disk_bytenr;
+			btrfs_item_key_to_cpu(buf, &key, i);
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf, i,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(buf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
+			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (disk_bytenr == 0)
+				continue;
+
+			info->bytenr = disk_bytenr;
+			info->num_bytes =
+				btrfs_file_extent_disk_num_bytes(buf, fi);
+			info->objectid = key.objectid;
+			info->offset = key.offset;
+			info++;
+		}
+
+		BUG_ON(!root->ref_tree);
+		ret = btrfs_add_leaf_ref(root, ref);
+		WARN_ON(ret);
+		btrfs_free_leaf_ref(ref);
+	}
+out:
 	return 0;
 fail:
 	WARN_ON(1);
@@ -2215,9 +2267,9 @@ struct extent_buffer *btrfs_alloc_free_b
 	return buf;
 }
 
-static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  struct extent_buffer *leaf)
+static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
+				  	   struct btrfs_root *root,
+					   struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
@@ -2266,6 +2318,30 @@ static int noinline drop_leaf_ref(struct
 	return 0;
 }
 
+static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
+				  	 struct btrfs_root *root,
+					 struct btrfs_leaf_ref *ref)
+{
+	int i;
+	int ret;
+	struct btrfs_extent_info *info = ref->extents;
+
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	for (i = 0; i < ref->nritems; i++) {
+		mutex_lock(&root->fs_info->alloc_mutex);
+		ret = __btrfs_free_extent(trans, root,
+					info->bytenr, info->num_bytes,
+					ref->owner, ref->generation,
+					info->objectid, info->offset, 0);
+		mutex_unlock(&root->fs_info->alloc_mutex);
+		BUG_ON(ret);
+		info++;
+	}
+	mutex_lock(&root->fs_info->alloc_mutex);
+
+	return 0;
+}
+
 static void noinline reada_walk_down(struct btrfs_root *root,
 				     struct extent_buffer *node,
 				     int slot)
@@ -2341,6 +2417,7 @@ static int noinline walk_down_tree(struc
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
+	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -2370,7 +2447,7 @@ static int noinline walk_down_tree(struc
 		    btrfs_header_nritems(cur))
 			break;
 		if (*level == 0) {
-			ret = drop_leaf_ref(trans, root, cur);
+			ret = drop_leaf_ref_no_cache(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
@@ -2391,6 +2468,21 @@ static int noinline walk_down_tree(struc
 			BUG_ON(ret);
 			continue;
 		}
+		
+		if (*level == 1) {
+			struct btrfs_key key;
+			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
+			ref = btrfs_lookup_leaf_ref(root, &key);
+			if (ref) {
+				ret = drop_leaf_ref(trans, root, ref);
+				BUG_ON(ret);
+				btrfs_remove_leaf_ref(root, ref);
+				btrfs_free_leaf_ref(ref);
+				*level = 0;
+				break;
+			}
+		}
+
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
@@ -2435,17 +2527,19 @@ out:
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
 	if (path->nodes[*level] == root->node) {
-		root_owner = root->root_key.objectid;
 		parent = path->nodes[*level];
+		bytenr = path->nodes[*level]->start;
 	} else {
 		parent = path->nodes[*level + 1];
-		root_owner = btrfs_header_owner(parent);
-	}
-
+		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
+	}
+
+	blocksize = btrfs_level_size(root, *level);
+	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
-	ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start,
-				path->nodes[*level]->len,
-				root_owner, root_gen, 0, 0, 1);
+
+	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+				  root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
diff -r eb4767aa190e transaction.c
--- a/transaction.c	Thu Jul 24 12:25:50 2008 -0400
+++ b/transaction.c	Sat Jul 26 00:46:10 2008 +0800
@@ -24,12 +24,20 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
 #define BTRFS_ROOT_TRANS_TAG 0
+
+struct dirty_root {
+	struct list_head list;
+	struct btrfs_root *root;
+	struct btrfs_root *latest_root;
+	struct btrfs_leaf_ref_tree ref_tree;
+};
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -84,6 +92,7 @@ static noinline int join_transaction(str
 
 static noinline int record_root_in_trans(struct btrfs_root *root)
 {
+	struct dirty_root *dirty;
 	u64 running_trans_id = root->fs_info->running_transaction->transid;
 	if (root->ref_cows && root->last_trans < running_trans_id) {
 		WARN_ON(root == root->fs_info->extent_root);
@@ -91,7 +100,25 @@ static noinline int record_root_in_trans
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 				   (unsigned long)root->root_key.objectid,
 				   BTRFS_ROOT_TRANS_TAG);
+
+			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
+			BUG_ON(!dirty);
+			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
+			BUG_ON(!dirty->root);
+
+			dirty->latest_root = root;
+			INIT_LIST_HEAD(&dirty->list);
+			btrfs_leaf_ref_tree_init(&dirty->ref_tree);
+			dirty->ref_tree.generation = running_trans_id;
+
 			root->commit_root = btrfs_root_node(root);
+			root->ref_tree = &dirty->ref_tree;
+
+			memcpy(dirty->root, root, sizeof(*root));
+			spin_lock_init(&dirty->root->node_lock);
+			mutex_init(&dirty->root->objectid_mutex);
+			dirty->root->node = root->commit_root;
+			dirty->root->commit_root = NULL;
 		} else {
 			WARN_ON(1);
 		}
@@ -310,12 +337,6 @@ int btrfs_commit_tree_roots(struct btrfs
 	return 0;
 }
 
-struct dirty_root {
-	struct list_head list;
-	struct btrfs_root *root;
-	struct btrfs_root *latest_root;
-};
-
 int btrfs_add_dead_root(struct btrfs_root *root,
 			struct btrfs_root *latest,
 			struct list_head *dead_list)
@@ -325,8 +346,10 @@ int btrfs_add_dead_root(struct btrfs_roo
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
 		return -ENOMEM;
+	btrfs_leaf_ref_tree_init(&dirty->ref_tree);
 	dirty->root = root;
 	dirty->latest_root = latest;
+	root->ref_tree = NULL;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -354,11 +377,23 @@ static noinline int add_dirty_roots(stru
 			radix_tree_tag_clear(radix,
 				     (unsigned long)root->root_key.objectid,
 				     BTRFS_ROOT_TRANS_TAG);
+
+			BUG_ON(!root->ref_tree);
+			dirty = container_of(root->ref_tree, struct dirty_root,
+					     ref_tree);
+
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
 					btrfs_root_bytenr(&root->root_item));
+
+				BUG_ON(!btrfs_leaf_ref_tree_empty(
+							root->ref_tree));
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
+				root->ref_tree = NULL;
+				
+				kfree(dirty->root);
+				kfree(dirty);
 
 				/* make sure to update the root on disk
 				 * so we get any updates to the block used
@@ -370,23 +405,12 @@ static noinline int add_dirty_roots(stru
 						&root->root_item);
 				continue;
 			}
-			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
-			BUG_ON(!dirty);
-			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
-			BUG_ON(!dirty->root);
 
 			memset(&root->root_item.drop_progress, 0,
 			       sizeof(struct btrfs_disk_key));
 			root->root_item.drop_level = 0;
-
-			memcpy(dirty->root, root, sizeof(*root));
-			dirty->root->node = root->commit_root;
-			dirty->latest_root = root;
-			spin_lock_init(&dirty->root->node_lock);
-			mutex_init(&dirty->root->objectid_mutex);
-
 			root->commit_root = NULL;
-
+			root->ref_tree = NULL;
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_bytenr(&root->root_item,
 					      root->node->start);
@@ -409,6 +433,7 @@ static noinline int add_dirty_roots(stru
 				list_add(&dirty->list, list);
 			} else {
 				WARN_ON(1);
+				free_extent_buffer(dirty->root->node);
 				kfree(dirty->root);
 				kfree(dirty);
 			}
@@ -514,6 +539,9 @@ static noinline int drop_dirty_roots(str
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
+		if (dirty->root->ref_tree)
+			WARN_ON(!btrfs_leaf_ref_tree_empty(dirty->root->ref_tree));
+	
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
@@ -697,6 +725,10 @@ int btrfs_commit_transaction(struct btrf
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
+
+	spin_lock(&root->fs_info->ref_cache_lock);
+	root->fs_info->running_ref_cache_size = 0;
+	spin_unlock(&root->fs_info->ref_cache_lock);
 
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);

[-- Attachment #5: ref-cache1.patch --]
[-- Type: text/x-patch, Size: 7370 bytes --]

diff -r eb4767aa190e ref-cache.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ref-cache.c	Fri Jul 25 21:56:56 2008 +0800
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "ref-cache.h"
+#include "transaction.h"
+
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
+{
+	struct btrfs_leaf_ref *ref;
+
+	ref = kmalloc(btrfs_leaf_ref_size(nr_extents), GFP_NOFS);
+	if (ref) {
+		memset(ref, 0, sizeof(*ref));
+		atomic_set(&ref->usage, 1);
+	}
+	return ref;
+}
+
+void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref)
+{
+	if (!ref)
+		return;
+	WARN_ON(atomic_read(&ref->usage) == 0);
+	if (atomic_dec_and_test(&ref->usage)) {
+		BUG_ON(ref->in_tree);
+		kfree(ref);
+	}
+}
+
+static int comp_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{
+	if (k1->objectid > k2->objectid)
+		return 1;
+	if (k1->objectid < k2->objectid)
+		return -1;
+	if (k1->type > k2->type)
+		return 1;
+	if (k1->type < k2->type)
+		return -1;
+	if (k1->offset > k2->offset)
+		return 1;
+	if (k1->offset < k2->offset)
+		return -1;
+	return 0;
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, struct btrfs_key *key,
+				   struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct btrfs_leaf_ref *entry;
+	int ret;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		ret = comp_keys(key, &entry->key);
+		if (ret < 0)
+			p = &(*p)->rb_left;
+		else if (ret > 0)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+	
+	entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
+	entry->in_tree = 1;
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, struct btrfs_key *key)
+{
+	struct rb_node * n = root->rb_node;
+	struct btrfs_leaf_ref *entry;
+	int ret;
+
+	while(n) {
+		entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		ret = comp_keys(key, &entry->key);
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	return NULL;
+}
+
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+					     struct btrfs_key *key)
+{
+	struct rb_node *rb;
+	struct btrfs_leaf_ref *ref = NULL;
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+	if (!tree)
+		return NULL;
+
+	spin_lock(&tree->lock);
+	if (tree->last && comp_keys(key, &tree->last->key) == 0) {
+		ref = tree->last;
+	} else {
+		rb = tree_search(&tree->root, key);
+		if (rb) {
+			ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+			tree->last = ref;
+		}
+	}
+	if (ref)
+		atomic_inc(&ref->usage);
+	spin_unlock(&tree->lock);
+	return ref;
+}
+
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+	int ret = 0;
+	struct rb_node *rb;
+	size_t size = btrfs_leaf_ref_size(ref->nritems);
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+	struct btrfs_transaction *trans = root->fs_info->running_transaction;
+
+	spin_lock(&tree->lock);
+	rb = tree_insert(&tree->root, &ref->key, &ref->rb_node);
+	if (rb) {
+		ret = -EEXIST;
+	} else {
+		spin_lock(&root->fs_info->ref_cache_lock);
+		root->fs_info->total_ref_cache_size += size;
+		if (trans && tree->generation == trans->transid)
+			root->fs_info->running_ref_cache_size += size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
+
+		tree->last = ref;
+		atomic_inc(&ref->usage);
+	}
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{
+	size_t size = btrfs_leaf_ref_size(ref->nritems);
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+	struct btrfs_transaction *trans = root->fs_info->running_transaction;
+
+	BUG_ON(!ref->in_tree);
+	spin_lock(&tree->lock);
+	rb_erase(&ref->rb_node, &tree->root);
+	ref->in_tree = 0;
+	
+	spin_lock(&root->fs_info->ref_cache_lock);
+	root->fs_info->total_ref_cache_size -= size;
+	if (trans && tree->generation == trans->transid)
+		root->fs_info->running_ref_cache_size -= size;
+	spin_unlock(&root->fs_info->ref_cache_lock);
+
+	if (tree->last == ref) {
+		struct rb_node *next = rb_next(&ref->rb_node);
+		if (next) {
+			tree->last = rb_entry(next, struct btrfs_leaf_ref,
+					      rb_node);
+		} else
+			tree->last = NULL;
+	}
+	spin_unlock(&tree->lock);
+
+	btrfs_free_leaf_ref(ref);
+	return 0;
+}
+
diff -r eb4767aa190e ref-cache.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ref-cache.h	Fri Jul 25 21:58:24 2008 +0800
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+struct btrfs_extent_info {
+	u64 bytenr;
+	u64 num_bytes;
+	u64 objectid;
+	u64 offset;
+};
+
+struct btrfs_leaf_ref {
+	struct rb_node rb_node;
+	struct btrfs_key key;
+	int in_tree;
+	atomic_t usage;
+
+	u64 bytenr;
+	u64 owner;
+	u64 generation;
+	int nritems;
+	struct btrfs_extent_info extents[];
+};
+
+struct btrfs_leaf_ref_tree {
+	struct rb_root root;
+	struct btrfs_leaf_ref *last;
+	u64 generation;
+	spinlock_t lock;
+};
+
+static inline size_t btrfs_leaf_ref_size(int nr_extents)
+{
+	return sizeof(struct btrfs_leaf_ref) + 
+	       sizeof(struct btrfs_extent_info) * nr_extents;
+}
+
+static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
+{
+	tree->root.rb_node = NULL;
+	tree->last = NULL;
+	tree->generation = 0;
+	spin_lock_init(&tree->lock);
+}
+
+static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
+{
+	return RB_EMPTY_ROOT(&tree->root);
+}
+
+void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+					     struct btrfs_key *key);
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] initial version of reference cache
       [not found]   ` <f058a9c30807280952m2386aad4pa8a08ffaf930c370@mail.gmail.com>
@ 2008-07-28 16:53     ` Miguel Sousa Filipe
  2008-07-29  0:25       ` Chris Mason
  0 siblings, 1 reply; 9+ messages in thread
From: Miguel Sousa Filipe @ 2008-07-28 16:53 UTC (permalink / raw)
  To: linux-btrfs

Hi all,

On Mon, Jul 28, 2008 at 4:09 PM, Chris Mason <chris.mason@oracle.com> wrote:
> Yan and I are hammering this out a little, I've attached my current
> patches.
>
> I was seeing cache misses after long stress runs, which I think is
> coming from references on the higher levels of the tree making us skip
> some leaves while dropping the transaction that added them.
>
> My new version uses a single cache per root, and should avoid these
> misses.

Just curious, what is the cache size and/or eviction policy?
Is it LFU or LRU, FIFO, or something custom? Would ARC be a good policy
for this purpose?

Sorry if this is totally off-topic or clueless, just curious... hell,
I'm not even sure what data/info/object the cache is for...

Kind regards,


--
Miguel Sousa Filipe

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] initial version of reference cache
  2008-07-28 16:53     ` Miguel Sousa Filipe
@ 2008-07-29  0:25       ` Chris Mason
  0 siblings, 0 replies; 9+ messages in thread
From: Chris Mason @ 2008-07-29  0:25 UTC (permalink / raw)
  To: Miguel Sousa Filipe; +Cc: linux-btrfs

On Mon, 2008-07-28 at 17:53 +0100, Miguel Sousa Filipe wrote:
> Hi all,
> 
> On Mon, Jul 28, 2008 at 4:09 PM, Chris Mason <chris.mason@oracle.com> wrote:
> > Yan and I are hammering this out a little, I've attached my current
> > patches.
> >
> > I was seeing cache misses after long stress runs, which I think is
> > coming from references on the higher levels of the tree making us skip
> > some leaves while dropping the transaction that added them.
> >
> > My new version uses a single cache per root, and should avoid these
> > misses.
> 
> Just curious, what is the cache size and/or eviction policy?
> Is it LFU or LRU, FIFO, or something custom? Would ARC be a good policy
> for this purpose?
> 

Since this is a cache of the extent pointers in a COW block, things are
only written once, and these items are only read once most of the time.
So, we don't need anything fancy, just a direct-access list that we can
use to pull off really old things.

In this case, it is just a list head.

-chris



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: still see locking issues...
       [not found]   ` <4891C236.3000604@redhat.com>
@ 2008-07-31 14:11     ` Chris Mason
  2008-07-31 14:37     ` Chris Mason
  1 sibling, 0 replies; 9+ messages in thread
From: Chris Mason @ 2008-07-31 14:11 UTC (permalink / raw)
  To: rwheeler; +Cc: linux-btrfs

On Thu, 2008-07-31 at 09:46 -0400, Ric Wheeler wrote:
> Not sure if this is just an 8-way, multithreaded test, but my run of the 
> latest unstable tree ground to a halt with lock messages (see the 
> attached, compressed log for details ;-))  On the good news side, it did 
> run for around 8 hours before the first message was logged.

I just merged Yan's missing hunks that might help explain this.  At
least for fs_mark, you shouldn't really be hitting the throttling code
very often.

But, until I fully thread the allocator I can't completely get rid of
the stalls.  That's one of the first things on my list for after v0.16

One of your procs is stuck in generic_unplug_device, and that one didn't
come from me ;)  Sounds like the IO subsystem might be taking short
breaks as well.

-chris



^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: still see locking issues...
       [not found]   ` <4891C236.3000604@redhat.com>
  2008-07-31 14:11     ` still see locking issues Chris Mason
@ 2008-07-31 14:37     ` Chris Mason
  1 sibling, 0 replies; 9+ messages in thread
From: Chris Mason @ 2008-07-31 14:37 UTC (permalink / raw)
  To: rwheeler; +Cc: linux-btrfs

On Thu, 2008-07-31 at 09:46 -0400, Ric Wheeler wrote:
> Not sure if this is just an 8-way, multithreaded test, but my run of the 
> latest unstable tree ground to a halt with lock messages (see the 
> attached, compressed log for details ;-))  On the good news side, it did 
> run for around 8 hours before the first message was logged.

It also looks like writeback is getting stuck waiting for locked pages
that are waiting for the transaction lock.  I've changed around the
start_transaction code a bit to help with this, and I'll test a bit and
send out.

-chris



^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2008-07-31 14:37 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-25 19:29 [PATCH] initial version of reference cache Yan Zheng
2008-07-25 23:38 ` Chris Mason
2008-07-26 14:26   ` Yan Zheng
2008-07-28 15:09 ` Chris Mason
     [not found]   ` <f058a9c30807280952m2386aad4pa8a08ffaf930c370@mail.gmail.com>
2008-07-28 16:53     ` Miguel Sousa Filipe
2008-07-29  0:25       ` Chris Mason
     [not found]   ` <4891C236.3000604@redhat.com>
2008-07-31 14:11     ` still see locking issues Chris Mason
2008-07-31 14:37     ` Chris Mason
  -- strict thread matches above, loose matches on Subject: below --
2008-07-25 20:57 [PATCH] initial version of reference cache Yan Zheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox