* [RFC PATCH 2/2] Btrfs: introduce free space cluster for each node
From: Miao Xie <miaox@cn.fujitsu.com>
Date: 2011-09-08 8:18 UTC
To: Linux Btrfs

This patch introduces a free space cluster for each node in the b+tree,
and also simplifies how a cluster is filled and how space is allocated
from it:
- Allocate a free space cluster for each node.
- Fill the cluster with a single free space extent (>=256KB, obtained by
  splitting a large extent or by gathering contiguous blocks from the
  bitmaps) taken from the block group cache, instead of moving free space
  entries from the block group cache into the cluster directly. This way
  the cluster manages just one extent.
- When allocating a tree block, get the space simply by splitting the
  extent in the parent's cluster (see the sketch just after this list).
  This way tree blocks can be allocated concurrently, and the child
  nodes/leaves are stored in contiguous blocks.
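
As an illustration only (none of this is in the patch; the *_sketch
names are made up, and locking against concurrent refill as well as all
error handling is omitted), the per-node cluster reduces tree block
allocation to splitting one cached extent:

struct cluster_sketch {
	spinlock_t lock;
	struct btrfs_free_space *info;	/* one contiguous free extent */
};

static u64 alloc_from_cluster_sketch(struct cluster_sketch *cluster,
				     u64 bytes)
{
	u64 start = 0;

	spin_lock(&cluster->lock);
	if (cluster->info && bytes <= cluster->info->bytes) {
		/* split the cached extent: carve 'bytes' off the front */
		start = cluster->info->offset;
		cluster->info->offset += bytes;
		cluster->info->bytes -= bytes;
	}
	spin_unlock(&cluster->lock);

	return start;	/* 0 means the cluster cannot satisfy the request */
}

Since every node owns its own cluster, allocations under different
parents never contend on the same cluster lock, and successive children
split adjacent pieces off the same extent, which is why they end up in
contiguous blocks on disk.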

Besides that, we modify the space cache code to fit the above change.
When we write out the free space information, we also write out the
extent information held in the clusters. And when we load the free space
information, we first try to merge the free space fragments, because the
extents left in the clusters are small and may merge into a large one.
(Before this patch, we built the free space tree from the cached
information directly. I don't think we need to worry about
compatibility: at worst we may create lots of small extents, which can
be merged into a large one, and this does not break the use of Btrfs.)
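
A condensed sketch of that merge-first load path (illustrative only:
load_extent_entry_sketch is a made-up name, and the real logic, which
also handles page cleanup on failure, lives in __load_free_space_cache
in the patch below):

/* Sketch: merge first, fall back to a bitmap, else keep the extent. */
static int load_extent_entry_sketch(struct btrfs_free_space_ctl *ctl,
				    struct btrfs_free_space *e)
{
	int ret;

	spin_lock(&ctl->tree_lock);
	if (try_merge_free_space(ctl, e, true))
		goto link;	/* coalesced with its neighbours */

	ret = insert_into_bitmap(ctl, e, true);
	if (ret != 0) {
		/* < 0: error; > 0: fully absorbed by an existing bitmap */
		spin_unlock(&ctl->tree_lock);
		return ret < 0 ? ret : 0;
	}
link:
	ret = link_free_space(ctl, e);	/* keep it as a plain extent */
	spin_unlock(&ctl->tree_lock);
	return ret;
}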

We ran some performance tests for this patch; it works well and improves
small-file performance.

Sequential write performance for small files:
N_files		N_threads	File Size	Before		After
10240		8		2KB(Inline)	2.2055MB/s	4.1779MB/s
10240		8		4KB		1.001MB/s	1.1893MB/s

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/ctree.c            |   28 ++-
 fs/btrfs/ctree.h            |   50 +++-
 fs/btrfs/disk-io.c          |    2 +-
 fs/btrfs/extent-tree.c      |  107 +++++---
 fs/btrfs/extent_io.c        |    7 +-
 fs/btrfs/extent_io.h        |    3 +
 fs/btrfs/free-space-cache.c |  667 ++++++++++++++++---------------------------
 fs/btrfs/free-space-cache.h |   13 +
 fs/btrfs/inode-map.c        |    1 +
 fs/btrfs/inode.c            |   25 +-
 fs/btrfs/ioctl.c            |    2 +-
 fs/btrfs/tree-log.c         |    7 +
 12 files changed, 419 insertions(+), 493 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3..1d3edd8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -238,7 +238,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(buf, &disk_key, 0);
 
-	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
+	cow = btrfs_alloc_free_block(trans, root, buf->len, NULL, 0,
 				     new_root_objectid, &disk_key, level,
 				     buf->start, 0);
 	if (IS_ERR(cow))
@@ -444,9 +444,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	} else
 		parent_start = 0;
 
-	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
-				     root->root_key.objectid, &disk_key,
-				     level, search_start, empty_size);
+	cow = btrfs_alloc_free_block(trans, root, buf->len, parent,
+				     parent_start, root->root_key.objectid,
+				     &disk_key, level, search_start,
+				     empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -467,6 +468,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
+	WARN_ON(cow->cluster);
+	cow->cluster = buf->cluster;
+	buf->cluster = NULL;
+
 	update_ref_for_cow(trans, root, buf, cow, &last_ref);
 
 	if (root->ref_cows)
@@ -2070,7 +2075,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
+	c = btrfs_alloc_free_block(trans, root, root->nodesize, NULL, 0,
 				   root->root_key.objectid, &lower_key,
 				   level, root->node->start, 0);
 	if (IS_ERR(c))
@@ -2170,6 +2175,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 {
 	struct extent_buffer *c;
 	struct extent_buffer *split;
+	struct extent_buffer *parent = NULL;
 	struct btrfs_disk_key disk_key;
 	int mid;
 	int ret;
@@ -2197,9 +2203,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	mid = (c_nritems + 1) / 2;
 	btrfs_node_key(c, &disk_key, mid);
 
-	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
-					root->root_key.objectid,
-					&disk_key, level, c->start, 0);
+	if (level + 1 < BTRFS_MAX_LEVEL)
+		parent = path->nodes[level + 1];
+
+	split = btrfs_alloc_free_block(trans, root, root->nodesize, parent, 0,
+				       root->root_key.objectid,
+				       &disk_key, level, c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -2951,7 +2960,8 @@ again:
 	else
 		btrfs_item_key(l, &disk_key, mid);
 
-	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+	right = btrfs_alloc_free_block(trans, root, root->leafsize,
+					path->nodes[1], 0,
 					root->root_key.objectid,
 					&disk_key, 0, l->start, 0);
 	if (IS_ERR(right))
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c364d50..8c33a74 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -789,14 +789,9 @@ struct btrfs_block_rsv {
  */
 struct btrfs_free_cluster {
 	spinlock_t lock;
-	spinlock_t refill_lock;
-	struct rb_root root;
+	struct mutex refill_lock;
 
-	/* largest extent in this cluster */
-	u64 max_size;
-
-	/* first extent starting offset */
-	u64 window_start;
+	struct btrfs_free_space *info;
 
 	struct btrfs_block_group_cache *block_group;
 	/*
@@ -2156,7 +2151,9 @@ u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root, u32 blocksize,
-					u64 parent, u64 root_objectid,
+					struct extent_buffer *parent,
+					u64 parent_start,
+					u64 root_objectid,
 					struct btrfs_disk_key *key, int level,
 					u64 hint, u64 empty_size);
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
@@ -2175,12 +2172,37 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   u64 root_objectid, u64 owner, u64 offset,
 				   struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  u64 num_bytes, u64 min_alloc_size,
-				  u64 empty_size, u64 hint_byte,
-				  u64 search_end, struct btrfs_key *ins,
-				  u64 data);
+int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   u64 num_bytes, u64 min_alloc_size,
+			   u64 empty_size, u64 hint_byte,
+			   u64 search_end, struct btrfs_key *ins,
+			   u64 data, struct btrfs_free_cluster *cluster);
+static inline int btrfs_reserve_extent_data(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    u64 num_bytes, u64 min_alloc_size,
+					    u64 empty_size, u64 hint_byte,
+					    u64 search_end,
+					    struct btrfs_key *ins)
+{
+	return __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
+				      empty_size, hint_byte, search_end, ins,
+				      1, NULL);
+}
+
+static inline int btrfs_reserve_extent_mdata(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     u64 num_bytes, u64 min_alloc_size,
+					     u64 empty_size, u64 hint_byte,
+					     u64 search_end,
+					     struct btrfs_key *ins,
+					     struct btrfs_free_cluster *cluster)
+{
+	return __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
+				      empty_size, hint_byte, search_end, ins,
+				      0, cluster);
+}
+
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac6..951a57f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1171,7 +1171,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	 */
 	root->ref_cows = 0;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, NULL, 0,
 				      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 80d6148..43cc5c4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4672,6 +4672,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache = NULL;
 	int ret;
 
+	btrfs_free_extent_cluster(buf);
+
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
 						parent, root->root_key.objectid,
@@ -4853,8 +4855,10 @@ enum btrfs_loop_type {
 	LOOP_FIND_IDEAL = 0,
 	LOOP_CACHING_NOWAIT = 1,
 	LOOP_CACHING_WAIT = 2,
-	LOOP_ALLOC_CHUNK = 3,
-	LOOP_NO_EMPTY_SIZE = 4,
+	LOOP_RECLAIM_CLUSTERS = 3,
+	LOOP_RECLAIM_ALL_CLUSTERS = 4,
+	LOOP_ALLOC_CHUNK = 5,
+	LOOP_NO_EMPTY_SIZE = 6,
 };
 
 /*
@@ -4870,7 +4874,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 				     u64 num_bytes, u64 empty_size,
 				     u64 search_start, u64 search_end,
 				     u64 hint_byte, struct btrfs_key *ins,
-				     u64 data)
+				     u64 data,
+				     struct btrfs_free_cluster *cluster)
 {
 	int ret = 0;
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -4912,9 +4917,12 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 		allowed_chunk_alloc = 1;
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
-		last_ptr = &root->fs_info->meta_alloc_cluster;
+		if (cluster)
+			last_ptr = cluster;
+		else
+			last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
-			empty_cluster = 64 * 1024;
+			empty_cluster = 256 * 1024;
 	}
 
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
@@ -4925,7 +4933,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	if (last_ptr) {
 		spin_lock(&last_ptr->lock);
 		if (last_ptr->block_group)
-			hint_byte = last_ptr->window_start;
+			hint_byte = last_ptr->info->offset;
 		spin_unlock(&last_ptr->lock);
 	}
 
@@ -5043,6 +5051,13 @@ have_block_group:
 		if (unlikely(block_group->ro))
 			goto loop;
 
+
+		if (loop == LOOP_RECLAIM_CLUSTERS) {
+			btrfs_reclaim_extent_clusters(block_group,
+						      empty_cluster * 2);
+		} else if (loop == LOOP_RECLAIM_ALL_CLUSTERS)
+			btrfs_reclaim_extent_clusters(block_group, 0);
+
 		spin_lock(&block_group->free_space_ctl->tree_lock);
 		if (cached &&
 		    block_group->free_space_ctl->free_space <
@@ -5066,7 +5081,7 @@ have_block_group:
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
 			 */
-			spin_lock(&last_ptr->refill_lock);
+			mutex_lock(&last_ptr->refill_lock);
 			if (last_ptr->block_group &&
 			    (last_ptr->block_group->ro ||
 			    !block_group_bits(last_ptr->block_group, data))) {
@@ -5078,7 +5093,7 @@ have_block_group:
 						 num_bytes, search_start);
 			if (offset) {
 				/* we have a block, we're done */
-				spin_unlock(&last_ptr->refill_lock);
+				mutex_unlock(&last_ptr->refill_lock);
 				goto checks;
 			}
 
@@ -5097,7 +5112,7 @@ have_block_group:
 				block_group = last_ptr->block_group;
 				btrfs_get_block_group(block_group);
 				spin_unlock(&last_ptr->lock);
-				spin_unlock(&last_ptr->refill_lock);
+				mutex_unlock(&last_ptr->refill_lock);
 
 				last_ptr_loop = 1;
 				search_start = block_group->key.objectid;
@@ -5123,7 +5138,7 @@ refill_cluster:
 			/* allocate a cluster in this block group */
 			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
-					       offset, num_bytes,
+					       search_start, num_bytes,
 					       empty_cluster + empty_size);
 			if (ret == 0) {
 				/*
@@ -5135,12 +5150,12 @@ refill_cluster:
 						  search_start);
 				if (offset) {
 					/* we found one, proceed */
-					spin_unlock(&last_ptr->refill_lock);
+					mutex_unlock(&last_ptr->refill_lock);
 					goto checks;
 				}
 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
 				   && !failed_cluster_refill) {
-				spin_unlock(&last_ptr->refill_lock);
+				mutex_unlock(&last_ptr->refill_lock);
 
 				failed_cluster_refill = true;
 				wait_block_group_cache_progress(block_group,
@@ -5155,7 +5170,7 @@ refill_cluster:
 			 * to use, and go to the next block group
 			 */
 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
-			spin_unlock(&last_ptr->refill_lock);
+			mutex_unlock(&last_ptr->refill_lock);
 			goto loop;
 		}
 
@@ -5197,9 +5212,11 @@ checks:
 		ins->objectid = search_start;
 		ins->offset = num_bytes;
 
-		if (offset < search_start)
+		if (offset < search_start) {
 			btrfs_add_free_space(block_group, offset,
 					     search_start - offset);
+			offset = search_start;
+		}
 		BUG_ON(offset > search_start);
 
 		ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
@@ -5210,13 +5227,6 @@ checks:
 		}
 
 		/* we are all good, lets return */
-		ins->objectid = search_start;
-		ins->offset = num_bytes;
-
-		if (offset < search_start)
-			btrfs_add_free_space(block_group, offset,
-					     search_start - offset);
-		BUG_ON(offset > search_start);
 		btrfs_put_block_group(block_group);
 		break;
 loop:
@@ -5236,6 +5246,12 @@ loop:
 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
 	 *			caching kthreads as we move along
 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+	 * LOOP_RECLAIM_CLUSTERS, reclaim free space from some clusters, and
+	 *			  by this way, we may find enough continuous
+	 *			  space to fill the cluster, and then search
+	 *			  the free space again.
+	 * LOOP_RECLAIM_ALL_CLUSTERS, reclaim free space from all the clusters,
+	 *			      and then search again.
 	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
 	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
 	 *			again
@@ -5362,12 +5378,12 @@ again:
 	up_read(&info->groups_sem);
 }
 
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 u64 num_bytes, u64 min_alloc_size,
-			 u64 empty_size, u64 hint_byte,
-			 u64 search_end, struct btrfs_key *ins,
-			 u64 data)
+int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   u64 num_bytes, u64 min_alloc_size,
+			   u64 empty_size, u64 hint_byte,
+			   u64 search_end, struct btrfs_key *ins,
+			   u64 data, struct btrfs_free_cluster *cluster)
 {
 	int ret;
 	u64 search_start = 0;
@@ -5386,7 +5402,7 @@ again:
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
 			       search_start, search_end, hint_byte,
-			       ins, data);
+			       ins, data, cluster);
 
 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
 		num_bytes = num_bytes >> 1;
@@ -5741,13 +5757,15 @@ static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root, u32 blocksize,
-					u64 parent, u64 root_objectid,
+					struct extent_buffer *parent,
+					u64 parent_start, u64 root_objectid,
 					struct btrfs_disk_key *key, int level,
 					u64 hint, u64 empty_size)
 {
 	struct btrfs_key ins;
 	struct btrfs_block_rsv *block_rsv;
 	struct extent_buffer *buf;
+	struct btrfs_free_cluster *cluster = NULL;
 	u64 flags = 0;
 	int ret;
 
@@ -5756,8 +5774,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	if (IS_ERR(block_rsv))
 		return ERR_CAST(block_rsv);
 
-	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
-				   empty_size, hint, (u64)-1, &ins, 0);
+	/*
+	 * We needn't worry about allocation failure, because if failed,
+	 * we will use the default metadata cluster in fs_info
+	 */
+	if (parent && !parent->cluster)
+		parent->cluster = btrfs_alloc_free_cluster();
+
+	if (parent)
+		cluster = parent->cluster;
+
+	ret = btrfs_reserve_extent_mdata(trans, root, blocksize, blocksize,
+					 empty_size, hint, (u64)-1, &ins,
+					 cluster);
 	if (ret) {
 		unuse_block_rsv(block_rsv, blocksize);
 		return ERR_PTR(ret);
@@ -5768,11 +5797,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	BUG_ON(IS_ERR(buf));
 
 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		if (parent == 0)
-			parent = ins.objectid;
+		if (parent_start == 0)
+			parent_start = ins.objectid;
 		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
 	} else
-		BUG_ON(parent > 0);
+		BUG_ON(parent_start > 0);
 
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
 		struct btrfs_delayed_extent_op *extent_op;
@@ -5788,7 +5817,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		extent_op->is_data = 0;
 
 		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
-					ins.offset, parent, root_objectid,
+					ins.offset, parent_start, root_objectid,
 					level, BTRFS_ADD_DELAYED_EXTENT,
 					extent_op);
 		BUG_ON(ret);
@@ -7231,18 +7260,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	/* make sure this block group isn't part of an allocation cluster */
 	cluster = &root->fs_info->data_alloc_cluster;
-	spin_lock(&cluster->refill_lock);
+	mutex_lock(&cluster->refill_lock);
 	btrfs_return_cluster_to_free_space(block_group, cluster);
-	spin_unlock(&cluster->refill_lock);
+	mutex_unlock(&cluster->refill_lock);
 
 	/*
 	 * make sure this block group isn't part of a metadata
 	 * allocation cluster
 	 */
 	cluster = &root->fs_info->meta_alloc_cluster;
-	spin_lock(&cluster->refill_lock);
+	mutex_lock(&cluster->refill_lock);
 	btrfs_return_cluster_to_free_space(block_group, cluster);
-	spin_unlock(&cluster->refill_lock);
+	mutex_unlock(&cluster->refill_lock);
 
 	path = btrfs_alloc_path();
 	if (!path) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164..78196f2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "free-space-cache.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -2988,6 +2989,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	spin_unlock_irqrestore(&leak_lock, flags);
 #endif
 	atomic_set(&eb->refs, 1);
+	eb->cluster = NULL;
 
 	return eb;
 }
@@ -3827,7 +3829,10 @@ out:
 	spin_unlock(&tree->buffer_lock);
 
 	/* at this point we can safely release the extent buffer */
-	if (atomic_read(&eb->refs) == 0)
+	if (atomic_read(&eb->refs) == 0) {
+		btrfs_free_extent_cluster(eb);
 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+	}
 	return ret;
 }
+
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3..8ee8953 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -146,6 +146,9 @@ struct extent_buffer {
 	 * to unlock
 	 */
 	wait_queue_head_t read_lock_wq;
+
+	/* Used for the node to allocate space for its children */
+	struct btrfs_free_cluster *cluster;
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index e555899..e540e77 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -31,9 +31,15 @@
 #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
=20
 static struct kmem_cache *btrfs_free_space_cachep;
+static struct kmem_cache *btrfs_free_cluster_cachep;
 
 static int link_free_space(struct btrfs_free_space_ctl *ctl,
 			   struct btrfs_free_space *info);
+static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
+				 struct btrfs_free_space *info,
+				 bool update_stat);
+static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info, bool update_stat);
 
 int __init free_space_cache_init(void)
 {
@@ -43,6 +49,14 @@ int __init free_space_cache_init(void)
 	if (!btrfs_free_space_cachep)
 		return -ENOMEM;
 
+	btrfs_free_cluster_cachep = kmem_cache_create("extent_clusters",
+			sizeof(struct btrfs_free_cluster), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,	NULL);
+	if (!btrfs_free_cluster_cachep) {
+		kmem_cache_destroy(btrfs_free_space_cachep);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
@@ -50,6 +64,8 @@ void free_space_cache_exit(void)
 {
 	if (btrfs_free_space_cachep)
 		kmem_cache_destroy(btrfs_free_space_cachep);
+	if (btrfs_free_cluster_cachep)
+		kmem_cache_destroy(btrfs_free_cluster_cachep);
 }
 
 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
@@ -397,11 +413,32 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 
 			if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
 				spin_lock(&ctl->tree_lock);
+				if (try_merge_free_space(ctl, e, true))
+					goto link;
+
+				ret = insert_into_bitmap(ctl, e, true);
+				if (ret < 0) {
+					spin_unlock(&ctl->tree_lock);
+					printk(KERN_ERR "Inserting into bitmap "
+					       "failed, dumping\n");
+					kmem_cache_free(btrfs_free_space_cachep,
+							e);
+					kunmap(page);
+					unlock_page(page);
+					page_cache_release(page);
+					goto free_cache;
+				} else if (ret) {
+					ret = 0;
+					goto next_entry;
+				}
+link:
 				ret = link_free_space(ctl, e);
 				spin_unlock(&ctl->tree_lock);
 				if (ret) {
 					printk(KERN_ERR "Duplicate entries in "
 					       "free space cache, dumping\n");
+					kmem_cache_free(btrfs_free_space_cachep,
+							e);
 					kunmap(page);
 					unlock_page(page);
 					page_cache_release(page);
@@ -425,6 +462,9 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 				if (ret) {
 					printk(KERN_ERR "Duplicate entries in "
 					       "free space cache, dumping\n");
+					kfree(e->bitmap);
+					kmem_cache_free(
+						btrfs_free_space_cachep, e);
 					kunmap(page);
 					unlock_page(page);
 					page_cache_release(page);
@@ -432,7 +472,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 				}
 				list_add_tail(&e->list, &bitmaps);
 			}
-
+next_entry:
 			num_entries--;
 			offset += sizeof(struct btrfs_free_space_entry);
 			if (offset + sizeof(struct btrfs_free_space_entry) >=
@@ -676,7 +716,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		while (node && !next_page) {
 			struct btrfs_free_space *e;
 
-			e = rb_entry(node, struct btrfs_free_space, offset_index);
+			e = rb_entry(node, struct btrfs_free_space,
+				     offset_index);
 			entries++;
 
 			entry->offset = cpu_to_le64(e->offset);
@@ -689,10 +730,39 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 				entry->type = BTRFS_FREE_SPACE_EXTENT;
 			}
 			node = rb_next(node);
-			if (!node && cluster) {
-				node = rb_first(&cluster->root);
+			offset += sizeof(struct btrfs_free_space_entry);
+			if (offset + sizeof(struct btrfs_free_space_entry) >=
+			    PAGE_CACHE_SIZE)
+				next_page = true;
+			entry++;
+		}
+
+		/*
+		 * We want to write the extent in the cluster to our free space
+		 * cache.
+		 */
+		while (cluster && !next_page) {
+			struct btrfs_free_space *e;
+
+			e = cluster->info;
+			if (!e || !e->bytes)
+				break;
+
+			entries++;
+
+			entry->offset = cpu_to_le64(e->offset);
+			entry->bytes = cpu_to_le64(e->bytes);
+			entry->type = BTRFS_FREE_SPACE_EXTENT;
+
+			if (list_is_last(&cluster->block_group_list,
+					 &block_group->cluster_list))
 				cluster = NULL;
-			}
+			else
+				cluster = list_entry(
+						cluster->block_group_list.next,
+						struct btrfs_free_cluster,
+						block_group_list);
+
 			offset += sizeof(struct btrfs_free_space_entry);
 			if (offset + sizeof(struct btrfs_free_space_entry) >=
 			    PAGE_CACHE_SIZE)
@@ -1125,19 +1195,30 @@ static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
 	ctl->free_space -= info->bytes;
 }
 
+static int __link_free_space(struct btrfs_free_space_ctl *ctl,
+			     struct btrfs_free_space *info)
+{
+	int ret;
+
+	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
+				 &info->offset_index, (info->bitmap != NULL));
+	if (ret)
+		return ret;
+	ctl->free_extents++;
+	return 0;
+}
+
 static int link_free_space(struct btrfs_free_space_ctl *ctl,
 			   struct btrfs_free_space *info)
 {
-	int ret = 0;
+	int ret;
 
 	BUG_ON(!info->bitmap && !info->bytes);
-	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
-				 &info->offset_index, (info->bitmap != NULL));
+	ret = __link_free_space(ctl, info);
 	if (ret)
 		return ret;
 
 	ctl->free_space += info->bytes;
-	ctl->free_extents++;
 	return ret;
 }
 
@@ -1212,7 +1293,7 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 
 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
 			    struct btrfs_free_space *info, u64 offset,
-			    u64 bytes)
+			    u64 bytes, bool update_stat)
 {
 	unsigned long start, count;
 
@@ -1223,7 +1304,8 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
 	bitmap_set(info->bitmap, start, count);
 
 	info->bytes += bytes;
-	ctl->free_space += bytes;
+	if (update_stat)
+		ctl->free_space += bytes;
 }
 
 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
@@ -1394,7 +1476,7 @@ again:
 
 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 			       struct btrfs_free_space *info, u64 offset,
-			       u64 bytes)
+			       u64 bytes, bool update_stat)
 {
 	u64 bytes_to_set = 0;
 	u64 end;
@@ -1403,7 +1485,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
 
 	bytes_to_set = min(end - offset, bytes);
 
-	bitmap_set_bits(ctl, info, offset, bytes_to_set);
+	bitmap_set_bits(ctl, info, offset, bytes_to_set, update_stat);
 
 	return bytes_to_set;
 
@@ -1451,10 +1533,9 @@ static struct btrfs_free_space_op free_space_op = {
 };
 
 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
-			      struct btrfs_free_space *info)
+			      struct btrfs_free_space *info, bool update_stat)
 {
 	struct btrfs_free_space *bitmap_info;
-	struct btrfs_block_group_cache *block_group = NULL;
 	int added = 0;
 	u64 bytes, offset, bytes_added;
 	int ret;
@@ -1465,49 +1546,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
 	if (!ctl->op->use_bitmap(ctl, info))
 		return 0;
 
-	if (ctl->op == &free_space_op)
-		block_group = ctl->private;
 again:
-	/*
-	 * Since we link bitmaps right into the cluster we need to see if we
-	 * have a cluster here, and if so and it has our bitmap we need to add
-	 * the free space to that bitmap.
-	 */
-	if (block_group && !list_empty(&block_group->cluster_list)) {
-		struct btrfs_free_cluster *cluster;
-		struct rb_node *node;
-		struct btrfs_free_space *entry;
-
-		cluster = list_entry(block_group->cluster_list.next,
-				     struct btrfs_free_cluster,
-				     block_group_list);
-		spin_lock(&cluster->lock);
-		node = rb_first(&cluster->root);
-		if (!node) {
-			spin_unlock(&cluster->lock);
-			goto no_cluster_bitmap;
-		}
-
-		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-		if (!entry->bitmap) {
-			spin_unlock(&cluster->lock);
-			goto no_cluster_bitmap;
-		}
-
-		if (entry->offset == offset_to_bitmap(ctl, offset)) {
-			bytes_added = add_bytes_to_bitmap(ctl, entry,
-							  offset, bytes);
-			bytes -= bytes_added;
-			offset += bytes_added;
-		}
-		spin_unlock(&cluster->lock);
-		if (!bytes) {
-			ret = 1;
-			goto out;
-		}
-	}
-
-no_cluster_bitmap:
 	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
 					 1, 0);
 	if (!bitmap_info) {
@@ -1515,7 +1554,8 @@ no_cluster_bitmap:
 		goto new_bitmap;
 	}
 
-	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
+					  update_stat);
 	bytes -= bytes_added;
 	offset += bytes_added;
 	added = 0;
@@ -1533,6 +1573,12 @@ new_bitmap:
 		info = NULL;
 		goto again;
 	} else {
+		if (current->flags & PF_MEMALLOC) {
+			printk(KERN_INFO "Alloc memory when reclaim "
+			       "the pages\n");
+			return 0;
+		}
+
 		spin_unlock(&ctl->tree_lock);
 
 		/* no pre-allocated info, allocate a new one */
@@ -1635,7 +1681,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
 	 * extent then we know we're going to have to allocate a new extent, so
 	 * before we do that see if we need to drop this into a bitmap
 	 */
-	ret = insert_into_bitmap(ctl, info);
+	ret = insert_into_bitmap(ctl, info, true);
 	if (ret < 0) {
 		goto out;
 	} else if (ret) {
@@ -1829,36 +1875,50 @@ __btrfs_return_cluster_to_free_space(
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry;
-	struct rb_node *node;
+	int ret = 0;
 
 	spin_lock(&cluster->lock);
-	if (cluster->block_group != block_group)
+	if (cluster->block_group != block_group) {
+		spin_unlock(&cluster->lock);
 		goto out;
+	}
 
 	cluster->block_group = NULL;
-	cluster->window_start = 0;
+
+	entry = cluster->info;
+	cluster->info = NULL;
+
 	list_del_init(&cluster->block_group_list);
+	spin_unlock(&cluster->lock);
 
-	node = rb_first(&cluster->root);
-	while (node) {
-		bool bitmap;
+	if (!entry)
+		goto out;
 
-		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-		node = rb_next(&entry->offset_index);
-		rb_erase(&entry->offset_index, &cluster->root);
-
-		bitmap = (entry->bitmap != NULL);
-		if (!bitmap)
-			try_merge_free_space(ctl, entry, false);
-		tree_insert_offset(&ctl->free_space_offset,
-				   entry->offset, &entry->offset_index, bitmap);
+	if (!entry->bytes) {
+		kmem_cache_free(btrfs_free_space_cachep, entry);
+		goto out;
 	}
-	cluster->root = RB_ROOT;
 
+	if (try_merge_free_space(ctl, entry, false))
+		goto link;
+
+	ret = insert_into_bitmap(ctl, entry, false);
+	if (ret < 0)
+		goto out;
+	else if (ret) {
+		ret = 0;
+		goto out;
+	}
+link:
+	ret = __link_free_space(ctl, entry);
+	if (ret) {
+		kmem_cache_free(btrfs_free_space_cachep, entry);
+		printk(KERN_ERR "Duplicate entries in free space cache, "
+				"dumping\n");
+	}
 out:
-	spin_unlock(&cluster->lock);
 	btrfs_put_block_group(block_group);
-	return 0;
+	return ret;
 }
 
 void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
@@ -1889,29 +1949,58 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
 	spin_unlock(&ctl->tree_lock);
 }
 
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+static void __btrfs_reclaim_extent_clusters(
+				struct btrfs_block_group_cache *block_group,
+				u64 to_reclaim)
 {
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_cluster *cluster;
 	struct list_head *head;
+	u64 bytes;
+	bool reclaim_all = (to_reclaim == 0);
 
-	spin_lock(&ctl->tree_lock);
 	while ((head = block_group->cluster_list.next) !=
 	       &block_group->cluster_list) {
 		cluster = list_entry(head, struct btrfs_free_cluster,
 				     block_group_list);
 
 		WARN_ON(cluster->block_group != block_group);
+
+		bytes = cluster->info->bytes;
 		__btrfs_return_cluster_to_free_space(block_group, cluster);
+
+		if (!reclaim_all) {
+			if (to_reclaim > bytes)
+				to_reclaim -= bytes;
+			else {
+				to_reclaim = 0;
+				break;
+			}
+		}
+
 		if (need_resched()) {
-			spin_unlock(&ctl->tree_lock);
+			spin_unlock(&block_group->free_space_ctl->tree_lock);
			cond_resched();
-			spin_lock(&ctl->tree_lock);
+			spin_lock(&block_group->free_space_ctl->tree_lock);
 		}
 	}
+}
+
+void btrfs_reclaim_extent_clusters(struct btrfs_block_group_cache *block_group,
+				   u64 to_reclaim)
+{
+	spin_lock(&block_group->free_space_ctl->tree_lock);
+	__btrfs_reclaim_extent_clusters(block_group, to_reclaim);
+	spin_unlock(&block_group->free_space_ctl->tree_lock);
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+
+	spin_lock(&ctl->tree_lock);
+	__btrfs_reclaim_extent_clusters(block_group, 0);
 	__btrfs_remove_free_space_cache_locked(ctl);
 	spin_unlock(&ctl->tree_lock);
-
 }
 
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
@@ -1991,30 +2080,6 @@ int btrfs_return_cluster_to_free_space(
 	return ret;
 }
 
-static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
-				   struct btrfs_free_cluster *cluster,
-				   struct btrfs_free_space *entry,
-				   u64 bytes, u64 min_start)
-{
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	int err;
-	u64 search_start = cluster->window_start;
-	u64 search_bytes = bytes;
-	u64 ret = 0;
-
-	search_start = min_start;
-	search_bytes = bytes;
-
-	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
-	if (err)
-		return 0;
-
-	ret = search_start;
-	__bitmap_clear_bits(ctl, entry, ret, bytes);
-
-	return ret;
-}
-
 /*
  * given a cluster, try to allocate 'bytes' from it, returns 0
  * if it couldn't find anything suitably large, or a logical disk offset
@@ -2025,56 +2090,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 			     u64 min_start)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	struct btrfs_free_space *entry = NULL;
-	struct rb_node *node;
 	u64 ret = 0;
 
 	spin_lock(&cluster->lock);
-	if (bytes > cluster->max_size)
-		goto out;
-
 	if (cluster->block_group != block_group)
 		goto out;
 
-	node = rb_first(&cluster->root);
-	if (!node)
+	/*
+	 * If we set ->block_group, it means we have filled this cluster,
	 * and ->info also has been set. So we needn't check ->info is
+	 * NULL or not now.
+	 */
+	if (bytes > cluster->info->bytes)
 		goto out;
 
-	entry = rb_entry(node, struct btrfs_free_space, offset_index);
-	while(1) {
-		if (entry->bytes < bytes ||
-		    (!entry->bitmap && entry->offset < min_start)) {
-			node = rb_next(&entry->offset_index);
-			if (!node)
-				break;
-			entry = rb_entry(node, struct btrfs_free_space,
-					 offset_index);
-			continue;
-		}
-
-		if (entry->bitmap) {
-			ret = btrfs_alloc_from_bitmap(block_group,
-						      cluster, entry, bytes,
-						      min_start);
-			if (ret == 0) {
-				node = rb_next(&entry->offset_index);
-				if (!node)
-					break;
-				entry = rb_entry(node, struct btrfs_free_space,
-						 offset_index);
-				continue;
-			}
-		} else {
-			ret = entry->offset;
-
-			entry->offset += bytes;
-			entry->bytes -= bytes;
-		}
+	if (min_start >= cluster->info->offset + cluster->info->bytes)
+		goto out;
 
-		if (entry->bytes == 0)
-			rb_erase(&entry->offset_index, &cluster->root);
-		break;
-	}
+	ret = cluster->info->offset;
+	cluster->info->offset += bytes;
+	cluster->info->bytes -= bytes;
 out:
 	spin_unlock(&cluster->lock);
 
@@ -2082,260 +2117,12 @@ out:
 		return 0;
 
 	spin_lock(&ctl->tree_lock);
-
 	ctl->free_space -= bytes;
-	if (entry->bytes == 0) {
-		ctl->free_extents--;
-		if (entry->bitmap) {
-			kfree(entry->bitmap);
-			ctl->total_bitmaps--;
-			ctl->op->recalc_thresholds(ctl);
-		}
-		kmem_cache_free(btrfs_free_space_cachep, entry);
-	}
-
 	spin_unlock(&ctl->tree_lock);
 
 	return ret;
 }
 
-static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
-				struct btrfs_free_space *entry,
-				struct btrfs_free_cluster *cluster,
-				u64 offset, u64 bytes, u64 min_bytes)
-{
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	unsigned long next_zero;
-	unsigned long i;
-	unsigned long search_bits;
-	unsigned long total_bits;
-	unsigned long found_bits;
-	unsigned long start = 0;
-	unsigned long total_found = 0;
-	int ret;
-	bool found = false;
-
-	i = offset_to_bit(entry->offset, block_group->sectorsize,
-			  max_t(u64, offset, entry->offset));
-	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
-	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
-
-again:
-	found_bits = 0;
-	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
-	     i < BITS_PER_BITMAP;
-	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
-		next_zero = find_next_zero_bit(entry->bitmap,
-					       BITS_PER_BITMAP, i);
-		if (next_zero - i >= search_bits) {
-			found_bits = next_zero - i;
-			break;
-		}
-		i = next_zero;
-	}
-
-	if (!found_bits)
-		return -ENOSPC;
-
-	if (!found) {
-		start = i;
-		found = true;
-	}
-
-	total_found += found_bits;
-
-	if (cluster->max_size < found_bits * block_group->sectorsize)
-		cluster->max_size = found_bits * block_group->sectorsize;
-
-	if (total_found < total_bits) {
-		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
-		if (i - start > total_bits * 2) {
-			total_found = 0;
-			cluster->max_size = 0;
-			found = false;
-		}
-		goto again;
-	}
-
-	cluster->window_start = start * block_group->sectorsize +
-		entry->offset;
-	rb_erase(&entry->offset_index, &ctl->free_space_offset);
-	ret = tree_insert_offset(&cluster->root, entry->offset,
-				 &entry->offset_index, 1);
-	BUG_ON(ret);
-
-	return 0;
-}
-
-/*
- * This searches the block group for just extents to fill the cluster with.
- */
-static noinline int
-setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
-			struct btrfs_free_cluster *cluster,
-			struct list_head *bitmaps, u64 offset, u64 bytes,
-			u64 min_bytes)
-{
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	struct btrfs_free_space *first = NULL;
-	struct btrfs_free_space *entry = NULL;
-	struct btrfs_free_space *prev = NULL;
-	struct btrfs_free_space *last;
-	struct rb_node *node;
-	u64 window_start;
-	u64 window_free;
-	u64 max_extent;
-	u64 max_gap = 128 * 1024;
-
-	entry = tree_search_offset(ctl, offset, 0, 1);
-	if (!entry)
-		return -ENOSPC;
-
-	/*
-	 * We don't want bitmaps, so just move along until we find a normal
-	 * extent entry.
-	 */
-	while (entry->bitmap) {
-		if (list_empty(&entry->list))
-			list_add_tail(&entry->list, bitmaps);
-		node = rb_next(&entry->offset_index);
-		if (!node)
-			return -ENOSPC;
-		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-	}
-
-	window_start = entry->offset;
-	window_free = entry->bytes;
-	max_extent = entry->bytes;
-	first = entry;
-	last = entry;
-	prev = entry;
-
-	while (window_free <= min_bytes) {
-		node = rb_next(&entry->offset_index);
-		if (!node)
-			return -ENOSPC;
-		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-
-		if (entry->bitmap) {
-			if (list_empty(&entry->list))
-				list_add_tail(&entry->list, bitmaps);
-			continue;
-		}
-
-		/*
-		 * we haven't filled the empty size and the window is
-		 * very large.  reset and try again
-		 */
-		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
-		    entry->offset - window_start > (min_bytes * 2)) {
-			first = entry;
-			window_start = entry->offset;
-			window_free = entry->bytes;
-			last = entry;
-			max_extent = entry->bytes;
-		} else {
-			last = entry;
-			window_free += entry->bytes;
-			if (entry->bytes > max_extent)
-				max_extent = entry->bytes;
-		}
-		prev = entry;
-	}
-
-	cluster->window_start = first->offset;
-
-	node = &first->offset_index;
-
-	/*
-	 * now we've found our entries, pull them out of the free space
-	 * cache and put them into the cluster rbtree
-	 */
-	do {
-		int ret;
-
-		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-		node = rb_next(&entry->offset_index);
-		if (entry->bitmap)
-			continue;
-
-		rb_erase(&entry->offset_index, &ctl->free_space_offset);
-		ret = tree_insert_offset(&cluster->root, entry->offset,
-					 &entry->offset_index, 0);
-		BUG_ON(ret);
-	} while (node && entry != last);
-
-	cluster->max_size = max_extent;
-
-	return 0;
-}
-
-/*
- * This specifically looks for bitmaps that may work in the cluster, we assume
- * that we have already failed to find extents that will work.
- */
-static noinline int
-setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
-		     struct btrfs_free_cluster *cluster,
-		     struct list_head *bitmaps, u64 offset, u64 bytes,
-		     u64 min_bytes)
-{
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	struct btrfs_free_space *entry;
-	struct rb_node *node;
-	int ret = -ENOSPC;
-
-	if (ctl->total_bitmaps == 0)
-		return -ENOSPC;
-
-	/*
-	 * First check our cached list of bitmaps and see if there is an entry
-	 * here that will work.
-	 */
-	list_for_each_entry(entry, bitmaps, list) {
-		if (entry->bytes < min_bytes)
-			continue;
-		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-					   bytes, min_bytes);
-		if (!ret)
-			return 0;
-	}
-
-	/*
-	 * If we do have entries on our list and we are here then we didn't find
-	 * anything, so go ahead and get the next entry after the last entry in
-	 * this list and start the search from there.
-	 */
-	if (!list_empty(bitmaps)) {
-		entry = list_entry(bitmaps->prev, struct btrfs_free_space,
-				   list);
-		node = rb_next(&entry->offset_index);
-		if (!node)
-			return -ENOSPC;
-		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-		goto search;
-	}
-
-	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
-	if (!entry)
-		return -ENOSPC;
-
-search:
-	node = &entry->offset_index;
-	do {
-		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-		node = rb_next(&entry->offset_index);
-		if (!entry->bitmap)
-			continue;
-		if (entry->bytes < min_bytes)
-			continue;
-		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-					   bytes, min_bytes);
-	} while (ret && node);
-
-	return ret;
-}
-
 /*
  * here we try to find a cluster of blocks in a block group.  The goal
  * is to find at least bytes free and up to empty_size + bytes free.
@@ -2351,11 +2138,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 			     u64 offset, u64 bytes, u64 empty_size)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	struct list_head bitmaps;
-	struct btrfs_free_space *entry, *tmp;
+	struct btrfs_free_space *entry, *info;
 	u64 min_bytes;
 	int ret;
 
+	BUG_ON(cluster->info);
+
 	/* for metadata, allow allocates with more holes */
 	if (btrfs_test_opt(root, SSD_SPREAD)) {
 		min_bytes = bytes + empty_size;
@@ -2369,9 +2157,16 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 			min_bytes = max(bytes, (bytes + empty_size) >> 1);
 		else
 			min_bytes = max(bytes, (bytes + empty_size) >> 4);
+		min_bytes = max(min_bytes, (u64)256 * 1024);
 	} else
 		min_bytes = max(bytes, (bytes + empty_size) >> 2);
 
+	min_bytes = round_up(min_bytes, root->sectorsize);
+
+	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+	if (!info)
+		return -ENOMEM;
+
 	spin_lock(&ctl->tree_lock);
 
 	/*
@@ -2380,6 +2175,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 	 */
 	if (ctl->free_space < min_bytes) {
 		spin_unlock(&ctl->tree_lock);
+		kmem_cache_free(btrfs_free_space_cachep, info);
 		return -ENOSPC;
 	}
 
@@ -2388,26 +2184,44 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 	/* someone already found a cluster, hooray */
 	if (cluster->block_group) {
 		ret = 0;
+		kmem_cache_free(btrfs_free_space_cachep, info);
 		goto out;
 	}
 
-	INIT_LIST_HEAD(&bitmaps);
-	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
-				      bytes, min_bytes);
-	if (ret)
-		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
-					   offset, bytes, min_bytes);
+	bytes = min_bytes;
+	entry = find_free_space(ctl, &offset, &min_bytes);
+	if (!entry) {
+		ret = -ENOSPC;
+		kmem_cache_free(btrfs_free_space_cachep, info);
+		goto out;
+	}
 
-	/* Clear our temporary list */
-	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
-		list_del_init(&entry->list);
+	BUG_ON(min_bytes < bytes);
 
-	if (!ret) {
-		atomic_inc(&block_group->count);
-		list_add_tail(&cluster->block_group_list,
-			      &block_group->cluster_list);
-		cluster->block_group = block_group;
+	ret = 0;
+	if (entry->bitmap) {
+		__bitmap_clear_bits(ctl, entry, offset, bytes);
+		if (!entry->bytes)
+			free_bitmap(ctl, entry);
+	} else {
+		__unlink_free_space(ctl, entry);
+		entry->offset += bytes;
+		entry->bytes -= bytes;
+		if (!entry->bytes)
+			kmem_cache_free(btrfs_free_space_cachep, entry);
+		else
+			__link_free_space(ctl, entry);
 	}
+
+	info->offset = offset;
+	info->bytes = bytes;
+
+	cluster->info = info;
+
+	atomic_inc(&block_group->count);
+	list_add_tail(&cluster->block_group_list,
+		      &block_group->cluster_list);
+	cluster->block_group = block_group;
 out:
 	spin_unlock(&cluster->lock);
 	spin_unlock(&ctl->tree_lock);
@@ -2421,11 +2235,10 @@ out:
 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
 {
 	spin_lock_init(&cluster->lock);
-	spin_lock_init(&cluster->refill_lock);
-	cluster->root = RB_ROOT;
-	cluster->max_size = 0;
+	mutex_init(&cluster->refill_lock);
 	INIT_LIST_HEAD(&cluster->block_group_list);
 	cluster->block_group = NULL;
+	cluster->info = NULL;
 }
 
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
@@ -2665,3 +2478,27 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 	iput(inode);
 	return ret;
 }
+
+struct btrfs_free_cluster *btrfs_alloc_free_cluster(void)
+{
+	struct btrfs_free_cluster *cluster;
+
+	cluster = kmem_cache_zalloc(btrfs_free_cluster_cachep, GFP_NOFS);
+	if (!cluster)
+		return NULL;
+
+	btrfs_init_free_cluster(cluster);
+	return cluster;
+}
+
+void btrfs_release_free_cluster(struct btrfs_free_cluster *cluster)
+{
+	/* return the free space in the cluster to the space cache. */
+	mutex_lock(&cluster->refill_lock);
+	btrfs_return_cluster_to_free_space(NULL, cluster);
+	mutex_unlock(&cluster->refill_lock);
+
+	/* free the cluster. */
+	kmem_cache_free(btrfs_free_cluster_cachep, cluster);
+}
+
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index c27ccba..1f7e75c 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -34,6 +34,7 @@ struct btrfs_free_space_ctl {
 	int extents_thresh;
 	int free_extents;
 	int total_bitmaps;
+	int total_clusters;
 	int unit;
 	u64 start;
 	struct btrfs_free_space_op *op;
@@ -113,4 +114,16 @@ int btrfs_return_cluster_to_free_space(
 			       struct btrfs_free_cluster *cluster);
int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 			   u64 *trimmed, u64 start, u64 end, u64 minlen);
+void btrfs_reclaim_extent_clusters(struct btrfs_block_group_cache *block_group,
+				   u64 to_reclaim);
+struct btrfs_free_cluster *btrfs_alloc_free_cluster(void);
+void btrfs_release_free_cluster(struct btrfs_free_cluster *cluster);
+static inline void btrfs_free_extent_cluster(struct extent_buffer *eb)
+{
+	if (!eb->cluster)
+		return;
+
+	btrfs_release_free_cluster(eb->cluster);
+	eb->cluster = NULL;
+}
 #endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0..d501c1f 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -457,6 +457,7 @@ again:
 	spin_unlock(&root->cache_lock);
 
 	spin_lock(&ctl->tree_lock);
+	WARN_ON(ctl->total_clusters);
 	prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
 	prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
 	prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7a9e01f..38a5da9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -51,7 +51,6 @@
 #include "tree-log.h"
 #include "compression.h"
 #include "locking.h"
-#include "free-space-cache.h"
 #include "inode-map.h"
 
 struct btrfs_iget_args {
@@ -623,11 +622,10 @@ retry:
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-		ret = btrfs_reserve_extent(trans, root,
-					   async_extent->compressed_size,
-					   async_extent->compressed_size,
-					   0, alloc_hint,
-					   (u64)-1, &ins, 1);
+		ret = btrfs_reserve_extent_data(trans, root,
+						async_extent->compressed_size,
+						async_extent->compressed_size,
+						0, alloc_hint, (u64)-1, &ins);
 		btrfs_end_transaction(trans, root);
 
 		if (ret) {
@@ -828,9 +826,9 @@ static noinline int cow_file_range(struct inode *inode,
 		unsigned long op;
 
 		cur_alloc_size = disk_num_bytes;
-		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-					   root->sectorsize, 0, alloc_hint,
-					   (u64)-1, &ins, 1);
+		ret = btrfs_reserve_extent_data(trans, root, cur_alloc_size,
+						root->sectorsize, 0, alloc_hint,
+						(u64)-1, &ins);
 		BUG_ON(ret);
 
 		em = alloc_extent_map();
@@ -5409,8 +5407,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
-	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
-				   alloc_hint, (u64)-1, &ins, 1);
+	ret = btrfs_reserve_extent_data(trans, root, len, root->sectorsize, 0,
+					alloc_hint, (u64)-1, &ins);
 	if (ret) {
 		em = ERR_PTR(ret);
 		goto out;
@@ -7276,8 +7274,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 			}
 		}
 
-		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
-					   0, *alloc_hint, (u64)-1, &ins, 1);
+		ret = btrfs_reserve_extent_data(trans, root, num_bytes,
+						min_size, 0, *alloc_hint,
+						(u64)-1, &ins);
 		if (ret) {
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 970977a..808203c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -347,7 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, NULL,
 				      0, objectid, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639f..21d08e2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -25,6 +25,7 @@
 #include "print-tree.h"
 #include "compat.h"
 #include "tree-log.h"
+#include "free-space-cache.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -1763,6 +1764,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				ret = btrfs_free_reserved_extent(root,
 							 bytenr, blocksize);
 				BUG_ON(ret);
+
+				btrfs_free_extent_cluster(next);
 			}
 			free_extent_buffer(next);
 			continue;
@@ -1832,6 +1835,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len);
 				BUG_ON(ret);
+
+				btrfs_free_extent_cluster(next);
 			}
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -1900,6 +1905,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 			ret = btrfs_free_reserved_extent(log, next->start,
 							 next->len);
 			BUG_ON(ret);
+
+			btrfs_free_extent_cluster(next);
 		}
 	}
=20
-- 
1.7.4
