From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
To: Josef Bacik <jbacik@fb.com>
Cc: linux-btrfs@vger.kernel.org, kernel-team@fb.com
Subject: Re: [PATCH 6/7] Btrfs: kill the btree_inode
Date: Thu, 08 Sep 2016 10:47:28 +0530
Message-ID: <8570191.KlMn4MSC1V@localhost.localdomain>
In-Reply-To: <1472845206-22870-7-git-send-email-jbacik@fb.com>

On Friday, September 02, 2016 03:40:05 PM Josef Bacik wrote:

Please find my comments inline below.

> In order to more efficiently support sub-page blocksizes we need to stop
> allocating pages from pagecache for our metadata.  Instead switch to using the
> account_metadata* counters for making sure we are keeping the system aware of
> how much dirty metadata we have, and use the ->free_cached_objects super
> operation in order to handle freeing up extent buffers.  This greatly simplifies
> how we deal with extent buffers as now we no longer have to tie the page cache
> reclamation stuff to the extent buffer stuff.  This will also allow us to
> simply kmalloc() our data for sub-page blocksizes.
> 
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
>  fs/btrfs/btrfs_inode.h                 |   3 +-
>  fs/btrfs/ctree.c                       |  10 +-
>  fs/btrfs/ctree.h                       |  13 +-
>  fs/btrfs/disk-io.c                     | 389 ++++----------
>  fs/btrfs/extent_io.c                   | 913 ++++++++++++++++++---------------
>  fs/btrfs/extent_io.h                   |  49 +-
>  fs/btrfs/inode.c                       |   6 +-
>  fs/btrfs/root-tree.c                   |   2 +-
>  fs/btrfs/super.c                       |  29 +-
>  fs/btrfs/tests/btrfs-tests.c           |  37 +-
>  fs/btrfs/tests/extent-io-tests.c       |   4 +-
>  fs/btrfs/tests/free-space-tree-tests.c |   4 +-
>  fs/btrfs/tests/qgroup-tests.c          |   4 +-
>  fs/btrfs/transaction.c                 |  11 +-
>  14 files changed, 726 insertions(+), 748 deletions(-)
> 
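The struct btrfs_eb_info definition lives in the extent_io.h changes, which
are not quoted in this reply. For reference while reading the hunks below,
its shape as inferred from the fields this patch touches is roughly the
sketch here; member order, and anything the hunks don't reference, is a
guess:

    /* Inferred from usage in this patch, not the authoritative definition. */
    struct btrfs_eb_info {
            struct btrfs_fs_info *fs_info;
            /* io trees that used to hang off the btree inode */
            struct extent_io_tree io_tree;
            struct extent_io_tree io_failure_tree;
            /* extent buffer radix tree, moved here from btrfs_fs_info */
            spinlock_t buffer_lock;
            struct radix_tree_root buffer_radix;
            /* LRU feeding the super_block cached-object shrinker hooks */
            struct list_lru lru_list;
            /* where the last range_cyclic writeback pass stopped */
            pgoff_t writeback_index;
    };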
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 1a8fa46..ad7b185 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -229,10 +229,9 @@ static inline u64 btrfs_ino(struct inode *inode)
>  	u64 ino = BTRFS_I(inode)->location.objectid;
> 
>  	/*
> -	 * !ino: btree_inode
>  	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
>  	 */
> -	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
> +	if (BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
>  		ino = inode->i_ino;
>  	return ino;
>  }
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index d1c56c9..b267053 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -1373,8 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
> 
>  	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
>  		BUG_ON(tm->slot != 0);
> -		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
> -						eb->len);
> +		eb_rewin = alloc_dummy_extent_buffer(fs_info->eb_info,
> +						     eb->start, eb->len);
>  		if (!eb_rewin) {
>  			btrfs_tree_read_unlock_blocking(eb);
>  			free_extent_buffer(eb);
> @@ -1455,8 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
>  	} else if (old_root) {
>  		btrfs_tree_read_unlock(eb_root);
>  		free_extent_buffer(eb_root);
> -		eb = alloc_dummy_extent_buffer(root->fs_info, logical,
> -					root->nodesize);
> +		eb = alloc_dummy_extent_buffer(root->fs_info->eb_info, logical,
> +					       root->nodesize);
>  	} else {
>  		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
>  		eb = btrfs_clone_extent_buffer(eb_root);
> @@ -1772,7 +1772,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
>  	int err;
> 
>  	if (low > high) {
> -		btrfs_err(eb->fs_info,
> +		btrfs_err(eb->eb_info->fs_info,
>  		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
>  			  __func__, low, high, eb->start,
>  			  btrfs_header_owner(eb), btrfs_header_level(eb));
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 282a031..ee6956c 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -37,6 +37,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/security.h>
>  #include <linux/sizes.h>
> +#include <linux/list_lru.h>
>  #include "extent_io.h"
>  #include "extent_map.h"
>  #include "async-thread.h"
> @@ -675,6 +676,7 @@ struct btrfs_device;
>  struct btrfs_fs_devices;
>  struct btrfs_balance_control;
>  struct btrfs_delayed_root;
> +struct btrfs_eb_info;
> 
>  #define BTRFS_FS_BARRIER			1
>  #define BTRFS_FS_CLOSING_START			2
> @@ -797,7 +799,7 @@ struct btrfs_fs_info {
>  	struct btrfs_super_block *super_for_commit;
>  	struct block_device *__bdev;
>  	struct super_block *sb;
> -	struct inode *btree_inode;
> +	struct btrfs_eb_info *eb_info;
>  	struct backing_dev_info bdi;
>  	struct mutex tree_log_mutex;
>  	struct mutex transaction_kthread_mutex;
> @@ -1042,10 +1044,6 @@ struct btrfs_fs_info {
>  	/* readahead works cnt */
>  	atomic_t reada_works_cnt;
> 
> -	/* Extent buffer radix tree */
> -	spinlock_t buffer_lock;
> -	struct radix_tree_root buffer_radix;
> -
>  	/* next backup root to be overwritten */
>  	int backup_root_index;
> 
> @@ -2884,6 +2882,8 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
> 
>  static inline void free_fs_info(struct btrfs_fs_info *fs_info)
>  {
> +	list_lru_destroy(&fs_info->eb_info->lru_list);
> +	kfree(fs_info->eb_info);
>  	kfree(fs_info->balance_ctl);
>  	kfree(fs_info->delayed_root);
>  	kfree(fs_info->extent_root);
> @@ -3121,9 +3121,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
>  			     struct btrfs_root *new_root,
>  			     struct btrfs_root *parent_root,
>  			     u64 new_dirid);
> -int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
> -			 size_t size, struct bio *bio,
> -			 unsigned long bio_flags);
>  void btrfs_set_range_writeback(void *private_data, u64 start, u64 end);
>  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
>  int btrfs_readpage(struct file *file, struct page *page);
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 9c42e53..03ac601 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -217,56 +217,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
> 
>  #endif
> 
> -/*
> - * extents on the btree inode are pretty simple, there's one extent
> - * that covers the entire device
> - */
> -static struct extent_map *btree_get_extent(struct inode *inode,
> -		struct page *page, size_t pg_offset, u64 start, u64 len,
> -		int create)
> -{
> -	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
> -	struct extent_map *em;
> -	int ret;
> -
> -	read_lock(&em_tree->lock);
> -	em = lookup_extent_mapping(em_tree, start, len);
> -	if (em) {
> -		em->bdev =
> -			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
> -		read_unlock(&em_tree->lock);
> -		goto out;
> -	}
> -	read_unlock(&em_tree->lock);
> -
> -	em = alloc_extent_map();
> -	if (!em) {
> -		em = ERR_PTR(-ENOMEM);
> -		goto out;
> -	}
> -	em->start = 0;
> -	em->len = (u64)-1;
> -	em->block_len = (u64)-1;
> -	em->block_start = 0;
> -	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
> -
> -	write_lock(&em_tree->lock);
> -	ret = add_extent_mapping(em_tree, em, 0);
> -	if (ret == -EEXIST) {
> -		free_extent_map(em);
> -		em = lookup_extent_mapping(em_tree, start, len);
> -		if (!em)
> -			em = ERR_PTR(-EIO);
> -	} else if (ret) {
> -		free_extent_map(em);
> -		em = ERR_PTR(ret);
> -	}
> -	write_unlock(&em_tree->lock);
> -
> -out:
> -	return em;
> -}
> -
>  u32 btrfs_csum_data(char *data, u32 seed, size_t len)
>  {
>  	return btrfs_crc32c(seed, data, len);
> @@ -349,11 +299,11 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
>   * detect blocks that either didn't get written at all or got written
>   * in the wrong place.
>   */
> -static int verify_parent_transid(struct extent_io_tree *io_tree,
> -				 struct extent_buffer *eb, u64 parent_transid,
> +static int verify_parent_transid(struct extent_buffer *eb, u64 parent_transid,
>  				 int atomic)
>  {
>  	struct extent_state *cached_state = NULL;
> +	struct extent_io_tree *io_tree = &eb->eb_info->io_tree;
>  	int ret;
>  	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
> 
> @@ -375,7 +325,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
>  		ret = 0;
>  		goto out;
>  	}
> -	btrfs_err_rl(eb->fs_info,
> +	btrfs_err_rl(eb->eb_info->fs_info,
>  		"parent transid verify failed on %llu wanted %llu found %llu",
>  			eb->start,
>  			parent_transid, btrfs_header_generation(eb));
> @@ -445,7 +395,6 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>  					  struct extent_buffer *eb,
>  					  u64 parent_transid)
>  {
> -	struct extent_io_tree *io_tree;
>  	int failed = 0;
>  	int ret;
>  	int num_copies = 0;
> @@ -453,13 +402,10 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
>  	int failed_mirror = 0;
> 
>  	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
> -	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
>  	while (1) {
> -		ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
> -					       btree_get_extent, mirror_num);
> +		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
>  		if (!ret) {
> -			if (!verify_parent_transid(io_tree, eb,
> -						   parent_transid, 0))
> +			if (!verify_parent_transid(eb, parent_transid, 0))
>  				break;
>  			else
>  				ret = -EIO;
> @@ -504,24 +450,11 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
> 
>  static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
>  {
> -	u64 start = page_offset(page);
> -	u64 found_start;
>  	struct extent_buffer *eb;
> 
>  	eb = (struct extent_buffer *)page->private;
>  	if (page != eb->pages[0])
>  		return 0;
> -
> -	found_start = btrfs_header_bytenr(eb);
> -	/*
> -	 * Please do not consolidate these warnings into a single if.
> -	 * It is useful to know what went wrong.
> -	 */
> -	if (WARN_ON(found_start != start))
> -		return -EUCLEAN;
> -	if (WARN_ON(!PageUptodate(page)))
> -		return -EUCLEAN;
> -
>  	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
>  			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
> 
> @@ -619,8 +552,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
>  	u64 found_start;
>  	int found_level;
>  	struct extent_buffer *eb;
> -	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
> -	struct btrfs_fs_info *fs_info = root->fs_info;
> +	struct btrfs_root *root;
> +	struct btrfs_fs_info *fs_info;
>  	int ret = 0;
>  	int reads_done;
> 
> @@ -633,6 +566,8 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
>  	 * in memory.  Make sure we have a ref for all this other checks
>  	 */
>  	extent_buffer_get(eb);
> +	fs_info = eb->eb_info->fs_info;
> +	root = fs_info->tree_root;
> 
>  	reads_done = atomic_dec_and_test(&eb->io_pages);
>  	if (!reads_done)
> @@ -693,11 +628,19 @@ err:
>  		/*
>  		 * our io error hook is going to dec the io pages
>  		 * again, we have to make sure it has something
> -		 * to decrement
> +		 * to decrement.
> +		 *
> +		 * TODO: Kill this, we've re-arranged how this works now so we
> +		 * don't need to do this io_pages dance.
>  		 */
>  		atomic_inc(&eb->io_pages);
>  		clear_extent_buffer_uptodate(eb);
>  	}
> +	if (reads_done) {
> +		clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +		smp_mb__after_atomic();
> +		wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
> +	}
>  	free_extent_buffer(eb);
>  out:
>  	return ret;
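The new EXTENT_BUFFER_READING bit replaces waiting on page locks for read
completion. The reader side is not in this hunk; presumably it pairs with
the wake_up_bit() above via something like this hypothetical waiter:

    /* hypothetical waiter in the read path, mirroring the wake above */
    wait_on_bit(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE);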
> @@ -712,7 +655,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
>  	eb->read_mirror = failed_mirror;
>  	atomic_dec(&eb->io_pages);
>  	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
> -		btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO);
> +		btree_readahead_hook(eb->eb_info->fs_info, eb, eb->start, -EIO);
>  	return -EIO;	/* we fixed nothing */
>  }
> 
> @@ -884,15 +827,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
>  	return 0;
>  }
> 
> -static int btree_csum_one_bio(struct bio *bio)
> +static int btree_csum_one_bio(struct btrfs_fs_info *fs_info, struct bio *bio)
>  {
>  	struct bio_vec *bvec;
> -	struct btrfs_root *root;
>  	int i, ret = 0;
> 
>  	bio_for_each_segment_all(bvec, bio, i) {
> -		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
> -		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
> +		ret = csum_dirty_buffer(fs_info, bvec->bv_page);
>  		if (ret)
>  			break;
>  	}
> @@ -904,25 +845,27 @@ static int __btree_submit_bio_start(void *private_data, struct bio *bio,
>  				    int mirror_num, unsigned long bio_flags,
>  				    u64 bio_offset)
>  {
> +	struct btrfs_eb_info *eb_info = private_data;
>  	/*
>  	 * when we're called for a write, we're already in the async
>  	 * submission context.  Just jump into btrfs_map_bio
>  	 */
> -	return btree_csum_one_bio(bio);
> +	return btree_csum_one_bio(eb_info->fs_info, bio);
>  }
> 
>  static int __btree_submit_bio_done(void *private_data, struct bio *bio,
>  				 int mirror_num, unsigned long bio_flags,
>  				 u64 bio_offset)
>  {
> -	struct inode *inode = private_data;
> +	struct btrfs_eb_info *eb_info = private_data;
> +	struct btrfs_root *root = eb_info->fs_info->tree_root;
>  	int ret;
> 
>  	/*
>  	 * when we're called for a write, we're already in the async
>  	 * submission context.  Just jump into btrfs_map_bio
>  	 */
> -	ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1);
> +	ret = btrfs_map_bio(root, bio, mirror_num, 1);
>  	if (ret) {
>  		bio->bi_error = ret;
>  		bio_endio(bio);
> @@ -930,7 +873,7 @@ static int __btree_submit_bio_done(void *private_data, struct bio *bio,
>  	return ret;
>  }
> 
> -static int check_async_write(struct inode *inode, unsigned long bio_flags)
> +static int check_async_write(unsigned long bio_flags)
>  {
>  	if (bio_flags & EXTENT_BIO_TREE_LOG)
>  		return 0;
> @@ -945,8 +888,9 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
>  				 int mirror_num, unsigned long bio_flags,
>  				 u64 bio_offset)
>  {
> -	struct inode *inode = private_data;
> -	int async = check_async_write(inode, bio_flags);
> +	struct btrfs_eb_info *eb_info = private_data;
> +	struct btrfs_root *root = eb_info->fs_info->tree_root;
> +	int async = check_async_write(bio_flags);
>  	int ret;
> 
>  	if (bio_op(bio) != REQ_OP_WRITE) {
> @@ -954,23 +898,22 @@ static int btree_submit_bio_hook(void *private_data, struct bio *bio,
>  		 * called for a read, do the setup so that checksum validation
>  		 * can happen in the async kernel threads
>  		 */
> -		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
> -					  bio, BTRFS_WQ_ENDIO_METADATA);
> +		ret = btrfs_bio_wq_end_io(eb_info->fs_info, bio,
> +					  BTRFS_WQ_ENDIO_METADATA);
>  		if (ret)
>  			goto out_w_error;
> -		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
> +		ret = btrfs_map_bio(root, bio, mirror_num, 0);
>  	} else if (!async) {
> -		ret = btree_csum_one_bio(bio);
> +		ret = btree_csum_one_bio(eb_info->fs_info, bio);
>  		if (ret)
>  			goto out_w_error;
> -		ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0);
> +		ret = btrfs_map_bio(root, bio, mirror_num, 0);
>  	} else {
>  		/*
>  		 * kthread helpers are used to submit writes so that
>  		 * checksumming can happen in parallel across all CPUs
>  		 */
> -		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
> -					  bio, mirror_num, 0,
> +		ret = btrfs_wq_submit_bio(eb_info->fs_info, bio, mirror_num, 0,
>  					  bio_offset, private_data,
>  					  __btree_submit_bio_start,
>  					  __btree_submit_bio_done);
> @@ -986,118 +929,14 @@ out_w_error:
>  	return ret;
>  }
> 
> -#ifdef CONFIG_MIGRATION
> -static int btree_migratepage(struct address_space *mapping,
> -			struct page *newpage, struct page *page,
> -			enum migrate_mode mode)
> -{
> -	/*
> -	 * we can't safely write a btree page from here,
> -	 * we haven't done the locking hook
> -	 */
> -	if (PageDirty(page))
> -		return -EAGAIN;
> -	/*
> -	 * Buffers may be managed in a filesystem specific way.
> -	 * We must have no buffers or drop them.
> -	 */
> -	if (page_has_private(page) &&
> -	    !try_to_release_page(page, GFP_KERNEL))
> -		return -EAGAIN;
> -	return migrate_page(mapping, newpage, page, mode);
> -}
> -#endif
> -
> -
> -static int btree_writepages(struct address_space *mapping,
> -			    struct writeback_control *wbc)
> -{
> -	struct btrfs_fs_info *fs_info;
> -	int ret;
> -
> -	if (wbc->sync_mode == WB_SYNC_NONE) {
> -
> -		if (wbc->for_kupdate)
> -			return 0;
> -
> -		fs_info = BTRFS_I(mapping->host)->root->fs_info;
> -		/* this is a bit racy, but that's ok */
> -		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
> -					     BTRFS_DIRTY_METADATA_THRESH);
> -		if (ret < 0)
> -			return 0;
> -	}
> -	return btree_write_cache_pages(mapping, wbc);
> -}
> -
> -static int btree_readpage(struct file *file, struct page *page)
> -{
> -	struct extent_io_tree *tree;
> -	tree = &BTRFS_I(page->mapping->host)->io_tree;
> -	return extent_read_full_page(tree, page, btree_get_extent, 0);
> -}
> -
> -static int btree_releasepage(struct page *page, gfp_t gfp_flags)
> -{
> -	if (PageWriteback(page) || PageDirty(page))
> -		return 0;
> -
> -	return try_release_extent_buffer(page);
> -}
> -
> -static void btree_invalidatepage(struct page *page, unsigned int offset,
> -				 unsigned int length)
> -{
> -	struct extent_io_tree *tree;
> -	tree = &BTRFS_I(page->mapping->host)->io_tree;
> -	extent_invalidatepage(tree, page, offset);
> -	btree_releasepage(page, GFP_NOFS);
> -	if (PagePrivate(page)) {
> -		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
> -			   "page private not zero on page %llu",
> -			   (unsigned long long)page_offset(page));
> -		ClearPagePrivate(page);
> -		set_page_private(page, 0);
> -		put_page(page);
> -	}
> -}
> -
> -static int btree_set_page_dirty(struct page *page)
> -{
> -#ifdef DEBUG
> -	struct extent_buffer *eb;
> -
> -	BUG_ON(!PagePrivate(page));
> -	eb = (struct extent_buffer *)page->private;
> -	BUG_ON(!eb);
> -	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
> -	BUG_ON(!atomic_read(&eb->refs));
> -	btrfs_assert_tree_locked(eb);
> -#endif
> -	return __set_page_dirty_nobuffers(page);
> -}
> -
> -static const struct address_space_operations btree_aops = {
> -	.readpage	= btree_readpage,
> -	.writepages	= btree_writepages,
> -	.releasepage	= btree_releasepage,
> -	.invalidatepage = btree_invalidatepage,
> -#ifdef CONFIG_MIGRATION
> -	.migratepage	= btree_migratepage,
> -#endif
> -	.set_page_dirty = btree_set_page_dirty,
> -};
> -
>  void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
>  {
>  	struct extent_buffer *buf = NULL;
> -	struct inode *btree_inode = root->fs_info->btree_inode;
> 
>  	buf = btrfs_find_create_tree_block(root, bytenr);
>  	if (IS_ERR(buf))
>  		return;
> -	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
> -				 buf, WAIT_NONE, btree_get_extent, 0);
> +	read_extent_buffer_pages(buf, WAIT_NONE, 0);
>  	free_extent_buffer(buf);
>  }
> 
> @@ -1105,8 +944,6 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>  			 int mirror_num, struct extent_buffer **eb)
>  {
>  	struct extent_buffer *buf = NULL;
> -	struct inode *btree_inode = root->fs_info->btree_inode;
> -	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
>  	int ret;
> 
>  	buf = btrfs_find_create_tree_block(root, bytenr);
> @@ -1115,8 +952,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
> 
>  	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
> 
> -	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
> -				       btree_get_extent, mirror_num);
> +	ret = read_extent_buffer_pages(buf, WAIT_PAGE_LOCK, mirror_num);
>  	if (ret) {
>  		free_extent_buffer(buf);
>  		return ret;
> @@ -1136,29 +972,29 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
>  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
>  					    u64 bytenr)
>  {
> -	return find_extent_buffer(fs_info, bytenr);
> +	return find_extent_buffer(fs_info->eb_info, bytenr);
>  }
> 
>  struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
>  						 u64 bytenr)
>  {
>  	if (btrfs_is_testing(root->fs_info))
> -		return alloc_test_extent_buffer(root->fs_info, bytenr,
> -				root->nodesize);
> +		return alloc_test_extent_buffer(root->fs_info->eb_info, bytenr,
> +						root->nodesize);
>  	return alloc_extent_buffer(root->fs_info, bytenr);
>  }
> 
> 
>  int btrfs_write_tree_block(struct extent_buffer *buf)
>  {
> -	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
> -					buf->start + buf->len - 1);
> +	return btree_write_range(buf->eb_info->fs_info, buf->start,
> +				 buf->start + buf->len - 1);
>  }
> 
>  int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
>  {
> -	return filemap_fdatawait_range(buf->pages[0]->mapping,
> -				       buf->start, buf->start + buf->len - 1);
> +	return btree_wait_range(buf->eb_info->fs_info, buf->start,
> +				buf->start + buf->len - 1);
>  }
> 
>  struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
> @@ -1188,14 +1024,10 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
>  	    fs_info->running_transaction->transid) {
>  		btrfs_assert_tree_locked(buf);
> 
> -		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
> +		if (clear_extent_buffer_dirty(buf))
>  			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
>  					     -buf->len,
>  					     fs_info->dirty_metadata_batch);
> -			/* ugh, clear_extent_buffer_dirty needs to lock the page */
> -			btrfs_set_lock_blocking(buf);
> -			clear_extent_buffer_dirty(buf);
> -		}
>  	}
>  }
> 
> @@ -2247,33 +2079,20 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
>  	init_waitqueue_head(&fs_info->balance_wait_q);
>  }
> 
> -static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
> -				   struct btrfs_root *tree_root)
> +int btrfs_init_eb_info(struct btrfs_fs_info *fs_info)
>  {
> -	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
> -	set_nlink(fs_info->btree_inode, 1);
> -	/*
> -	 * we set the i_size on the btree inode to the max possible int.
> -	 * the real end of the address space is determined by all of
> -	 * the devices in the system
> -	 */
> -	fs_info->btree_inode->i_size = OFFSET_MAX;
> -	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
> -
> -	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
> -	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
> -			    fs_info->btree_inode);
> -	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
> -	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
> -
> -	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
> -
> -	BTRFS_I(fs_info->btree_inode)->root = tree_root;
> -	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
> -	       sizeof(struct btrfs_key));
> -	set_bit(BTRFS_INODE_DUMMY,
> -		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
> -	btrfs_insert_inode_hash(fs_info->btree_inode);
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +
> +	eb_info->fs_info = fs_info;
> +	extent_io_tree_init(&eb_info->io_tree, eb_info);
> +	eb_info->io_tree.track_uptodate = 0;
> +	eb_info->io_tree.ops = &btree_extent_io_ops;
> +	extent_io_tree_init(&eb_info->io_failure_tree, eb_info);
> +	INIT_RADIX_TREE(&eb_info->buffer_radix, GFP_ATOMIC);
> +	spin_lock_init(&eb_info->buffer_lock);
> +	if (list_lru_init(&eb_info->lru_list))
> +		return -ENOMEM;
> +	return 0;
>  }
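Worth noting: the metadata io_failure_tree moves in here as well, where it
previously was the btree inode's BTRFS_I(inode)->io_failure_tree. Setup and
teardown are now split across open_ctree() and free_fs_info(); condensed
from the hunks in this patch:

    /* open_ctree(): */
    fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info), GFP_KERNEL);
    if (!fs_info->eb_info)
            goto fail_alloc;                /* -ENOMEM */
    if (btrfs_init_eb_info(fs_info))        /* list_lru_init() can fail */
            goto fail_alloc;

    /* free_fs_info(): */
    list_lru_destroy(&fs_info->eb_info->lru_list);
    kfree(fs_info->eb_info);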
> 
>  static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
> @@ -2566,16 +2385,7 @@ int open_ctree(struct super_block *sb,
>  		goto fail_delalloc_bytes;
>  	}
> 
> -	fs_info->btree_inode = new_inode(sb);
> -	if (!fs_info->btree_inode) {
> -		err = -ENOMEM;
> -		goto fail_bio_counter;
> -	}
> -
> -	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> -
>  	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
> -	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
>  	INIT_LIST_HEAD(&fs_info->trans_list);
>  	INIT_LIST_HEAD(&fs_info->dead_roots);
>  	INIT_LIST_HEAD(&fs_info->delayed_iputs);
> @@ -2590,7 +2400,6 @@ int open_ctree(struct super_block *sb,
>  	spin_lock_init(&fs_info->tree_mod_seq_lock);
>  	spin_lock_init(&fs_info->super_lock);
>  	spin_lock_init(&fs_info->qgroup_op_lock);
> -	spin_lock_init(&fs_info->buffer_lock);
>  	spin_lock_init(&fs_info->unused_bgs_lock);
>  	rwlock_init(&fs_info->tree_mod_log_lock);
>  	mutex_init(&fs_info->unused_bg_unpin_mutex);
> @@ -2643,7 +2452,7 @@ int open_ctree(struct super_block *sb,
>  					GFP_KERNEL);
>  	if (!fs_info->delayed_root) {
>  		err = -ENOMEM;
> -		goto fail_iput;
> +		goto fail_alloc;
>  	}
>  	btrfs_init_delayed_root(fs_info->delayed_root);
> 
> @@ -2658,7 +2467,15 @@ int open_ctree(struct super_block *sb,
>  	sb->s_blocksize_bits = blksize_bits(4096);
>  	sb->s_bdi = &fs_info->bdi;
> 
> -	btrfs_init_btree_inode(fs_info, tree_root);
> +	fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info), GFP_KERNEL);
> +	if (!fs_info->eb_info) {
> +		err = -ENOMEM;
> +		goto fail_alloc;
> +	}
> +	if (btrfs_init_eb_info(fs_info)) {
> +		err = -ENOMEM;
> +		goto fail_alloc;
> +	}
> 
>  	spin_lock_init(&fs_info->block_group_cache_lock);
>  	fs_info->block_group_cache_tree = RB_ROOT;
> @@ -3085,6 +2902,14 @@ retry_root_backup:
>  	if (sb->s_flags & MS_RDONLY)
>  		return 0;
> 
> +	/*
> +	 * We need to make sure we are on the bdi's dirty list so we get
> +	 * writeback requests for our fs properly.
> +	 */
> +	spin_lock(&fs_info->bdi.sb_list_lock);
> +	list_add_tail(&fs_info->bdi.dirty_sb_list, &sb->s_bdi_list);
> +	spin_unlock(&fs_info->bdi.sb_list_lock);
> +
>  	if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
>  	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
>  		btrfs_info(fs_info, "creating free space tree");
> @@ -3180,7 +3005,8 @@ fail_cleaner:
>  	 * make sure we're done with the btree inode before we stop our
>  	 * kthreads
>  	 */
> -	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
> +	btree_write_range(fs_info, 0, (u64)-1);
> +	btree_wait_range(fs_info, 0, (u64)-1);
> 
>  fail_sysfs:
>  	btrfs_sysfs_remove_mounted(fs_info);
> @@ -3194,16 +3020,11 @@ fail_block_groups:
> 
>  fail_tree_roots:
>  	free_root_pointers(fs_info, 1);
> -	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
> -
> +	btrfs_invalidate_eb_info(fs_info->eb_info);
>  fail_sb_buffer:
>  	btrfs_stop_all_workers(fs_info);
>  fail_alloc:
> -fail_iput:
>  	btrfs_mapping_tree_free(&fs_info->mapping_tree);
> -
> -	iput(fs_info->btree_inode);
> -fail_bio_counter:
>  	percpu_counter_destroy(&fs_info->bio_counter);
>  fail_delalloc_bytes:
>  	percpu_counter_destroy(&fs_info->delalloc_bytes);
> @@ -3908,14 +3729,11 @@ void close_ctree(struct btrfs_root *root)
>  	 * we must make sure there is not any read request to
>  	 * submit after we stopping all workers.
>  	 */
> -	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
>  	btrfs_stop_all_workers(fs_info);
> 
>  	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
>  	free_root_pointers(fs_info, 1);
> 
> -	iput(fs_info->btree_inode);
> -
>  #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
>  	if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
>  		btrfsic_unmount(root, fs_info->fs_devices);
> @@ -3924,6 +3742,8 @@ void close_ctree(struct btrfs_root *root)
>  	btrfs_close_devices(fs_info->fs_devices);
>  	btrfs_mapping_tree_free(&fs_info->mapping_tree);
> 
> +	btrfs_invalidate_eb_info(fs_info->eb_info);
> +
>  	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
>  	percpu_counter_destroy(&fs_info->delalloc_bytes);
>  	percpu_counter_destroy(&fs_info->bio_counter);
> @@ -3951,14 +3771,12 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
>  			  int atomic)
>  {
>  	int ret;
> -	struct inode *btree_inode = buf->pages[0]->mapping->host;
> 
>  	ret = extent_buffer_uptodate(buf);
>  	if (!ret)
>  		return ret;
> 
> -	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
> -				    parent_transid, atomic);
> +	ret = verify_parent_transid(buf, parent_transid, atomic);
>  	if (ret == -EAGAIN)
>  		return ret;
>  	return !ret;
> @@ -3979,7 +3797,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
>  	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
>  		return;
>  #endif
> -	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
> +	root = buf->eb_info->fs_info->tree_root;
>  	btrfs_assert_tree_locked(buf);
>  	if (transid != root->fs_info->generation)
>  		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
> @@ -4015,10 +3833,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
> 
>  	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
>  				     BTRFS_DIRTY_METADATA_THRESH);
> -	if (ret > 0) {
> +	if (ret > 0)
>  		balance_dirty_pages_ratelimited(&root->fs_info->bdi,
>  						root->fs_info->sb);
> -	}
>  }
> 
>  void btrfs_btree_balance_dirty(struct btrfs_root *root)
> @@ -4033,7 +3850,7 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
> 
>  int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
>  {
> -	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
> +	struct btrfs_root *root = buf->eb_info->fs_info->tree_root;
>  	return btree_read_extent_buffer_pages(root, buf, parent_transid);
>  }
> 
> @@ -4376,10 +4193,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
>  			if (!eb)
>  				continue;
>  			wait_on_extent_buffer_writeback(eb);
> -
> -			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
> -					       &eb->bflags))
> -				clear_extent_buffer_dirty(eb);
> +			clear_extent_buffer_dirty(eb);
>  			free_extent_buffer_stale(eb);
>  		}
>  	}
> @@ -4504,16 +4318,37 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
> 
>  static struct btrfs_fs_info *btree_fs_info(void *private_data)
>  {
> -	struct inode *inode = private_data;
> -	return btrfs_sb(inode->i_sb);
> +	struct btrfs_eb_info *eb_info = private_data;
> +	return eb_info->fs_info;
> +}
> +
> +static int btree_merge_bio_hook(struct page *page, unsigned long offset,
> +				size_t size, struct bio *bio,
> +				unsigned long bio_flags)
> +{
> +	struct extent_buffer *eb = (struct extent_buffer *)page->private;
> +	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
> +	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
> +	u64 length = 0;
> +	u64 map_length;
> +	int ret;
> +
> +	length = bio->bi_iter.bi_size;
> +	map_length = length;
> +	ret = btrfs_map_block(fs_info, bio_op(bio), logical, &map_length,
> +			      NULL, 0);
> +	if (ret < 0)
> +		return ret;
> +	if (map_length < length + size)
> +		return 1;
> +	return 0;
>  }
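To make the return convention concrete: btrfs_map_block() returns in
map_length how many bytes starting at logical can be reached in one mapping
(e.g. up to a stripe boundary). With hypothetical numbers, if the bio
already holds length = 60K, the page being merged adds size = 4K, and
map_length comes back as 64K, then length + size <= map_length and the hook
returns 0 (merge allowed); with map_length = 60K it returns 1 and the
caller must start a new bio. This is the same check the data path's
btrfs_merge_bio_hook() does, minus its compressed-bio special case.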
> 
>  static const struct extent_io_ops btree_extent_io_ops = {
>  	.readpage_end_io_hook = btree_readpage_end_io_hook,
>  	.readpage_io_failed_hook = btree_io_failed_hook,
>  	.submit_bio_hook = btree_submit_bio_hook,
> -	/* note we're sharing with inode.c for the merge bio hook */
> -	.merge_bio_hook = btrfs_merge_bio_hook,
> +	.merge_bio_hook = btree_merge_bio_hook,
>  	.tree_fs_info = btree_fs_info,
>  	.set_range_writeback = btrfs_set_range_writeback,
>  };
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 5dcdd3e..5c18a49 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -75,8 +75,8 @@ void btrfs_leak_debug_check(void)
>  	while (!list_empty(&buffers)) {
>  		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
>  		printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
> -		       "refs %d\n",
> -		       eb->start, eb->len, atomic_read(&eb->refs));
> +		       "bflags %lu refs %d\n",
> +		       eb->start, eb->len, eb->bflags, atomic_read(&eb->refs));
>  		list_del(&eb->leak_list);
>  		kmem_cache_free(extent_buffer_cache, eb);
>  	}
> @@ -3538,7 +3538,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
>  			  struct btrfs_fs_info *fs_info,
>  			  struct extent_page_data *epd)
>  {
> -	unsigned long i, num_pages;
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
>  	int flush = 0;
>  	int ret = 0;
> 
> @@ -3585,37 +3585,42 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
> 
>  	btrfs_tree_unlock(eb);
> 
> -	if (!ret)
> -		return ret;
> -
> -	num_pages = num_extent_pages(eb->start, eb->len);
> -	for (i = 0; i < num_pages; i++) {
> -		struct page *p = eb->pages[i];
> -
> -		if (!trylock_page(p)) {
> -			if (!flush) {
> -				flush_write_bio(epd);
> -				flush = 1;
> -			}
> -			lock_page(p);
> -		}
> +	/*
> +	 * We cleared dirty on this buffer, we need to adjust the radix tags.
> +	 * We do the actual page accounting in write_one_eb.
> +	 */
> +	if (ret) {
> +		spin_lock_irq(&eb_info->buffer_lock);
> +		radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
> +				   PAGECACHE_TAG_WRITEBACK);
> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +				     PAGECACHE_TAG_DIRTY);
> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +				     PAGECACHE_TAG_TOWRITE);
> +		spin_unlock_irq(&eb_info->buffer_lock);
>  	}
> -
>  	return ret;
>  }
> 
>  static void end_extent_buffer_writeback(struct extent_buffer *eb)
>  {
> -	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
> -	smp_mb__after_atomic();
> -	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
> +	if (test_and_clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
> +		struct btrfs_eb_info *eb_info = eb->eb_info;
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&eb_info->buffer_lock, flags);
> +		radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +				     PAGECACHE_TAG_WRITEBACK);
> +		spin_unlock_irqrestore(&eb_info->buffer_lock, flags);
> +		wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
> +	}
>  }
> 
>  static void set_btree_ioerr(struct page *page)
>  {
>  	struct extent_buffer *eb = (struct extent_buffer *)page->private;
> +	struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
> 
> -	SetPageError(page);
>  	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>  		return;
> 
> @@ -3624,8 +3629,7 @@ static void set_btree_ioerr(struct page *page)
>  	 * failed, increment the counter transaction->eb_write_errors.
>  	 * We do this because while the transaction is running and before it's
>  	 * committing (when we call filemap_fdata[write|wait]_range against
> -	 * the btree inode), we might have
> -	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
> +	 * the btree inode), we might have write_metadata() called - if it
>  	 * returns an error or an error happens during writeback, when we're
>  	 * committing the transaction we wouldn't know about it, since the pages
>  	 * can be no longer dirty nor marked anymore for writeback (if a
> @@ -3659,13 +3663,13 @@ static void set_btree_ioerr(struct page *page)
>  	 */
>  	switch (eb->log_index) {
>  	case -1:
> -		set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
> +		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
>  		break;
>  	case 0:
> -		set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
> +		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
>  		break;
>  	case 1:
> -		set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
> +		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
>  		break;
>  	default:
>  		BUG(); /* unexpected, logic error */
> @@ -3686,16 +3690,13 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
>  		done = atomic_dec_and_test(&eb->io_pages);
> 
>  		if (bio->bi_error ||
> -		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
> -			ClearPageUptodate(page);
> +		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
>  			set_btree_ioerr(page);
> -		}
> -
> -		end_page_writeback(page);
> 
> +		account_metadata_end_writeback(page,
> +					       &eb->eb_info->fs_info->bdi);
>  		if (!done)
>  			continue;
> -
>  		end_extent_buffer_writeback(eb);
>  	}
> 
> @@ -3708,7 +3709,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>  			struct extent_page_data *epd)
>  {
>  	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
> -	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
> +	struct extent_io_tree *tree = &fs_info->eb_info->io_tree;
>  	u64 offset = eb->start;
>  	unsigned long i, num_pages;
>  	unsigned long bio_flags = 0;
> @@ -3724,8 +3725,6 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>  	for (i = 0; i < num_pages; i++) {
>  		struct page *p = eb->pages[i];
> 
> -		clear_page_dirty_for_io(p);
> -		set_page_writeback(p);
>  		ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc,
>  					 p, offset >> 9, PAGE_SIZE, 0, bdev,
>  					 &epd->bio, -1,
> @@ -3734,34 +3733,90 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
>  		epd->bio_flags = bio_flags;
>  		if (ret) {
>  			set_btree_ioerr(p);
> -			end_page_writeback(p);
>  			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
>  				end_extent_buffer_writeback(eb);
>  			ret = -EIO;
>  			break;
>  		}
> +		account_metadata_writeback(p, &fs_info->bdi);
>  		offset += PAGE_SIZE;
>  		update_nr_written(p, wbc, 1);
> -		unlock_page(p);
>  	}
> 
> -	if (unlikely(ret)) {
> -		for (; i < num_pages; i++) {
> -			struct page *p = eb->pages[i];
> -			clear_page_dirty_for_io(p);
> -			unlock_page(p);
> +	return ret;
> +}
> +
> +#define EB_TAG_BATCH 4096
> +static void tag_ebs_for_writeback(struct btrfs_eb_info *eb_info, pgoff_t start,
> +				  pgoff_t end)
> +{
> +	unsigned long tagged;
> +
> +	do {
> +		spin_lock_irq(&eb_info->buffer_lock);
> +		tagged = radix_tree_range_tag_if_tagged(&eb_info->buffer_radix,
> +							&start, end,
> +							EB_TAG_BATCH,
> +							PAGECACHE_TAG_DIRTY,
> +							PAGECACHE_TAG_TOWRITE);
> +		spin_unlock_irq(&eb_info->buffer_lock);
> +		cond_resched();
> +	} while (tagged >= EB_TAG_BATCH && start);
> +}
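This mirrors tag_pages_for_writeback() in mm/page-writeback.c: for
WB_SYNC_ALL, every buffer that is dirty at the start of the walk gets the
TOWRITE tag in one batched pass, so buffers redirtied while we are writing
cannot livelock the loop. Putting the hunks together, a buffer's radix tags
move roughly like:

    DIRTY --tag_ebs_for_writeback()--> DIRTY|TOWRITE
          --lock_extent_buffer_for_io()--> WRITEBACK (DIRTY, TOWRITE cleared)
          --end_extent_buffer_writeback()--> none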
> +
> +static unsigned eb_lookup_tag(struct btrfs_eb_info *eb_info,
> +			      struct extent_buffer **ebs, pgoff_t *index,
> +			      int tag, unsigned nr)
> +{
> +	struct radix_tree_iter iter;
> +	void **slot;
> +	unsigned ret = 0;
> +
> +	if (unlikely(!nr))
> +		return 0;
> +
> +	rcu_read_lock();
> +	radix_tree_for_each_tagged(slot, &eb_info->buffer_radix, &iter, *index,
> +				   tag) {
> +		struct extent_buffer *eb;
> +repeat:
> +		eb = radix_tree_deref_slot(slot);
> +		if (unlikely(!eb))
> +			continue;
> +
> +		if (radix_tree_exception(eb)) {
> +			if (radix_tree_deref_retry(eb)) {
> +				slot = radix_tree_iter_retry(&iter);
> +				continue;
> +			}
> +			continue;
>  		}
> -	}
> 
> +		if (unlikely(!atomic_inc_not_zero(&eb->refs)))
> +			continue;
> +
> +		if (unlikely(eb != *slot)) {
> +			free_extent_buffer(eb);
> +			goto repeat;
> +		}
> +
> +		ebs[ret] = eb;
> +		if (++ret == nr)
> +			break;
> +	}
> +	rcu_read_unlock();
> +	if (ret)
> +		*index = (ebs[ret - 1]->start >> PAGE_SHIFT) + 1;
>  	return ret;
>  }
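For anyone new to the pattern, this is the standard RCU radix-tree lookup
(compare find_get_pages_tag() in mm/filemap.c): dereference the slot under
rcu_read_lock(), retry on a deref exception, take a reference with
atomic_inc_not_zero() so a buffer already headed to zero refs is skipped,
then re-check that the slot still points at the same buffer, since the slot
can be reused between the dereference and the ref bump. That final re-check
is what makes the lookup safe without taking buffer_lock.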
> 
> -int btree_write_cache_pages(struct address_space *mapping,
> +#define EBVEC_SIZE 16
> +static int btree_write_cache_pages(struct btrfs_fs_info *fs_info,
>  				   struct writeback_control *wbc)
>  {
> -	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
> -	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
> -	struct extent_buffer *eb, *prev_eb = NULL;
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +	struct extent_io_tree *tree = &eb_info->io_tree;
> +	struct extent_buffer *eb;
>  	struct extent_page_data epd = {
>  		.bio = NULL,
>  		.tree = tree,
> @@ -3772,16 +3827,16 @@ int btree_write_cache_pages(struct address_space *mapping,
>  	int ret = 0;
>  	int done = 0;
>  	int nr_to_write_done = 0;
> -	struct pagevec pvec;
> -	int nr_pages;
> +	struct extent_buffer *ebs[EBVEC_SIZE];
> +	int nr_ebs;
>  	pgoff_t index;
>  	pgoff_t end;		/* Inclusive */
> +	pgoff_t done_index = 0;
>  	int scanned = 0;
>  	int tag;
> 
> -	pagevec_init(&pvec, 0);
>  	if (wbc->range_cyclic) {
> -		index = mapping->writeback_index; /* Start from prev offset */
> +		index = eb_info->writeback_index; /* Start from prev offset */
>  		end = -1;
>  	} else {
>  		index = wbc->range_start >> PAGE_SHIFT;
> @@ -3794,53 +3849,27 @@ int btree_write_cache_pages(struct address_space *mapping,
>  		tag = PAGECACHE_TAG_DIRTY;
>  retry:
>  	if (wbc->sync_mode == WB_SYNC_ALL)
> -		tag_pages_for_writeback(mapping, index, end);
> +		tag_ebs_for_writeback(fs_info->eb_info, index, end);
>  	while (!done && !nr_to_write_done && (index <= end) &&
> -	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
> -			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
> +	       (nr_ebs = eb_lookup_tag(eb_info, ebs, &index, tag,
> +			min(end - index, (pgoff_t)EBVEC_SIZE-1) + 1))) {
>  		unsigned i;
> 
>  		scanned = 1;
> -		for (i = 0; i < nr_pages; i++) {
> -			struct page *page = pvec.pages[i];
> -
> -			if (!PagePrivate(page))
> -				continue;
> -
> -			if (!wbc->range_cyclic && page->index > end) {
> -				done = 1;
> -				break;
> -			}
> -
> -			spin_lock(&mapping->private_lock);
> -			if (!PagePrivate(page)) {
> -				spin_unlock(&mapping->private_lock);
> -				continue;
> -			}
> -
> -			eb = (struct extent_buffer *)page->private;
> -
> -			/*
> -			 * Shouldn't happen and normally this would be a BUG_ON
> -			 * but no sense in crashing the users box for something
> -			 * we can survive anyway.
> -			 */
> -			if (WARN_ON(!eb)) {
> -				spin_unlock(&mapping->private_lock);
> +		for (i = 0; i < nr_ebs; i++) {
> +			eb = ebs[i];
> +			if (done) {
> +				free_extent_buffer(eb);
>  				continue;
>  			}
> 
> -			if (eb == prev_eb) {
> -				spin_unlock(&mapping->private_lock);
> +			if (!wbc->range_cyclic && eb->start > wbc->range_end) {
> +				done = 1;
> +				free_extent_buffer(eb);
>  				continue;
>  			}
> 
> -			ret = atomic_inc_not_zero(&eb->refs);
> -			spin_unlock(&mapping->private_lock);
> -			if (!ret)
> -				continue;
> -
> -			prev_eb = eb;
> +			done_index = eb_index(eb);
>  			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
>  			if (!ret) {
>  				free_extent_buffer(eb);
> @@ -3848,12 +3877,11 @@ retry:
>  			}
> 
>  			ret = write_one_eb(eb, fs_info, wbc, &epd);
> +			free_extent_buffer(eb);
>  			if (ret) {
>  				done = 1;
> -				free_extent_buffer(eb);
> -				break;
> +				continue;
>  			}
> -			free_extent_buffer(eb);
> 
>  			/*
>  			 * the filesystem may choose to bump up nr_to_write.
> @@ -3862,7 +3890,6 @@ retry:
>  			 */
>  			nr_to_write_done = wbc->nr_to_write <= 0;
>  		}
> -		pagevec_release(&pvec);
>  		cond_resched();
>  	}
>  	if (!scanned && !done) {
> @@ -3874,10 +3901,77 @@ retry:
>  		index = 0;
>  		goto retry;
>  	}
> +	if (wbc->range_cyclic)
> +		fs_info->eb_info->writeback_index = done_index;
>  	flush_write_bio(&epd);
>  	return ret;
>  }
> 
> +void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +	btree_write_cache_pages(fs_info, wbc);
> +}
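Since btrfs_write_ebs() takes a super_block and a wbc, it is presumably the
fs-side implementation of the write_metadata() entry point referenced in
set_btree_ioerr()'s comment (added earlier in this series), wired up in
super.c next to the cached-object shrinker hooks. A hypothetical sketch;
nr_cached_objects/free_cached_objects are existing super_operations
members, the other names here are my assumptions:

    static const struct super_operations btrfs_super_ops = {
            /* ... existing members ... */
            .nr_cached_objects      = btrfs_nr_ebs,
            .free_cached_objects    = btrfs_free_ebs,   /* assumed name */
            .write_metadata         = btrfs_write_ebs,  /* new op, assumed */
    };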
> +
> +static int __btree_write_range(struct btrfs_fs_info *fs_info, u64 start,
> +			       u64 end, int sync_mode)
> +{
> +	struct writeback_control wbc = {
> +		.sync_mode = sync_mode,
> +		.nr_to_write = LONG_MAX,
> +		.range_start = start,
> +		.range_end = end,
> +	};
> +
> +	return btree_write_cache_pages(fs_info, &wbc);
> +}
> +
> +void btree_flush(struct btrfs_fs_info *fs_info)
> +{
> +	__btree_write_range(fs_info, 0, (u64)-1, WB_SYNC_NONE);
> +}
> +
> +int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
> +{
> +	return __btree_write_range(fs_info, start, end, WB_SYNC_ALL);
> +}
> +
> +int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end)
> +{
> +	struct extent_buffer *ebs[EBVEC_SIZE];
> +	pgoff_t index = start >> PAGE_SHIFT;
> +	pgoff_t end_index = end >> PAGE_SHIFT;
> +	unsigned nr_ebs;
> +	int ret = 0;
> +
> +	if (end < start)
> +		return ret;
> +
> +	while ((index <= end) &&
> +	       (nr_ebs = eb_lookup_tag(fs_info->eb_info, ebs, &index,
> +				       PAGECACHE_TAG_WRITEBACK,
> +				       min(end_index - index,
> +					   (pgoff_t)EBVEC_SIZE-1) + 1)) != 0) {
> +		unsigned i;
> +
> +		for (i = 0; i < nr_ebs; i++) {
> +			struct extent_buffer *eb = ebs[i];
> +
> +			if (eb->start > end) {
> +				free_extent_buffer(eb);
> +				continue;
> +			}
> +
> +			wait_on_extent_buffer_writeback(eb);
> +			if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
> +				ret = -EIO;
> +			free_extent_buffer(eb);
> +		}
> +		cond_resched();
> +	}
> +	return ret;
> +}
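Callers that used to filemap_write_and_wait() on the btree inode now pair
these two; btree_wait_range() stands in for filemap_fdatawait_range()'s
AS_EIO reporting by walking the WRITEBACK tag and collecting
EXTENT_BUFFER_WRITE_ERR per buffer. The fail_cleaner path in open_ctree()
above already shows the pattern:

    btree_write_range(fs_info, 0, (u64)-1);     /* WB_SYNC_ALL writeout */
    btree_wait_range(fs_info, 0, (u64)-1);      /* wait, gather -EIO */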
> +
>  /**
>   * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
>   * @mapping: address space structure to write
> @@ -4558,7 +4652,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
>  {
>  	unsigned long index;
>  	struct page *page;
> -	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
> 
>  	BUG_ON(extent_buffer_under_io(eb));
> 
> @@ -4566,39 +4659,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
>  	if (index == 0)
>  		return;
> 
> +	ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
>  	do {
>  		index--;
>  		page = eb->pages[index];
>  		if (!page)
>  			continue;
> -		if (mapped)
> -			spin_lock(&page->mapping->private_lock);
> -		/*
> -		 * We do this since we'll remove the pages after we've
> -		 * removed the eb from the radix tree, so we could race
> -		 * and have this page now attached to the new eb.  So
> -		 * only clear page_private if it's still connected to
> -		 * this eb.
> -		 */
> -		if (PagePrivate(page) &&
> -		    page->private == (unsigned long)eb) {
> -			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
> -			BUG_ON(PageDirty(page));
> -			BUG_ON(PageWriteback(page));
> -			/*
> -			 * We need to make sure we haven't be attached
> -			 * to a new eb.
> -			 */
> -			ClearPagePrivate(page);
> -			set_page_private(page, 0);
> -			/* One for the page private */
> -			put_page(page);
> -		}
> +		ASSERT(PagePrivate(page));
> +		ASSERT(page->private == (unsigned long)eb);
> +		ClearPagePrivate(page);
> +		set_page_private(page, 0);
> 
> -		if (mapped)
> -			spin_unlock(&page->mapping->private_lock);
> +		/* Once for the page private. */
> +		put_page(page);
> 
> -		/* One for when we allocated the page */
> +		/* Once for the alloc_page. */
>  		put_page(page);
>  	} while (index != 0);
>  }
> @@ -4613,7 +4688,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
>  }
> 
>  static struct extent_buffer *
> -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
> +__alloc_extent_buffer(struct btrfs_eb_info *eb_info, u64 start,
>  		      unsigned long len)
>  {
>  	struct extent_buffer *eb = NULL;
> @@ -4621,7 +4696,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>  	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
>  	eb->start = start;
>  	eb->len = len;
> -	eb->fs_info = fs_info;
> +	eb->eb_info = eb_info;
>  	eb->bflags = 0;
>  	rwlock_init(&eb->lock);
>  	atomic_set(&eb->write_locks, 0);
> @@ -4633,6 +4708,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
>  	eb->lock_nested = 0;
>  	init_waitqueue_head(&eb->write_lock_wq);
>  	init_waitqueue_head(&eb->read_lock_wq);
> +	INIT_LIST_HEAD(&eb->lru);
> 
>  	btrfs_leak_debug_add(&eb->leak_list, &buffers);
> 
> @@ -4657,7 +4733,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>  	struct extent_buffer *new;
>  	unsigned long num_pages = num_extent_pages(src->start, src->len);
> 
> -	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
> +	new = __alloc_extent_buffer(src->eb_info, src->start, src->len);
>  	if (new == NULL)
>  		return NULL;
> 
> @@ -4668,8 +4744,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>  			return NULL;
>  		}
>  		attach_extent_buffer_page(new, p);
> -		WARN_ON(PageDirty(p));
> -		SetPageUptodate(p);
>  		new->pages[i] = p;
>  	}
> 
> @@ -4680,8 +4754,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
>  	return new;
>  }
> 
> -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> -						  u64 start, unsigned long len)
> +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
> +						u64 start, unsigned long len)
>  {
>  	struct extent_buffer *eb;
>  	unsigned long num_pages;
> @@ -4689,7 +4763,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> 
>  	num_pages = num_extent_pages(start, len);
> 
> -	eb = __alloc_extent_buffer(fs_info, start, len);
> +	eb = __alloc_extent_buffer(eb_info, start, len);
>  	if (!eb)
>  		return NULL;
> 
> @@ -4697,6 +4771,7 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
>  		eb->pages[i] = alloc_page(GFP_NOFS);
>  		if (!eb->pages[i])
>  			goto err;
> +		attach_extent_buffer_page(eb, eb->pages[i]);
>  	}
>  	set_extent_buffer_uptodate(eb);
>  	btrfs_set_header_nritems(eb, 0);
> @@ -4704,30 +4779,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> 
>  	return eb;
>  err:
> -	for (; i > 0; i--)
> -		__free_page(eb->pages[i - 1]);
> -	__free_extent_buffer(eb);
> +	btrfs_release_extent_buffer(eb);
>  	return NULL;
>  }
> 
> -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> -						u64 start, u32 nodesize)
> -{
> -	unsigned long len;
> -
> -	if (!fs_info) {
> -		/*
> -		 * Called only from tests that don't always have a fs_info
> -		 * available
> -		 */
> -		len = nodesize;
> -	} else {
> -		len = fs_info->tree_root->nodesize;
> -	}
> -
> -	return __alloc_dummy_extent_buffer(fs_info, start, len);
> -}
> -
>  static void check_buffer_tree_ref(struct extent_buffer *eb)
>  {
>  	int refs;
> @@ -4777,13 +4832,13 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
>  	}
>  }
> 
> -struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
> +struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
>  					 u64 start)
>  {
>  	struct extent_buffer *eb;
> 
>  	rcu_read_lock();
> -	eb = radix_tree_lookup(&fs_info->buffer_radix,
> +	eb = radix_tree_lookup(&eb_info->buffer_radix,
>  			       start >> PAGE_SHIFT);
>  	if (eb && atomic_inc_not_zero(&eb->refs)) {
>  		rcu_read_unlock();
> @@ -4815,30 +4870,30 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
>  }
> 
>  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> -struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
> -					u64 start, u32 nodesize)
> +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
> +					       u64 start, u32 nodesize)
>  {
>  	struct extent_buffer *eb, *exists = NULL;
>  	int ret;
> 
> -	eb = find_extent_buffer(fs_info, start);
> +	eb = find_extent_buffer(eb_info, start);
>  	if (eb)
>  		return eb;
> -	eb = alloc_dummy_extent_buffer(fs_info, start, nodesize);
> +	eb = alloc_dummy_extent_buffer(eb_info, start, nodesize);
>  	if (!eb)
>  		return NULL;
> -	eb->fs_info = fs_info;
> +	eb->eb_info = eb_info;
>  again:
>  	ret = radix_tree_preload(GFP_NOFS);
>  	if (ret)
>  		goto free_eb;
> -	spin_lock(&fs_info->buffer_lock);
> -	ret = radix_tree_insert(&fs_info->buffer_radix,
> +	spin_lock_irq(&eb_info->buffer_lock);
> +	ret = radix_tree_insert(&eb_info->buffer_radix,
>  				start >> PAGE_SHIFT, eb);
> -	spin_unlock(&fs_info->buffer_lock);
> +	spin_unlock_irq(&eb_info->buffer_lock);
>  	radix_tree_preload_end();
>  	if (ret == -EEXIST) {
> -		exists = find_extent_buffer(fs_info, start);
> +		exists = find_extent_buffer(eb_info, start);
>  		if (exists)
>  			goto free_eb;
>  		else
> @@ -4854,6 +4909,7 @@ again:
>  	 * bump the ref count again.
>  	 */
>  	atomic_inc(&eb->refs);
> +	set_extent_buffer_uptodate(eb);
>  	return eb;
>  free_eb:
>  	btrfs_release_extent_buffer(eb);
> @@ -4867,12 +4923,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  	unsigned long len = fs_info->tree_root->nodesize;
>  	unsigned long num_pages = num_extent_pages(start, len);
>  	unsigned long i;
> -	unsigned long index = start >> PAGE_SHIFT;
>  	struct extent_buffer *eb;
>  	struct extent_buffer *exists = NULL;
>  	struct page *p;
> -	struct address_space *mapping = fs_info->btree_inode->i_mapping;
> -	int uptodate = 1;
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +//	struct zone *last_zone = NULL;
> +//	struct pg_data_t *last_pgdata = NULL;
>  	int ret;
> 
>  	if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) {
> @@ -4880,62 +4936,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  		return ERR_PTR(-EINVAL);
>  	}
> 
> -	eb = find_extent_buffer(fs_info, start);
> +	eb = find_extent_buffer(eb_info, start);
>  	if (eb)
>  		return eb;
> 
> -	eb = __alloc_extent_buffer(fs_info, start, len);
> +	eb = __alloc_extent_buffer(eb_info, start, len);
>  	if (!eb)
>  		return ERR_PTR(-ENOMEM);
> 
> -	for (i = 0; i < num_pages; i++, index++) {
> -		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
> +	for (i = 0; i < num_pages; i++) {
> +		p = alloc_page(GFP_NOFS|__GFP_NOFAIL);
>  		if (!p) {
>  			exists = ERR_PTR(-ENOMEM);
>  			goto free_eb;
>  		}
> 
> -		spin_lock(&mapping->private_lock);
> -		if (PagePrivate(p)) {
> -			/*
> -			 * We could have already allocated an eb for this page
> -			 * and attached one so lets see if we can get a ref on
> -			 * the existing eb, and if we can we know it's good and
> -			 * we can just return that one, else we know we can just
> -			 * overwrite page->private.
> -			 */
> -			exists = (struct extent_buffer *)p->private;
> -			if (atomic_inc_not_zero(&exists->refs)) {
> -				spin_unlock(&mapping->private_lock);
> -				unlock_page(p);
> -				put_page(p);
> -				mark_extent_buffer_accessed(exists, p);
> -				goto free_eb;
> -			}
> -			exists = NULL;
> -
> -			/*
> -			 * Do this so attach doesn't complain and we need to
> -			 * drop the ref the old guy had.
> -			 */
> -			ClearPagePrivate(p);
> -			WARN_ON(PageDirty(p));
> -			put_page(p);
> -		}
> +		/*
> +		 * If our pages span zones or numa nodes we have to do
> +		 * dirty/writeback accounting per page, otherwise we can do it
> +		 * in bulk and save us some looping.
> +		 *
> +		if (!last_zone)
> +			last_zone = page_zone(p);
> +		if (!last_pgdata)
> +			last_pgdata = page_pgdata(p);
> +		if (last_zone != page_zone(p) || last_pgdata != page_pgdata(p))
> +			set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);
> +		*/
>  		attach_extent_buffer_page(eb, p);
> -		spin_unlock(&mapping->private_lock);
> -		WARN_ON(PageDirty(p));
>  		eb->pages[i] = p;
> -		if (!PageUptodate(p))
> -			uptodate = 0;
> -
> -		/*
> -		 * see below about how we avoid a nasty race with release page
> -		 * and why we unlock later
> -		 */
>  	}
> -	if (uptodate)
> -		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>  again:
>  	ret = radix_tree_preload(GFP_NOFS);
>  	if (ret) {
> @@ -4943,13 +4973,13 @@ again:
>  		goto free_eb;
>  	}
> 
> -	spin_lock(&fs_info->buffer_lock);
> -	ret = radix_tree_insert(&fs_info->buffer_radix,
> +	spin_lock_irq(&eb_info->buffer_lock);
> +	ret = radix_tree_insert(&eb_info->buffer_radix,
>  				start >> PAGE_SHIFT, eb);
> -	spin_unlock(&fs_info->buffer_lock);
> +	spin_unlock_irq(&eb_info->buffer_lock);
>  	radix_tree_preload_end();
>  	if (ret == -EEXIST) {
> -		exists = find_extent_buffer(fs_info, start);
> +		exists = find_extent_buffer(eb_info, start);
>  		if (exists)
>  			goto free_eb;
>  		else
> @@ -4959,31 +4989,10 @@ again:
>  	check_buffer_tree_ref(eb);
>  	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
> 
> -	/*
> -	 * there is a race where release page may have
> -	 * tried to find this extent buffer in the radix
> -	 * but failed.  It will tell the VM it is safe to
> -	 * reclaim the, and it will clear the page private bit.
> -	 * We must make sure to set the page private bit properly
> -	 * after the extent buffer is in the radix tree so
> -	 * it doesn't get lost
> -	 */
> -	SetPageChecked(eb->pages[0]);
> -	for (i = 1; i < num_pages; i++) {
> -		p = eb->pages[i];
> -		ClearPageChecked(p);
> -		unlock_page(p);
> -	}
> -	unlock_page(eb->pages[0]);
>  	return eb;
> 
>  free_eb:
>  	WARN_ON(!atomic_dec_and_test(&eb->refs));
> -	for (i = 0; i < num_pages; i++) {
> -		if (eb->pages[i])
> -			unlock_page(eb->pages[i]);
> -	}
> -
>  	btrfs_release_extent_buffer(eb);
>  	return exists;
>  }
> @@ -4999,17 +5008,19 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
>  /* Expects to have eb->eb_lock already held */
>  static int release_extent_buffer(struct extent_buffer *eb)
>  {
> +	struct btrfs_eb_info *eb_info = eb->eb_info;
> +
>  	WARN_ON(atomic_read(&eb->refs) == 0);
>  	if (atomic_dec_and_test(&eb->refs)) {
> +		if (eb_info)
> +			list_lru_del(&eb_info->lru_list, &eb->lru);
>  		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
> -			struct btrfs_fs_info *fs_info = eb->fs_info;
> -
>  			spin_unlock(&eb->refs_lock);
> 
> -			spin_lock(&fs_info->buffer_lock);
> -			radix_tree_delete(&fs_info->buffer_radix,
> -					  eb->start >> PAGE_SHIFT);
> -			spin_unlock(&fs_info->buffer_lock);
> +			spin_lock_irq(&eb_info->buffer_lock);
> +			radix_tree_delete(&eb_info->buffer_radix,
> +					  eb_index(eb));
> +			spin_unlock_irq(&eb_info->buffer_lock);
>  		} else {
>  			spin_unlock(&eb->refs_lock);
>  		}
> @@ -5024,6 +5035,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
>  #endif
>  		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
>  		return 1;
> +	} else if (eb_info && atomic_read(&eb->refs) == 1) {
> +		list_lru_add(&eb_info->lru_list, &eb->lru);
>  	}
>  	spin_unlock(&eb->refs_lock);
> 
> @@ -5057,10 +5070,6 @@ void free_extent_buffer(struct extent_buffer *eb)
>  	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
>  		atomic_dec(&eb->refs);
> 
> -	/*
> -	 * I know this is terrible, but it's temporary until we stop tracking
> -	 * the uptodate bits and such for the extent buffers.
> -	 */
>  	release_extent_buffer(eb);
>  }
> 
> @@ -5078,82 +5087,163 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
>  	release_extent_buffer(eb);
>  }
> 
> -void clear_extent_buffer_dirty(struct extent_buffer *eb)
> +long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc)
>  {
> -	unsigned long i;
> -	unsigned long num_pages;
> -	struct page *page;
> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> 
> -	num_pages = num_extent_pages(eb->start, eb->len);
> +	return list_lru_shrink_count(&eb_info->lru_list, sc);
> +}
> 
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		if (!PageDirty(page))
> -			continue;
> +static enum lru_status eb_lru_isolate(struct list_head *item,
> +				      struct list_lru_one *lru,
> +				      spinlock_t *lru_lock, void *arg)
> +{
> +	struct list_head *freeable = (struct list_head *)arg;
> +	struct extent_buffer *eb = container_of(item, struct extent_buffer,
> +						lru);
> +	enum lru_status ret;
> +	int refs;
> 
> -		lock_page(page);
> -		WARN_ON(!PagePrivate(page));
> +	if (!spin_trylock(&eb->refs_lock))
> +		return LRU_SKIP;
> 
> -		clear_page_dirty_for_io(page);
> -		spin_lock_irq(&page->mapping->tree_lock);
> -		if (!PageDirty(page)) {
> -			radix_tree_tag_clear(&page->mapping->page_tree,
> -						page_index(page),
> -						PAGECACHE_TAG_DIRTY);
> -		}
> -		spin_unlock_irq(&page->mapping->tree_lock);
> -		ClearPageError(page);
> -		unlock_page(page);
> +	if (extent_buffer_under_io(eb)) {
> +		ret = LRU_ROTATE;
> +		goto out;
> +	}
> +
> +	refs = atomic_read(&eb->refs);
> +	/* We can race with somebody freeing us, just skip if this happens. */
> +	if (refs == 0) {
> +		ret = LRU_SKIP;
> +		goto out;
> +	}
> +
> +	/* Eb is in use, don't kill it. */
> +	if (refs > 1) {
> +		ret = LRU_ROTATE;
> +		goto out;
> +	}
> +
> +	/*
> +	 * If we don't clear the TREE_REF flag then this eb is going to
> +	 * disappear soon anyway.  Otherwise we become responsible for dropping
> +	 * the last ref on this eb and we know it'll survive until we call
> +	 * dispose_list.
> +	 */
> +	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
> +		ret = LRU_SKIP;
> +		goto out;
> +	}
> +	list_lru_isolate_move(lru, &eb->lru, freeable);
> +	ret = LRU_REMOVED;
> +out:
> +	spin_unlock(&eb->refs_lock);
> +	return ret;
> +}
> +
> +static void dispose_list(struct list_head *list)
> +{
> +	struct extent_buffer *eb;
> +
> +	while (!list_empty(list)) {
> +		eb = list_first_entry(list, struct extent_buffer, lru);
> +
> +		spin_lock(&eb->refs_lock);
> +		list_del_init(&eb->lru);
> +		spin_unlock(&eb->refs_lock);
> +		free_extent_buffer(eb);
> +		cond_resched();
>  	}
> +}
> +
> +long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc)
> +{
> +	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> +	struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +	LIST_HEAD(freeable);
> +	long freed;
> +
> +	freed = list_lru_shrink_walk(&eb_info->lru_list, sc, eb_lru_isolate,
> +				     &freeable);
> +	dispose_list(&freeable);
> +	return freed;
> +}
> +
> +#define MAX_EVICT_COUNT 1024
> +void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info)
> +{
> +	LIST_HEAD(freeable);
> +	unsigned long count;
> +
> +	/*
> +	 * Evict in batches so we don't lockup the system trying to evict
> +	 * memory.
> +	 */
> +	do {
> +		count = list_lru_walk(&eb_info->lru_list, eb_lru_isolate,
> +				      &freeable, MAX_EVICT_COUNT);
> +		cond_resched();
> +	} while (count);
> +	dispose_list(&freeable);
> +	synchronize_rcu();
> +}
> +
> +int clear_extent_buffer_dirty(struct extent_buffer *eb)
> +{
> +	struct btrfs_eb_info *eb_info = eb->eb_info;
> +	unsigned long i;
> +	unsigned long num_pages;
> +
> +	if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
> +		return 0;
> +
> +	spin_lock_irq(&eb_info->buffer_lock);
> +	radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> +			     PAGECACHE_TAG_DIRTY);
> +	spin_unlock_irq(&eb_info->buffer_lock);
> +
> +	num_pages = num_extent_pages(eb->start, eb->len);
> +	for (i = 0; i < num_pages; i++)
> +		account_metadata_cleaned(eb->pages[i], &eb_info->fs_info->bdi);
>  	WARN_ON(atomic_read(&eb->refs) == 0);
> +	return 1;
>  }
> 
>  int set_extent_buffer_dirty(struct extent_buffer *eb)
>  {
> +	struct btrfs_eb_info *eb_info = eb->eb_info;
>  	unsigned long i;
>  	unsigned long num_pages;
>  	int was_dirty = 0;
> 
>  	check_buffer_tree_ref(eb);
> 
> -	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
> -
> -	num_pages = num_extent_pages(eb->start, eb->len);
>  	WARN_ON(atomic_read(&eb->refs) == 0);
>  	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
> +	if (test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
> +		return 1;
> 
> +	num_pages = num_extent_pages(eb->start, eb->len);
>  	for (i = 0; i < num_pages; i++)
> -		set_page_dirty(eb->pages[i]);
> +		account_metadata_dirtied(eb->pages[i],
> +					 &eb->eb_info->fs_info->bdi);
> +	spin_lock_irq(&eb_info->buffer_lock);
> +	radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
> +			   PAGECACHE_TAG_DIRTY);
> +	spin_unlock_irq(&eb_info->buffer_lock);
>  	return was_dirty;
>  }
> 
>  void clear_extent_buffer_uptodate(struct extent_buffer *eb)
>  {
> -	unsigned long i;
> -	struct page *page;
> -	unsigned long num_pages;
> -
>  	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -	num_pages = num_extent_pages(eb->start, eb->len);
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		if (page)
> -			ClearPageUptodate(page);
> -	}
>  }
> 
>  void set_extent_buffer_uptodate(struct extent_buffer *eb)
>  {
> -	unsigned long i;
> -	struct page *page;
> -	unsigned long num_pages;
> -
>  	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -	num_pages = num_extent_pages(eb->start, eb->len);
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		SetPageUptodate(page);
> -	}
>  }
> 
>  int extent_buffer_uptodate(struct extent_buffer *eb)
> @@ -5161,103 +5251,166 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
>  	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
>  }
> 
> -int read_extent_buffer_pages(struct extent_io_tree *tree,
> -			     struct extent_buffer *eb, int wait,
> -			     get_extent_t *get_extent, int mirror_num)
> +static void end_bio_extent_buffer_readpage(struct bio *bio)
>  {
> +	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> +	struct extent_io_tree *tree = NULL;
> +	struct bio_vec *bvec;
> +	u64 unlock_start = 0, unlock_len = 0;
> +	int mirror_num = io_bio->mirror_num;
> +	int uptodate = !bio->bi_error;
> +	int i, ret;
> +
> +	bio_for_each_segment_all(bvec, bio, i) {
> +		struct page *page = bvec->bv_page;
> +		struct btrfs_eb_info *eb_info;
> +		struct extent_buffer *eb;
> +
> +		eb = (struct extent_buffer *)page->private;
> +		if (WARN_ON(!eb))
> +			continue;
> +
> +		eb_info = eb->eb_info;
> +		if (!tree)
> +			tree = &eb_info->io_tree;
> +		if (uptodate) {
> +			/*
> +			 * btree_readpage_end_io_hook doesn't care about
> +			 * start/end so just pass 0.  We'll kill this later.
> +			 */
> +			ret = tree->ops->readpage_end_io_hook(io_bio, 0,
> +							      page, 0, 0,
> +							      mirror_num);
> +			if (ret) {
> +				uptodate = 0;
> +			} else {
> +				u64 start = eb->start;
> +				int c, num_pages;
> +
> +				num_pages = num_extent_pages(eb->start,
> +							     eb->len);
> +				for (c = 0; c < num_pages; c++) {
> +					if (eb->pages[c] == page)
> +						break;
> +					start += PAGE_SIZE;
> +				}
> +				clean_io_failure(eb_info->fs_info,
> +						 &eb_info->io_failure_tree,
> +						 tree, start, page, 0, 0);
> +			}
> +		}
> +		/*
> +		 * We never fix anything in btree_io_failed_hook.
> +		 *
> +		 * TODO: rework the io failed hook to not assume we can fix
> +		 * anything.
> +		 */
> +		if (!uptodate)
> +			tree->ops->readpage_io_failed_hook(page, mirror_num);
> +
> +		if (unlock_start == 0) {
> +			unlock_start = eb->start;
> +			unlock_len = PAGE_SIZE;
> +		} else {
> +			unlock_len += PAGE_SIZE;
> +		}
> +	}
> +
> +	if (unlock_start)
> +		unlock_extent(tree, unlock_start,
> +			      unlock_start + unlock_len - 1);
> +	if (io_bio->end_io)
> +		io_bio->end_io(io_bio, bio->bi_error);
> +	bio_put(bio);
> +}
> +
> +int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
> +			     int mirror_num)
> +{
> +	struct btrfs_eb_info *eb_info = eb->eb_info;
> +	struct extent_io_tree *io_tree = &eb_info->io_tree;
> +	struct block_device *bdev = eb_info->fs_info->fs_devices->latest_bdev;
> +	struct bio *bio = NULL;
> +	u64 offset = eb->start;
> +	u64 unlock_start = 0, unlock_len = 0;
>  	unsigned long i;
>  	struct page *page;
>  	int err;
>  	int ret = 0;
> -	int locked_pages = 0;
> -	int all_uptodate = 1;
>  	unsigned long num_pages;
> -	unsigned long num_reads = 0;
> -	struct bio *bio = NULL;
> -	unsigned long bio_flags = 0;
> 
>  	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
>  		return 0;
> 
> -	num_pages = num_extent_pages(eb->start, eb->len);
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		if (wait == WAIT_NONE) {
> -			if (!trylock_page(page))
> -				goto unlock_exit;
> -		} else {
> -			lock_page(page);
> -		}
> -		locked_pages++;
> -		if (!PageUptodate(page)) {
> -			num_reads++;
> -			all_uptodate = 0;
> -		}
> -	}
> -	if (all_uptodate) {
> -		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> -		goto unlock_exit;
> +	if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
> +		if (wait != WAIT_COMPLETE)
> +			return 0;
> +		wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
> +			       TASK_UNINTERRUPTIBLE);
> +		if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> +			ret = -EIO;
> +		return ret;
>  	}
> 
> +	lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
> +	num_pages = num_extent_pages(eb->start, eb->len);
>  	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
>  	eb->read_mirror = 0;
> -	atomic_set(&eb->io_pages, num_reads);
> +	atomic_set(&eb->io_pages, num_pages);
>  	for (i = 0; i < num_pages; i++) {
>  		page = eb->pages[i];
> -
> -		if (!PageUptodate(page)) {
> -			if (ret) {
> -				atomic_dec(&eb->io_pages);
> -				unlock_page(page);
> -				continue;
> +		if (ret) {
> +			unlock_len += PAGE_SIZE;
> +			if (atomic_dec_and_test(&eb->io_pages)) {
> +				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +				smp_mb__after_atomic();
> +				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>  			}
> +			continue;
> +		}
> 
> -			ClearPageError(page);
> -			err = __extent_read_full_page(tree, page,
> -						      get_extent, &bio,
> -						      mirror_num, &bio_flags,
> -						      REQ_META);
> -			if (err) {
> -				ret = err;
> -				/*
> -				 * We use &bio in above __extent_read_full_page,
> -				 * so we ensure that if it returns error, the
> -				 * current page fails to add itself to bio and
> -				 * it's been unlocked.
> -				 *
> -				 * We must dec io_pages by ourselves.
> -				 */
> -				atomic_dec(&eb->io_pages);
> +		err = submit_extent_page(REQ_OP_READ, REQ_META, io_tree, NULL,
> +					 page, offset >> 9, PAGE_SIZE, 0, bdev,
> +					 &bio, -1,
> +					 end_bio_extent_buffer_readpage,
> +					 mirror_num, 0, 0, false);
> +		if (err) {
> +			ret = err;
> +			/*
> +			 * We use &bio in above submit_extent_page
> +			 * so we ensure that if it returns error, the
> +			 * current page fails to add itself to bio and
> +			 * it's been unlocked.
> +			 *
> +			 * We must dec io_pages by ourselves.
> +			 */
> +			if (atomic_dec_and_test(&eb->io_pages)) {
> +				clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> +				smp_mb__after_atomic();
> +				wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
>  			}
> -		} else {
> -			unlock_page(page);
> +			unlock_start = eb->start;

Josef, IMHO "unlock_start" should be set to "offset" here. Let's say we
have 4 pages making up a metadata block and the first page was
successfully added to a bio. Assume that adding the second page to the
bio results in submit_extent_page() returning an error. In this
scenario, end_bio_extent_buffer_readpage() owns the responsibility of
unlocking the first 4k range in the io tree. However, with
"unlock_start" set to "eb->start", read_extent_buffer_pages() may also
end up unlocking that same first 4k range, i.e. a double unlock.
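
Something along these lines perhaps (an untested sketch, not a tested
fix):

-			unlock_start = eb->start;
+			unlock_start = offset;

Since "offset" is advanced by PAGE_SIZE at the end of every loop
iteration, at the point of failure it refers to the start of the page
that could not be added to the bio. With that change,
end_bio_extent_buffer_readpage() unlocks only the ranges of the pages
that were actually submitted, and read_extent_buffer_pages() unlocks
the remainder, so the two ranges never overlap.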

> +			unlock_len = PAGE_SIZE;
>  		}
> +		offset += PAGE_SIZE;
>  	}
> 
>  	if (bio) {
> -		err = submit_one_bio(bio, mirror_num, bio_flags);
> +		err = submit_one_bio(bio, mirror_num, 0);
>  		if (err)
>  			return err;
>  	}
> 
> +	if (ret && unlock_start)
> +		unlock_extent(io_tree, unlock_start,
> +			      unlock_start + unlock_len - 1);
>  	if (ret || wait != WAIT_COMPLETE)
>  		return ret;
> 
> -	for (i = 0; i < num_pages; i++) {
> -		page = eb->pages[i];
> -		wait_on_page_locked(page);
> -		if (!PageUptodate(page))
> -			ret = -EIO;
> -	}
> -
> -	return ret;
> -
> -unlock_exit:
> -	while (locked_pages > 0) {
> -		locked_pages--;
> -		page = eb->pages[locked_pages];
> -		unlock_page(page);
> -	}
> +	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
> +		       TASK_UNINTERRUPTIBLE);
> +	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> +		ret = -EIO;
>  	return ret;
>  }
> 

-- 
chandan

