Re: [RFC PATCH V11 02/21] Btrfs: subpagesize-blocksize: Fix whole page write.

linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Liu Bo <bo.li.liu@oracle.com>
To: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Cc: clm@fb.com, jbacik@fb.com, dsterba@suse.cz,
	linux-btrfs@vger.kernel.org, chandan@mykolab.com
Subject: Re: [RFC PATCH V11 02/21] Btrfs: subpagesize-blocksize: Fix whole page write.
Date: Fri, 26 Jun 2015 17:50:54 +0800	[thread overview]
Message-ID: <20150626095052.GB16732@localhost.localdomain> (raw)
In-Reply-To: <1433172176-8742-3-git-send-email-chandan@linux.vnet.ibm.com>

On Mon, Jun 01, 2015 at 08:52:37PM +0530, Chandan Rajendra wrote:
> For the subpagesize-blocksize scenario, a page can contain multiple
> blocks. In such cases, this patch handles writing data to files.
> 
> Also, When setting EXTENT_DELALLOC, we no longer set EXTENT_UPTODATE bit on
> the extent_io_tree since uptodate status is being tracked by the bitmap
> pointed to by page->private.

To be honest, I'm not sure why we set the EXTENT_UPTODATE bit for data, since
we don't check for that bit at all for now. Correct me if I'm wrong.

> 
> Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
> ---
>  fs/btrfs/extent_io.c | 141 +++++++++++++++++++++++----------------------------
>  fs/btrfs/file.c      |  16 ++++++
>  fs/btrfs/inode.c     |  58 ++++++++++++++++-----
>  3 files changed, 125 insertions(+), 90 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d37badb..3736ab5 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1283,9 +1283,8 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
>  int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
>  			struct extent_state **cached_state, gfp_t mask)
>  {
> -	return set_extent_bit(tree, start, end,
> -			      EXTENT_DELALLOC | EXTENT_UPTODATE,
> -			      NULL, cached_state, mask);
> +	return set_extent_bit(tree, start, end, EXTENT_DELALLOC,
> +			NULL, cached_state, mask);
>  }
>  
>  int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
> @@ -1498,25 +1497,6 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
>  	return 0;
>  }
>  
> -/*
> - * helper function to set both pages and extents in the tree writeback
> - */
> -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
> -{
> -	unsigned long index = start >> PAGE_CACHE_SHIFT;
> -	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
> -	struct page *page;
> -
> -	while (index <= end_index) {
> -		page = find_get_page(tree->mapping, index);
> -		BUG_ON(!page); /* Pages should be in the extent_io_tree */
> -		set_page_writeback(page);
> -		page_cache_release(page);
> -		index++;
> -	}
> -	return 0;
> -}
> -
>  /* find the first state struct with 'bits' set after 'start', and
>   * return it.  tree->lock must be held.  NULL will returned if
>   * nothing was found after 'start'
> @@ -2080,6 +2060,14 @@ static int page_read_complete(struct page *page)
>  	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
>  }
>  
> +static int page_write_complete(struct page *page)
> +{
> +	u64 start = page_offset(page);
> +	u64 end = start + PAGE_CACHE_SIZE - 1;
> +
> +	return !test_page_blks_state(page, BLK_STATE_IO, start, end, 0);
> +}
> +
>  int free_io_failure(struct inode *inode, struct io_failure_record *rec)
>  {
>  	int ret;
> @@ -2575,38 +2563,37 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
>   */
>  static void end_bio_extent_writepage(struct bio *bio, int err)
>  {
> +	struct btrfs_page_private *pg_private;
>  	struct bio_vec *bvec;
> +	unsigned long flags;
>  	u64 start;
>  	u64 end;
> +	int clear_writeback;
>  	int i;
>  
>  	bio_for_each_segment_all(bvec, bio, i) {
>  		struct page *page = bvec->bv_page;
>  
> -		/* We always issue full-page reads, but if some block
> -		 * in a page fails to read, blk_update_request() will
> -		 * advance bv_offset and adjust bv_len to compensate.
> -		 * Print a warning for nonzero offsets, and an error
> -		 * if they don't add up to a full page.  */
> -		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
> -			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
> -				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
> -				   "partial page write in btrfs with offset %u and length %u",
> -					bvec->bv_offset, bvec->bv_len);
> -			else
> -				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
> -				   "incomplete page write in btrfs with offset %u and "
> -				   "length %u",
> -					bvec->bv_offset, bvec->bv_len);
> -		}
> +		start = page_offset(page) + bvec->bv_offset;
> +		end = start + bvec->bv_len - 1;
>  
> -		start = page_offset(page);
> -		end = start + bvec->bv_offset + bvec->bv_len - 1;
> +		pg_private = (struct btrfs_page_private *)page->private;
> +
> +		spin_lock_irqsave(&pg_private->io_lock, flags);
>  
> -		if (end_extent_writepage(page, err, start, end))
> +		if (end_extent_writepage(page, err, start, end)) {
> +			spin_unlock_irqrestore(&pg_private->io_lock, flags);
>  			continue;
> +		}
>  
> -		end_page_writeback(page);
> +		clear_page_blks_state(page, 1 << BLK_STATE_IO, start, end);
> +
> +		clear_writeback = page_write_complete(page);
> +
> +		spin_unlock_irqrestore(&pg_private->io_lock, flags);
> +
> +		if (clear_writeback)
> +			end_page_writeback(page);
>  	}
>  
>  	bio_put(bio);
> @@ -3417,10 +3404,9 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  	u64 block_start;
>  	u64 iosize;
>  	sector_t sector;
> -	struct extent_state *cached_state = NULL;
>  	struct extent_map *em;
>  	struct block_device *bdev;
> -	size_t pg_offset = 0;
> +	size_t pg_offset;
>  	size_t blocksize;
>  	int ret = 0;
>  	int nr = 0;
> @@ -3467,8 +3453,16 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  							 page_end, NULL, 1);
>  			break;
>  		}
> -		em = epd->get_extent(inode, page, pg_offset, cur,
> -				     end - cur + 1, 1);
> +
> +		pg_offset = cur & (PAGE_CACHE_SIZE - 1);
> +
> +		if (!test_page_blks_state(page, BLK_STATE_DIRTY, cur,
> +						cur + blocksize - 1, 1)) {
> +			cur += blocksize;
> +			continue;
> +		}

If we don't check this, the get_extent() call below will return a hole
(block_start == EXTENT_MAP_HOLE) and we can still move on to the next block,
so we would not need to keep maintaining this BLK_STATE_DIRTY bit.

> +
> +		em = epd->get_extent(inode, page, pg_offset, cur, blocksize, 1);
>  		if (IS_ERR_OR_NULL(em)) {
>  			SetPageError(page);
>  			ret = PTR_ERR_OR_ZERO(em);
> @@ -3479,7 +3473,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  		em_end = extent_map_end(em);
>  		BUG_ON(em_end <= cur);
>  		BUG_ON(end < cur);
> -		iosize = min(em_end - cur, end - cur + 1);
> +		iosize = min_t(u64, em_end - cur, blocksize);
>  		iosize = ALIGN(iosize, blocksize);

This limits us to one block per loop iteration. If two blocks are contiguous,
it should be fine to write them together.

>  		sector = (em->block_start + extent_offset) >> 9;
>  		bdev = em->bdev;
> @@ -3488,32 +3482,20 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  		free_extent_map(em);
>  		em = NULL;
>  
> -		/*
> -		 * compressed and inline extents are written through other
> -		 * paths in the FS
> -		 */
> -		if (compressed || block_start == EXTENT_MAP_HOLE ||
> -		    block_start == EXTENT_MAP_INLINE) {
> -			/*
> -			 * end_io notification does not happen here for
> -			 * compressed extents
> -			 */
> -			if (!compressed && tree->ops &&
> -			    tree->ops->writepage_end_io_hook)
> -				tree->ops->writepage_end_io_hook(page, cur,
> -							 cur + iosize - 1,
> -							 NULL, 1);
> -			else if (compressed) {
> -				/* we don't want to end_page_writeback on
> -				 * a compressed extent.  this happens
> -				 * elsewhere
> -				 */
> -				nr++;
> -			}
> +		BUG_ON(compressed);
> +		BUG_ON(block_start == EXTENT_MAP_INLINE);
>  
> -			cur += iosize;
> -			pg_offset += iosize;
> -			continue;
> +		if (block_start == EXTENT_MAP_HOLE) {
> +			if (test_page_blks_state(page, BLK_STATE_UPTODATE, cur,
> +							cur + iosize - 1, 1)) {
> +				clear_page_blks_state(page,
> +						1 << BLK_STATE_DIRTY, cur,
> +						cur + iosize - 1);
> +				cur += iosize;
> +				continue;
> +			} else {
> +				BUG();
> +			}
>  		}
>  
>  		if (tree->ops && tree->ops->writepage_io_hook) {
> @@ -3527,7 +3509,13 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  		} else {
>  			unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
>  
> -			set_range_writeback(tree, cur, cur + iosize - 1);
> +			clear_page_blks_state(page, 1 << BLK_STATE_DIRTY, cur,
> +					cur + iosize - 1);
> +			set_page_writeback(page);
> +
> +			set_page_blks_state(page, 1 << BLK_STATE_IO, cur,
> +					cur + iosize - 1);
> +
>  			if (!PageWriteback(page)) {
>  				btrfs_err(BTRFS_I(inode)->root->fs_info,
>  					   "page %lu not writeback, cur %llu end %llu",
> @@ -3542,17 +3530,14 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
>  			if (ret)
>  				SetPageError(page);
>  		}
> -		cur = cur + iosize;
> -		pg_offset += iosize;
> +
> +		cur += iosize;
>  		nr++;
>  	}
>  done:
>  	*nr_ret = nr;
>  
>  done_unlocked:
> -
> -	/* drop our reference on any cached states */
> -	free_extent_state(cached_state);
>  	return ret;
>  }
>  
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 23b6e03..cbe6381 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -495,6 +495,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>  	u64 num_bytes;
>  	u64 start_pos;
>  	u64 end_of_last_block;
> +	u64 start;
> +	u64 end;
> +	u64 page_end;
>  	u64 end_pos = pos + write_bytes;
>  	loff_t isize = i_size_read(inode);
>  
> @@ -507,11 +510,24 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>  	if (err)
>  		return err;
>  
> +	start = start_pos;
> +
>  	for (i = 0; i < num_pages; i++) {
>  		struct page *p = pages[i];
>  		SetPageUptodate(p);
>  		ClearPageChecked(p);
> +
> +		end = page_end = page_offset(p) + PAGE_CACHE_SIZE - 1;
> +
> +		if (i == num_pages - 1)
> +			end = min_t(u64, page_end, end_of_last_block);
> +
> +		set_page_blks_state(p,
> +				1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +				start, end);
>  		set_page_dirty(p);
> +
> +		start = page_end + 1;

This is not the usual way to do it; page_end is unnecessary, and "start += PAGE_CACHE_SIZE" should work.

>  	}
>  
>  	/*
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 8262f83..ac6a3f3 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1995,6 +1995,11 @@ again:
>  	 }
>  
>  	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
> +
> +	set_page_blks_state(page,
> +			1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +			page_start, page_end);
> +
>  	ClearPageChecked(page);
>  	set_page_dirty(page);
>  out:
> @@ -2984,26 +2989,48 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
>  	struct btrfs_ordered_extent *ordered_extent = NULL;
>  	struct btrfs_workqueue *wq;
>  	btrfs_work_func_t func;
> +	u64 ordered_start, ordered_end;
> +	int done;
>  
>  	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
>  
>  	ClearPagePrivate2(page);
> -	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
> -					    end - start + 1, uptodate))
> -		return 0;
> +loop:
> +	ordered_extent = btrfs_lookup_ordered_range(inode, start,
> +						end - start + 1);
> +	if (!ordered_extent)
> +		goto out;
>  
> -	if (btrfs_is_free_space_inode(inode)) {
> -		wq = root->fs_info->endio_freespace_worker;
> -		func = btrfs_freespace_write_helper;
> -	} else {
> -		wq = root->fs_info->endio_write_workers;
> -		func = btrfs_endio_write_helper;
> +	ordered_start = max_t(u64, start, ordered_extent->file_offset);
> +	ordered_end = min_t(u64, end,
> +			ordered_extent->file_offset + ordered_extent->len - 1);
> +
> +	done = btrfs_dec_test_ordered_pending(inode, &ordered_extent,
> +					ordered_start,
> +					ordered_end - ordered_start + 1,
> +					uptodate);
> +	if (done) {
> +		if (btrfs_is_free_space_inode(inode)) {
> +			wq = root->fs_info->endio_freespace_worker;
> +			func = btrfs_freespace_write_helper;
> +		} else {
> +			wq = root->fs_info->endio_write_workers;
> +			func = btrfs_endio_write_helper;
> +		}
> +
> +		btrfs_init_work(&ordered_extent->work, func,
> +				finish_ordered_fn, NULL, NULL);
> +		btrfs_queue_work(wq, &ordered_extent->work);
>  	}
>  
> -	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
> -			NULL);
> -	btrfs_queue_work(wq, &ordered_extent->work);
> +	btrfs_put_ordered_extent(ordered_extent);
> +
> +	start = ordered_end + 1;
> +
> +	if (start < end)
> +		goto loop;
>  
> +out:

I see this patch adds a BUG_ON(block_start == EXTENT_MAP_INLINE) in writepage(),
but I didn't see any code that disables inline data in patch 01 or patch 02.
Anyway, I think we can avoid the above search for ordered extents within a
single page if we enable inline data.

Thanks,

-liubo

>  	return 0;
>  }
>  
> @@ -4601,6 +4628,9 @@ again:
>  		goto out_unlock;
>  	}
>  
> +	set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +			page_start, page_end);
> +
>  	if (offset != PAGE_CACHE_SIZE) {
>  		if (!len)
>  			len = PAGE_CACHE_SIZE - offset;
> @@ -8590,6 +8620,10 @@ again:
>  		ret = VM_FAULT_SIGBUS;
>  		goto out_unlock;
>  	}
> +
> +	set_page_blks_state(page, 1 << BLK_STATE_DIRTY | 1 << BLK_STATE_UPTODATE,
> +			page_start, end);
> +
>  	ret = 0;
>  
>  	/* page is wholly or partially inside EOF */
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

next prev parent reply	other threads:[~2015-06-26  9:51 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-06-01 15:22 [RFC PATCH V11 00/21] Btrfs: Subpagesize-blocksize: Allow I/O on blocks whose size is less than page size Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 01/21] Btrfs: subpagesize-blocksize: Fix whole page read Chandan Rajendra
2015-06-19  4:45   ` Liu Bo
2015-06-19  9:45     ` Chandan Rajendra
2015-06-23  8:37       ` Liu Bo
2016-02-10 10:44         ` David Sterba
2016-02-10 10:39       ` David Sterba
2016-02-11  5:42         ` Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 02/21] Btrfs: subpagesize-blocksize: Fix whole page write Chandan Rajendra
2015-06-26  9:50   ` Liu Bo [this message]
2015-06-29  8:54     ` Chandan Rajendra
2015-07-01 14:27       ` Liu Bo
2015-06-01 15:22 ` [RFC PATCH V11 03/21] Btrfs: subpagesize-blocksize: __btrfs_buffered_write: Reserve/release extents aligned to block size Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 04/21] Btrfs: subpagesize-blocksize: Define extent_buffer_head Chandan Rajendra
2015-07-01 14:33   ` Liu Bo
2015-06-01 15:22 ` [RFC PATCH V11 05/21] Btrfs: subpagesize-blocksize: Read tree blocks whose size is < PAGE_SIZE Chandan Rajendra
2015-07-01 14:40   ` Liu Bo
2015-07-03 10:02     ` Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 06/21] Btrfs: subpagesize-blocksize: Write only dirty extent buffers belonging to a page Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 07/21] Btrfs: subpagesize-blocksize: Allow mounting filesystems where sectorsize != PAGE_SIZE Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 08/21] Btrfs: subpagesize-blocksize: Compute and look up csums based on sectorsized blocks Chandan Rajendra
2015-07-01 14:37   ` Liu Bo
2015-06-01 15:22 ` [RFC PATCH V11 09/21] Btrfs: subpagesize-blocksize: Direct I/O read: Work " Chandan Rajendra
2015-07-01 14:45   ` Liu Bo
2015-07-03 10:05     ` Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 10/21] Btrfs: subpagesize-blocksize: fallocate: Work with sectorsized units Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 11/21] Btrfs: subpagesize-blocksize: btrfs_page_mkwrite: Reserve space in " Chandan Rajendra
2015-07-06  3:18   ` Liu Bo
2015-06-01 15:22 ` [RFC PATCH V11 12/21] Btrfs: subpagesize-blocksize: Search for all ordered extents that could span across a page Chandan Rajendra
2015-07-01 14:47   ` Liu Bo
2015-07-03 10:08     ` Chandan Rajendra
2015-07-06  3:17       ` Liu Bo
2015-07-06 10:49         ` Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 13/21] Btrfs: subpagesize-blocksize: Deal with partial ordered extent allocations Chandan Rajendra
2015-07-06 10:06   ` Liu Bo
2015-07-07 13:38     ` Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 14/21] Btrfs: subpagesize-blocksize: Explicitly Track I/O status of blocks of an ordered extent Chandan Rajendra
2015-07-20  8:34   ` Liu Bo
2015-07-20 12:54     ` Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 15/21] Btrfs: subpagesize-blocksize: Revert commit fc4adbff823f76577ece26dcb88bf6f8392dbd43 Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 16/21] Btrfs: subpagesize-blocksize: Prevent writes to an extent buffer when PG_writeback flag is set Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 17/21] Btrfs: subpagesize-blocksize: Use (eb->start, seq) as search key for tree modification log Chandan Rajendra
2015-07-20 14:46   ` Liu Bo
2015-06-01 15:22 ` [RFC PATCH V11 18/21] Btrfs: subpagesize-blocksize: btrfs_submit_direct_hook: Handle map_length < bio vector length Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 19/21] Revert "btrfs: fix lockups from btrfs_clear_path_blocking" Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 20/21] Btrfs: subpagesize-blockssize: Limit inline extents to root->sectorsize Chandan Rajendra
2015-06-01 15:22 ` [RFC PATCH V11 21/21] Btrfs: subpagesize-blocksize: Fix block size returned to user space Chandan Rajendra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150626095052.GB16732@localhost.localdomain \
    --to=bo.li.liu@oracle.com \
    --cc=chandan@linux.vnet.ibm.com \
    --cc=chandan@mykolab.com \
    --cc=clm@fb.com \
    --cc=dsterba@suse.cz \
    --cc=jbacik@fb.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).