linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Josef Bacik <josef@redhat.com>
To: Chris Mason <chris.mason@oracle.com>
Cc: linux-btrfs <linux-btrfs@vger.kernel.org>
Subject: Re: [PATCH] rework btrfs fiemap (please review)
Date: Fri, 25 Feb 2011 09:52:37 -0500	[thread overview]
Message-ID: <20110225145236.GB2534@localhost.localdomain> (raw)
In-Reply-To: <1298431616-sup-9316@think>

On Tue, Feb 22, 2011 at 10:29:17PM -0500, Chris Mason wrote:
> Excerpts from Chris Mason's message of 2011-02-21 22:19:09 -0500:
> > Hi everyone,
> > 
> > Looks like the latest versions of cp use fiemap to decide if a file is
> > sparse, which is a great way to avoid doing memcmp, but only if your
> > fiemap is really accurate about which ranges in the file have holes.
> 
> Josef and I were talking this over today, and I wasn't quite handling
> the last extent in the file correctly.  The fiemap interface expects us
> to mark the last extent, but it doesn't want to know about holes.
> 
> This means we need to search forward when we find a hole and see if
> there are any extents past it.
> 
> So, here's a slightly bigger patch that makes things less complex.  It
> adds a helper to search forward through holes.
> 
> -chris
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 92ac519..fd3f172 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
>   */
>  u64 count_range_bits(struct extent_io_tree *tree,
>  		     u64 *start, u64 search_end, u64 max_bytes,
> -		     unsigned long bits)
> +		     unsigned long bits, int contig)
>  {
>  	struct rb_node *node;
>  	struct extent_state *state;
>  	u64 cur_start = *start;
>  	u64 total_bytes = 0;
> +	u64 last = 0;
>  	int found = 0;
>  
>  	if (search_end <= cur_start) {
> @@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
>  		state = rb_entry(node, struct extent_state, rb_node);
>  		if (state->start > search_end)
>  			break;
> -		if (state->end >= cur_start && (state->state & bits)) {
> +		if (contig && found && state->start > last + 1)
> +			break;
> +		if (state->end >= cur_start && (state->state & bits) == bits) {
>  			total_bytes += min(search_end, state->end) + 1 -
>  				       max(cur_start, state->start);
>  			if (total_bytes >= max_bytes)
> @@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
>  				*start = state->start;
>  				found = 1;
>  			}
> +			last = state->end;
> +		} else if (contig && found) {
> +			break;
>  		}
>  		node = rb_next(node);
>  		if (!node)
> @@ -2912,6 +2918,46 @@ out:
>  	return sector;
>  }
>  
> +/*
> + * helper function for fiemap, which doesn't want to see any holes.
> + * This maps until we find something past 'last'
> + */
> +static struct extent_map *get_extent_skip_holes(struct inode *inode,
> +						u64 offset,
> +						u64 last,
> +						get_extent_t *get_extent)
> +{
> +	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
> +	struct extent_map *em;
> +	u64 len;
> +
> +	if (offset >= last)
> +		return NULL;
> +
> +	while(1) {
> +		len = last - offset;
> +		if (len == 0)
> +			break;
> +		len = (len + sectorsize - 1) & ~(sectorsize - 1);
> +		em = get_extent(inode, NULL, 0, offset, len, 0);
> +		if (!em || IS_ERR(em))
> +			return em;
> +
> +		/* if this isn't a hole return it */
> +		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
> +		    em->block_start != EXTENT_MAP_HOLE) {
> +			return em;
> +		}
> +
> +		/* this is a hole, advance to the next extent */
> +		offset = extent_map_end(em);
> +		free_extent_map(em);
> +		if (offset >= last)
> +			break;
> +	}
> +	return NULL;
> +}
> +
>  int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		__u64 start, __u64 len, get_extent_t *get_extent)
>  {
> @@ -2921,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  	u32 flags = 0;
>  	u32 found_type;
>  	u64 last;
> +	u64 last_for_get_extent = 0;
>  	u64 disko = 0;
> +	u64 isize = i_size_read(inode);
>  	struct btrfs_key found_key;
>  	struct extent_map *em = NULL;
>  	struct extent_state *cached_state = NULL;
>  	struct btrfs_path *path;
>  	struct btrfs_file_extent_item *item;
>  	int end = 0;
> -	u64 em_start = 0, em_len = 0;
> +	u64 em_start = 0;
> +	u64 em_len = 0;
> +	u64 em_end = 0;
>  	unsigned long emflags;
> -	int hole = 0;
>  
>  	if (len == 0)
>  		return -EINVAL;
> @@ -2940,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		return -ENOMEM;
>  	path->leave_spinning = 1;
>  
> +	/*
> +	 * lookup the last file extent.  We're not using i_size here
> +	 * because there might be preallocation past i_size
> +	 */
>  	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
>  				       path, inode->i_ino, -1, 0);
>  	if (ret < 0) {
> @@ -2953,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
>  	found_type = btrfs_key_type(&found_key);
>  
> -	/* No extents, just return */
> +	/* No extents, but there might be delalloc bits */
>  	if (found_key.objectid != inode->i_ino ||
>  	    found_type != BTRFS_EXTENT_DATA_KEY) {
> -		btrfs_free_path(path);
> -		return 0;
> +		/* have to trust i_size as the end */
> +		last = (u64)-1;
> +		last_for_get_extent = isize;
> +	} else {
> +		/*
> +		 * remember the start of the last extent.  There are a
> +		 * bunch of different factors that go into the length of the
> +		 * extent, so its much less complex to remember where it started
> +		 */
> +		last = found_key.offset;
> +		last_for_get_extent = last + 1;
>  	}
> -	last = found_key.offset;
>  	btrfs_free_path(path);
>  
> +	/*
> +	 * we might have some extents allocated but more delalloc past those
> +	 * extents.  so, we trust isize unless the start of the last extent is
> +	 * beyond isize
> +	 */
> +	if (last < isize) {
> +		last = (u64)-1;
> +		last_for_get_extent = isize;
> +	}
> +
>  	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
>  			 &cached_state, GFP_NOFS);
> -	em = get_extent(inode, NULL, 0, off, max - off, 0);
> +
> +	em = get_extent_skip_holes(inode, off, last_for_get_extent,
> +				   get_extent);
>  	if (!em)
>  		goto out;
>  	if (IS_ERR(em)) {
> @@ -2973,19 +3046,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  	}
>  
>  	while (!end) {
> -		hole = 0;
> -		off = em->start + em->len;
> +		off = extent_map_end(em);
>  		if (off >= max)
>  			end = 1;
>  
> -		if (em->block_start == EXTENT_MAP_HOLE) {
> -			hole = 1;
> -			goto next;
> -		}
> -
>  		em_start = em->start;
>  		em_len = em->len;
> -
> +		em_end = extent_map_end(em);
> +		emflags = em->flags;
>  		disko = 0;
>  		flags = 0;
>  
> @@ -3004,37 +3072,29 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
>  			flags |= FIEMAP_EXTENT_ENCODED;
>  
> -next:
> -		emflags = em->flags;
>  		free_extent_map(em);
>  		em = NULL;
> -		if (!end) {
> -			em = get_extent(inode, NULL, 0, off, max - off, 0);
> -			if (!em)
> -				goto out;
> -			if (IS_ERR(em)) {
> -				ret = PTR_ERR(em);
> -				goto out;
> -			}
> -			emflags = em->flags;
> -		}
> -
> -		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
> +		if ((em_start >= last) || em_len == (u64)-1 ||
> +		   (last == (u64)-1 && isize <= em_end)) {
>  			flags |= FIEMAP_EXTENT_LAST;
>  			end = 1;
>  		}
>  
> -		if (em_start == last) {
> +		/* now scan forward to see if this is really the last extent. */
> +		em = get_extent_skip_holes(inode, off, last_for_get_extent,
> +					   get_extent);
> +		if (IS_ERR(em)) {
> +			ret = PTR_ERR(em);
> +			goto out;
> +		}
> +		if (!em) {
>  			flags |= FIEMAP_EXTENT_LAST;
>  			end = 1;
>  		}
> -
> -		if (!hole) {
> -			ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
> -						em_len, flags);
> -			if (ret)
> -				goto out_free;
> -		}
> +		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
> +					      em_len, flags);
> +		if (ret)
> +			goto out_free;
>  	}
>  out_free:
>  	free_extent_map(em);
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 7083cfa..9318dfe 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -191,7 +191,7 @@ void extent_io_exit(void);
>  
>  u64 count_range_bits(struct extent_io_tree *tree,
>  		     u64 *start, u64 search_end,
> -		     u64 max_bytes, unsigned long bits);
> +		     u64 max_bytes, unsigned long bits, int contig);
>  
>  void free_extent_state(struct extent_state *state);
>  int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index fb9bd78..0efdb65 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1913,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
>  
>  	private = 0;
>  	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
> -			     (u64)-1, 1, EXTENT_DIRTY)) {
> +			     (u64)-1, 1, EXTENT_DIRTY, 0)) {
>  		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
>  					start, &private_failure);
>  		if (ret == 0) {
> @@ -5280,6 +5280,128 @@ out:
>  	return em;
>  }
>  
> +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
> +					   size_t pg_offset, u64 start, u64 len,
> +					   int create)
> +{
> +	struct extent_map *em;
> +	struct extent_map *hole_em = NULL;
> +	u64 range_start = start;
> +	u64 end;
> +	u64 found;
> +	u64 found_end;
> +	int err = 0;
> +
> +	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
> +	if (IS_ERR(em))
> +		return em;
> +	if (em) {
> +		/*
> +		 * if our em maps to a hole, there might
> +		 * actually be delalloc bytes behind it
> +		 */
> +		if (em->block_start != EXTENT_MAP_HOLE)
> +			return em;
> +		else
> +			hole_em = em;
> +	}
> +
> +	/* check to see if we've wrapped (len == -1 or similar) */
> +	end = start + len;
> +	if (end < start)
> +		end = (u64)-1;
> +	else
> +		end -= 1;
> +
> +	em = NULL;
> +
> +	/* ok, we didn't find anything, lets look for delalloc */
> +	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
> +				 end, len, EXTENT_DELALLOC, 1);
> +	found_end = range_start + found;
> +	if (found_end < range_start)
> +		found_end = (u64)-1;
> +
> +	/*
> +	 * we didn't find anything useful, return
> +	 * the original results from get_extent()
> +	 */
> +	if (range_start > end || found_end <= start) {
> +		em = hole_em;
> +		hole_em = NULL;
> +		goto out;
> +	}
> +
> +	/* adjust the range_start to make sure it doesn't
> +	 * go backwards from the start they passed in
> +	 */
> +	range_start = max(start,range_start);
> +	found = found_end - range_start;
> +
> +	if (found > 0) {
> +		u64 hole_start = start;
> +		u64 hole_len = len;
> +
> +		em = alloc_extent_map(GFP_NOFS);
> +		if (!em) {
> +			err = -ENOMEM;
> +			goto out;
> +		}
> +		/*
> +		 * when btrfs_get_extent can't find anything it
> +		 * returns one huge hole
> +		 *
> +		 * make sure what it found really fits our range, and
> +		 * adjust to make sure it is based on the start from
> +		 * the caller
> +		 */
> +		if (hole_em) {
> +			u64 calc_end = extent_map_end(hole_em);
> +
> +			if (calc_end <= start || (hole_em->start > end)) {
> +				free_extent_map(hole_em);
> +				hole_em = NULL;
> +			} else {
> +				hole_start = max(hole_em->start, start);
> +				hole_len = calc_end - hole_start;
> +			}
> +		}
> +		em->bdev = NULL;
> +		if (hole_em && range_start > hole_start) {
> +			/* our hole starts before our delalloc, so we
> +			 * have to return just the parts of the hole
> +			 * that go until  the delalloc starts
> +			 */
> +			em->len = min(hole_len,
> +				      range_start - hole_start);
> +			em->start = hole_start;
> +			em->orig_start = hole_start;
> +			/*
> +			 * don't adjust block start at all,
> +			 * it is fixed at EXTENT_MAP_HOLE
> +			 */
> +			em->block_start = hole_em->block_start;
> +			em->block_len = hole_len;
> +		} else {
> +			em->start = range_start;
> +			em->len = found;
> +			em->orig_start = range_start;
> +			em->block_start = EXTENT_MAP_DELALLOC;
> +			em->block_len = found;
> +		}
> +	} else if (hole_em) {
> +		return hole_em;
> +	}
> +out:
> +
> +	free_extent_map(hole_em);
> +	if (err) {
> +		free_extent_map(em);
> +		return ERR_PTR(err);
> +	}
> +	return em;
> +}
> +
>  static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>  						  u64 start, u64 len)
>  {
> @@ -6102,7 +6224,7 @@ out:
>  static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
>  		__u64 start, __u64 len)
>  {
> -	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
> +	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
>  }
>  
>  int btrfs_readpage(struct file *file, struct page *page)

Looks good to me, and it passes xfstests test 225 with the new no-sync option.

Reviewed-by: Josef Bacik <josef@redhat.com>

Thanks,

Josef

      reply	other threads:[~2011-02-25 14:52 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-02-22  3:19 [PATCH] rework btrfs fiemap (please review) Chris Mason
2011-02-23  3:29 ` Chris Mason
2011-02-25 14:52   ` Josef Bacik [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110225145236.GB2534@localhost.localdomain \
    --to=josef@redhat.com \
    --cc=chris.mason@oracle.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).