From: Josef Bacik <josef@redhat.com>
To: Chris Mason <chris.mason@oracle.com>
Cc: linux-btrfs <linux-btrfs@vger.kernel.org>
Subject: Re: [PATCH] rework btrfs fiemap (please review)
Date: Fri, 25 Feb 2011 09:52:37 -0500 [thread overview]
Message-ID: <20110225145236.GB2534@localhost.localdomain> (raw)
In-Reply-To: <1298431616-sup-9316@think>
On Tue, Feb 22, 2011 at 10:29:17PM -0500, Chris Mason wrote:
> Excerpts from Chris Mason's message of 2011-02-21 22:19:09 -0500:
> > Hi everyone,
> >
> > Looks like the latest versions of cp use fiemap to decide if a file is
> > sparse, which is a great way to avoid doing memcmp, but only if your
> > fiemap is really accurate about which ranges in the file have holes.
>
> Josef and I were talking this over today, and I wasn't quite handling
> the last extent in the file correctly. The fiemap interface expects us
> to mark the last extent, but it doesn't want to know about holes.
>
> This means we need to search forward when we find a hole and see if
> there are any extents past it.
>
> So, here's a slightly bigger patch that makes things less complex. It
> adds a helper to search forward through holes.
>
> -chris
>
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 92ac519..fd3f172 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
> */
> u64 count_range_bits(struct extent_io_tree *tree,
> u64 *start, u64 search_end, u64 max_bytes,
> - unsigned long bits)
> + unsigned long bits, int contig)
> {
> struct rb_node *node;
> struct extent_state *state;
> u64 cur_start = *start;
> u64 total_bytes = 0;
> + u64 last = 0;
> int found = 0;
>
> if (search_end <= cur_start) {
> @@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
> state = rb_entry(node, struct extent_state, rb_node);
> if (state->start > search_end)
> break;
> - if (state->end >= cur_start && (state->state & bits)) {
> + if (contig && found && state->start > last + 1)
> + break;
> + if (state->end >= cur_start && (state->state & bits) == bits) {
> total_bytes += min(search_end, state->end) + 1 -
> max(cur_start, state->start);
> if (total_bytes >= max_bytes)
> @@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
> *start = state->start;
> found = 1;
> }
> + last = state->end;
> + } else if (contig && found) {
> + break;
> }
> node = rb_next(node);
> if (!node)
> @@ -2912,6 +2918,46 @@ out:
> return sector;
> }
>
> +/*
> + * helper function for fiemap, which doesn't want to see any holes.
> + * This maps until we find something past 'last'
> + */
> +static struct extent_map *get_extent_skip_holes(struct inode *inode,
> + u64 offset,
> + u64 last,
> + get_extent_t *get_extent)
> +{
> + u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
> + struct extent_map *em;
> + u64 len;
> +
> + if (offset >= last)
> + return NULL;
> +
> + while(1) {
> + len = last - offset;
> + if (len == 0)
> + break;
> + len = (len + sectorsize - 1) & ~(sectorsize - 1);
> + em = get_extent(inode, NULL, 0, offset, len, 0);
> + if (!em || IS_ERR(em))
> + return em;
> +
> + /* if this isn't a hole return it */
> + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
> + em->block_start != EXTENT_MAP_HOLE) {
> + return em;
> + }
> +
> + /* this is a hole, advance to the next extent */
> + offset = extent_map_end(em);
> + free_extent_map(em);
> + if (offset >= last)
> + break;
> + }
> + return NULL;
> +}
> +
> int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> __u64 start, __u64 len, get_extent_t *get_extent)
> {
> @@ -2921,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> u32 flags = 0;
> u32 found_type;
> u64 last;
> + u64 last_for_get_extent = 0;
> u64 disko = 0;
> + u64 isize = i_size_read(inode);
> struct btrfs_key found_key;
> struct extent_map *em = NULL;
> struct extent_state *cached_state = NULL;
> struct btrfs_path *path;
> struct btrfs_file_extent_item *item;
> int end = 0;
> - u64 em_start = 0, em_len = 0;
> + u64 em_start = 0;
> + u64 em_len = 0;
> + u64 em_end = 0;
> unsigned long emflags;
> - int hole = 0;
>
> if (len == 0)
> return -EINVAL;
> @@ -2940,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> return -ENOMEM;
> path->leave_spinning = 1;
>
> + /*
> + * lookup the last file extent. We're not using i_size here
> + * because there might be preallocation past i_size
> + */
> ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
> path, inode->i_ino, -1, 0);
> if (ret < 0) {
> @@ -2953,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
> found_type = btrfs_key_type(&found_key);
>
> - /* No extents, just return */
> + /* No extents, but there might be delalloc bits */
> if (found_key.objectid != inode->i_ino ||
> found_type != BTRFS_EXTENT_DATA_KEY) {
> - btrfs_free_path(path);
> - return 0;
> + /* have to trust i_size as the end */
> + last = (u64)-1;
> + last_for_get_extent = isize;
> + } else {
> + /*
> + * remember the start of the last extent. There are a
> + * bunch of different factors that go into the length of the
> + * extent, so its much less complex to remember where it started
> + */
> + last = found_key.offset;
> + last_for_get_extent = last + 1;
> }
> - last = found_key.offset;
> btrfs_free_path(path);
>
> + /*
> + * we might have some extents allocated but more delalloc past those
> + * extents. so, we trust isize unless the start of the last extent is
> + * beyond isize
> + */
> + if (last < isize) {
> + last = (u64)-1;
> + last_for_get_extent = isize;
> + }
> +
> lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
> &cached_state, GFP_NOFS);
> - em = get_extent(inode, NULL, 0, off, max - off, 0);
> +
> + em = get_extent_skip_holes(inode, off, last_for_get_extent,
> + get_extent);
> if (!em)
> goto out;
> if (IS_ERR(em)) {
> @@ -2973,19 +3046,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> }
>
> while (!end) {
> - hole = 0;
> - off = em->start + em->len;
> + off = extent_map_end(em);
> if (off >= max)
> end = 1;
>
> - if (em->block_start == EXTENT_MAP_HOLE) {
> - hole = 1;
> - goto next;
> - }
> -
> em_start = em->start;
> em_len = em->len;
> -
> + em_end = extent_map_end(em);
> + emflags = em->flags;
> disko = 0;
> flags = 0;
>
> @@ -3004,37 +3072,29 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
> flags |= FIEMAP_EXTENT_ENCODED;
>
> -next:
> - emflags = em->flags;
> free_extent_map(em);
> em = NULL;
> - if (!end) {
> - em = get_extent(inode, NULL, 0, off, max - off, 0);
> - if (!em)
> - goto out;
> - if (IS_ERR(em)) {
> - ret = PTR_ERR(em);
> - goto out;
> - }
> - emflags = em->flags;
> - }
> -
> - if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
> + if ((em_start >= last) || em_len == (u64)-1 ||
> + (last == (u64)-1 && isize <= em_end)) {
> flags |= FIEMAP_EXTENT_LAST;
> end = 1;
> }
>
> - if (em_start == last) {
> + /* now scan forward to see if this is really the last extent. */
> + em = get_extent_skip_holes(inode, off, last_for_get_extent,
> + get_extent);
> + if (IS_ERR(em)) {
> + ret = PTR_ERR(em);
> + goto out;
> + }
> + if (!em) {
> flags |= FIEMAP_EXTENT_LAST;
> end = 1;
> }
> -
> - if (!hole) {
> - ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
> - em_len, flags);
> - if (ret)
> - goto out_free;
> - }
> + ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
> + em_len, flags);
> + if (ret)
> + goto out_free;
> }
> out_free:
> free_extent_map(em);
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 7083cfa..9318dfe 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -191,7 +191,7 @@ void extent_io_exit(void);
>
> u64 count_range_bits(struct extent_io_tree *tree,
> u64 *start, u64 search_end,
> - u64 max_bytes, unsigned long bits);
> + u64 max_bytes, unsigned long bits, int contig);
>
> void free_extent_state(struct extent_state *state);
> int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index fb9bd78..0efdb65 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1913,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
>
> private = 0;
> if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
> - (u64)-1, 1, EXTENT_DIRTY)) {
> + (u64)-1, 1, EXTENT_DIRTY, 0)) {
> ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
> start, &private_failure);
> if (ret == 0) {
> @@ -5280,6 +5280,128 @@ out:
> return em;
> }
>
> +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
> + size_t pg_offset, u64 start, u64 len,
> + int create)
> +{
> + struct extent_map *em;
> + struct extent_map *hole_em = NULL;
> + u64 range_start = start;
> + u64 end;
> + u64 found;
> + u64 found_end;
> + int err = 0;
> +
> + em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
> + if (IS_ERR(em))
> + return em;
> + if (em) {
> + /*
> + * if our em maps to a hole, there might
> + * actually be delalloc bytes behind it
> + */
> + if (em->block_start != EXTENT_MAP_HOLE)
> + return em;
> + else
> + hole_em = em;
> + }
> +
> + /* check to see if we've wrapped (len == -1 or similar) */
> + end = start + len;
> + if (end < start)
> + end = (u64)-1;
> + else
> + end -= 1;
> +
> + em = NULL;
> +
> + /* ok, we didn't find anything, lets look for delalloc */
> + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
> + end, len, EXTENT_DELALLOC, 1);
> + found_end = range_start + found;
> + if (found_end < range_start)
> + found_end = (u64)-1;
> +
> + /*
> + * we didn't find anything useful, return
> + * the original results from get_extent()
> + */
> + if (range_start > end || found_end <= start) {
> + em = hole_em;
> + hole_em = NULL;
> + goto out;
> + }
> +
> + /* adjust the range_start to make sure it doesn't
> + * go backwards from the start they passed in
> + */
> + range_start = max(start,range_start);
> + found = found_end - range_start;
> +
> + if (found > 0) {
> + u64 hole_start = start;
> + u64 hole_len = len;
> +
> + em = alloc_extent_map(GFP_NOFS);
> + if (!em) {
> + err = -ENOMEM;
> + goto out;
> + }
> + /*
> + * when btrfs_get_extent can't find anything it
> + * returns one huge hole
> + *
> + * make sure what it found really fits our range, and
> + * adjust to make sure it is based on the start from
> + * the caller
> + */
> + if (hole_em) {
> + u64 calc_end = extent_map_end(hole_em);
> +
> + if (calc_end <= start || (hole_em->start > end)) {
> + free_extent_map(hole_em);
> + hole_em = NULL;
> + } else {
> + hole_start = max(hole_em->start, start);
> + hole_len = calc_end - hole_start;
> + }
> + }
> + em->bdev = NULL;
> + if (hole_em && range_start > hole_start) {
> + /* our hole starts before our delalloc, so we
> + * have to return just the parts of the hole
> + * that go until the delalloc starts
> + */
> + em->len = min(hole_len,
> + range_start - hole_start);
> + em->start = hole_start;
> + em->orig_start = hole_start;
> + /*
> + * don't adjust block start at all,
> + * it is fixed at EXTENT_MAP_HOLE
> + */
> + em->block_start = hole_em->block_start;
> + em->block_len = hole_len;
> + } else {
> + em->start = range_start;
> + em->len = found;
> + em->orig_start = range_start;
> + em->block_start = EXTENT_MAP_DELALLOC;
> + em->block_len = found;
> + }
> + } else if (hole_em) {
> + return hole_em;
> + }
> +out:
> +
> + free_extent_map(hole_em);
> + if (err) {
> + free_extent_map(em);
> + return ERR_PTR(err);
> + }
> + return em;
> +}
> +
> static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
> u64 start, u64 len)
> {
> @@ -6102,7 +6224,7 @@ out:
> static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> __u64 start, __u64 len)
> {
> - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
> + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
> }
>
> int btrfs_readpage(struct file *file, struct page *page)
Looks good to me and passes xfstest 225 with the new no-sync option,
Reviewed-by: Josef Bacik <josef@redhat.com>
Thanks,
Josef
prev parent reply other threads:[~2011-02-25 14:52 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-02-22 3:19 [PATCH] rework btrfs fiemap (please review) Chris Mason
2011-02-23 3:29 ` Chris Mason
2011-02-25 14:52 ` Josef Bacik [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20110225145236.GB2534@localhost.localdomain \
--to=josef@redhat.com \
--cc=chris.mason@oracle.com \
--cc=linux-btrfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).