From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:57178) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Wifmu-0003TO-Bb for qemu-devel@nongnu.org; Fri, 09 May 2014 04:04:21 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1Wifml-00046W-8C for qemu-devel@nongnu.org; Fri, 09 May 2014 04:04:12 -0400 Received: from mail-ee0-x234.google.com ([2a00:1450:4013:c00::234]:58620) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Wifmk-00046K-TF for qemu-devel@nongnu.org; Fri, 09 May 2014 04:04:03 -0400 Received: by mail-ee0-f52.google.com with SMTP id e53so2357271eek.11 for ; Fri, 09 May 2014 01:04:02 -0700 (PDT) Sender: Paolo Bonzini Message-ID: <536C8BEC.9010406@redhat.com> Date: Fri, 09 May 2014 10:03:56 +0200 From: Paolo Bonzini MIME-Version: 1.0 References: <1399402854-24559-1-git-send-email-mreitz@redhat.com> In-Reply-To: <1399402854-24559-1-git-send-email-mreitz@redhat.com> Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Max Reitz , qemu-devel@nongnu.org Cc: Kevin Wolf , Stefan Hajnoczi Il 06/05/2014 21:00, Max Reitz ha scritto: > The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if > FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even > compiled in in this case. However, there may be implementations which > support the latter but not the former (e.g., NFSv4.2) as well as vice > versa. > > To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will > probably be covered by POSIX soon) and if that does not work, fall back > to FIEMAP; and if that does not work either, treat everything as > allocated. I think it's better to test FIEMAP first; if it's implemented, FIEMAP it's guaranteed to be accurate, and it can also be ruled out on the first attempt if it's not implemented. With SEEK_HOLE/SEEK_DATA, the OS could provide a dummy implementation that returns no holes: this would mean either always falling back to FIEMAP on fully-allocated files, or producing inaccurate results for sparse files on filesystems that do not support SEEK_HOLE/SEEK_DATA. Paolo > Signed-off-by: Max Reitz > --- > v2: > - reworked using static functions [Stefan] > - changed order of FIEMAP and SEEK_HOLE [Eric] > --- > block/raw-posix.c | 135 ++++++++++++++++++++++++++++++++++-------------------- > 1 file changed, 85 insertions(+), 50 deletions(-) > > diff --git a/block/raw-posix.c b/block/raw-posix.c > index 3ce026d..1b3900d 100644 > --- a/block/raw-posix.c > +++ b/block/raw-posix.c > @@ -146,6 +146,12 @@ typedef struct BDRVRawState { > bool has_discard:1; > bool has_write_zeroes:1; > bool discard_zeroes:1; > +#if defined SEEK_HOLE && defined SEEK_DATA > + bool skip_seek_hole; > +#endif > +#ifdef CONFIG_FIEMAP > + bool skip_fiemap; > +#endif > } BDRVRawState; > > typedef struct BDRVRawReopenState { > @@ -1272,53 +1278,29 @@ static int raw_create(const char *filename, QEMUOptionParameter *options, > return result; > } > > -/* > - * Returns true iff the specified sector is present in the disk image. Drivers > - * not implementing the functionality are assumed to not support backing files, > - * hence all their sectors are reported as allocated. > - * > - * If 'sector_num' is beyond the end of the disk image the return value is 0 > - * and 'pnum' is set to 0. > - * > - * 'pnum' is set to the number of sectors (including and immediately following > - * the specified sector) that are known to be in the same > - * allocated/unallocated state. > - * > - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes > - * beyond the end of the disk image it will be clamped. > - */ > -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, > - int64_t sector_num, > - int nb_sectors, int *pnum) > +static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data, > + off_t *hole, int nb_sectors, int *pnum) > { > - off_t start, data, hole; > - int64_t ret; > - > - ret = fd_open(bs); > - if (ret < 0) { > - return ret; > - } > - > - start = sector_num * BDRV_SECTOR_SIZE; > - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; > - > #ifdef CONFIG_FIEMAP > - > BDRVRawState *s = bs->opaque; > + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; > struct { > struct fiemap fm; > struct fiemap_extent fe; > } f; > > + if (s->skip_fiemap) { > + return -ENOTSUP; > + } > + > f.fm.fm_start = start; > f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE; > f.fm.fm_flags = 0; > f.fm.fm_extent_count = 1; > f.fm.fm_reserved = 0; > if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { > - /* Assume everything is allocated. */ > - *pnum = nb_sectors; > - return ret; > + s->skip_fiemap = true; > + return -errno; > } > > if (f.fm.fm_mapped_extents == 0) { > @@ -1326,44 +1308,97 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, > * f.fm.fm_start + f.fm.fm_length must be clamped to the file size! > */ > off_t length = lseek(s->fd, 0, SEEK_END); > - hole = f.fm.fm_start; > - data = MIN(f.fm.fm_start + f.fm.fm_length, length); > + *hole = f.fm.fm_start; > + *data = MIN(f.fm.fm_start + f.fm.fm_length, length); > } else { > - data = f.fe.fe_logical; > - hole = f.fe.fe_logical + f.fe.fe_length; > + *data = f.fe.fe_logical; > + *hole = f.fe.fe_logical + f.fe.fe_length; > if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) { > ret |= BDRV_BLOCK_ZERO; > } > } > > -#elif defined SEEK_HOLE && defined SEEK_DATA > + return ret; > +#else > + return -ENOTSUP; > +#endif > +} > > +static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, > + off_t *hole, int *pnum) > +{ > +#if defined SEEK_HOLE && defined SEEK_DATA > BDRVRawState *s = bs->opaque; > > - hole = lseek(s->fd, start, SEEK_HOLE); > - if (hole == -1) { > + if (s->skip_seek_hole) { > + return -ENOTSUP; > + } > + > + *hole = lseek(s->fd, start, SEEK_HOLE); > + if (*hole == -1) { > /* -ENXIO indicates that sector_num was past the end of the file. > * There is a virtual hole there. */ > assert(errno != -ENXIO); > > - /* Most likely EINVAL. Assume everything is allocated. */ > - *pnum = nb_sectors; > - return ret; > + s->skip_seek_hole = true; > + return -errno; > } > > - if (hole > start) { > - data = start; > + if (*hole > start) { > + *data = start; > } else { > /* On a hole. We need another syscall to find its end. */ > - data = lseek(s->fd, start, SEEK_DATA); > - if (data == -1) { > - data = lseek(s->fd, 0, SEEK_END); > + *data = lseek(s->fd, start, SEEK_DATA); > + if (*data == -1) { > + *data = lseek(s->fd, 0, SEEK_END); > } > } > + > + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; > #else > - data = 0; > - hole = start + nb_sectors * BDRV_SECTOR_SIZE; > + return -ENOTSUP; > #endif > +} > + > +/* > + * Returns true iff the specified sector is present in the disk image. Drivers > + * not implementing the functionality are assumed to not support backing files, > + * hence all their sectors are reported as allocated. > + * > + * If 'sector_num' is beyond the end of the disk image the return value is 0 > + * and 'pnum' is set to 0. > + * > + * 'pnum' is set to the number of sectors (including and immediately following > + * the specified sector) that are known to be in the same > + * allocated/unallocated state. > + * > + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes > + * beyond the end of the disk image it will be clamped. > + */ > +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, > + int64_t sector_num, > + int nb_sectors, int *pnum) > +{ > + off_t start, data = 0, hole = 0; > + int64_t ret; > + > + ret = fd_open(bs); > + if (ret < 0) { > + return ret; > + } > + > + start = sector_num * BDRV_SECTOR_SIZE; > + > + ret = try_seek_hole(bs, start, &data, &hole, pnum); > + if (ret < 0) { > + ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum); > + if (ret < 0) { > + /* Assume everything is allocated. */ > + data = 0; > + hole = start + nb_sectors * BDRV_SECTOR_SIZE; > + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; > + } > + } > > if (data <= start) { > /* On a data extent, compute sectors to the end of the extent. */ >