qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v3] block/raw-posix: Try both FIEMAP and SEEK_HOLE
@ 2014-05-06 22:18 Max Reitz
  2014-05-07  6:37 ` Paolo Bonzini
  0 siblings, 1 reply; 2+ messages in thread
From: Max Reitz @ 2014-05-06 22:18 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Max Reitz

The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if
FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even
compiled in in this case. However, there may be implementations which
support the latter but not the former (e.g., NFSv4.2) as well as vice
versa.

To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will
probably be covered by POSIX soon) and if that does not work, fall back
to FIEMAP; and if that does not work either, treat everything as
allocated.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
v3:
 - use a tristate for keeping the information whether or not to use
   FIEMAP and/or SEEK_HOLE/SEEK_DATA [Eric]
 - fall through to another implementation (i.e. FIEMAP) if the first
   (i.e. SEEK_HOLE/SEEK_DATA) reports everything as allocated; this will
   not result in that implementation being skipped the next time,
   however [Eric]
---
 block/raw-posix.c | 182 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 131 insertions(+), 51 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 3ce026d..fd6bac6 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -123,6 +123,18 @@
 
 #define MAX_BLOCKSIZE	4096
 
+/* In case there are multiple implementations for the same feature provided by
+ * the environment, this enumeration may be used to represent the status of
+ * these alternatives. */
+typedef enum ImplementationAlternativeStatus {
+    /* The status is (yet) unknown. */
+    IMPLSTAT_UNKNOWN = 0,
+    /* This implementation is known to return correct results. */
+    IMPLSTAT_WORKING,
+    /* This implementation is known not to return correct results. */
+    IMPLSTAT_SKIP,
+} ImplementationAlternativeStatus;
+
 typedef struct BDRVRawState {
     int fd;
     int type;
@@ -146,6 +158,12 @@ typedef struct BDRVRawState {
     bool has_discard:1;
     bool has_write_zeroes:1;
     bool discard_zeroes:1;
+#if defined SEEK_HOLE && defined SEEK_DATA
+    ImplementationAlternativeStatus seek_hole_status;
+#endif
+#ifdef CONFIG_FIEMAP
+    ImplementationAlternativeStatus fiemap_status;
+#endif
 } BDRVRawState;
 
 typedef struct BDRVRawReopenState {
@@ -1272,98 +1290,160 @@ static int raw_create(const char *filename, QEMUOptionParameter *options,
     return result;
 }
 
-/*
- * Returns true iff the specified sector is present in the disk image. Drivers
- * not implementing the functionality are assumed to not support backing files,
- * hence all their sectors are reported as allocated.
- *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
- * and 'pnum' is set to 0.
- *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
- * allocated/unallocated state.
- *
- * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
- * beyond the end of the disk image it will be clamped.
- */
-static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
-                                            int64_t sector_num,
-                                            int nb_sectors, int *pnum)
+static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t end,
+                          off_t *data, off_t *hole, int nb_sectors, int *pnum)
 {
-    off_t start, data, hole;
-    int64_t ret;
-
-    ret = fd_open(bs);
-    if (ret < 0) {
-        return ret;
-    }
-
-    start = sector_num * BDRV_SECTOR_SIZE;
-    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
-
 #ifdef CONFIG_FIEMAP
-
     BDRVRawState *s = bs->opaque;
+    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
     struct {
         struct fiemap fm;
         struct fiemap_extent fe;
     } f;
 
+    if (s->fiemap_status == IMPLSTAT_SKIP) {
+        return -ENOTSUP;
+    }
+
     f.fm.fm_start = start;
     f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
     f.fm.fm_flags = 0;
     f.fm.fm_extent_count = 1;
     f.fm.fm_reserved = 0;
     if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
-        /* Assume everything is allocated.  */
-        *pnum = nb_sectors;
-        return ret;
+        s->fiemap_status = IMPLSTAT_SKIP;
+        return -errno;
+    }
+
+    if (s->fiemap_status == IMPLSTAT_UNKNOWN) {
+        if (f.fm.fm_extent_count == 1 &&
+            f.fe.fe_logical == 0 && f.fe.fe_length >= end)
+        {
+            /* FIEMAP returned a single extent spanning the entire file; maybe
+             * this was just a default response and therefore we cannot be sure
+             * whether it actually works; fall back to an alternative
+             * implementation. */
+            return -ENOTSUP;
+        } else {
+            s->fiemap_status = IMPLSTAT_WORKING;
+        }
     }
 
     if (f.fm.fm_mapped_extents == 0) {
         /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
          * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
          */
-        off_t length = lseek(s->fd, 0, SEEK_END);
-        hole = f.fm.fm_start;
-        data = MIN(f.fm.fm_start + f.fm.fm_length, length);
+        *hole = f.fm.fm_start;
+        *data = MIN(f.fm.fm_start + f.fm.fm_length, end);
     } else {
-        data = f.fe.fe_logical;
-        hole = f.fe.fe_logical + f.fe.fe_length;
+        *data = f.fe.fe_logical;
+        *hole = f.fe.fe_logical + f.fe.fe_length;
         if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
             ret |= BDRV_BLOCK_ZERO;
         }
     }
 
-#elif defined SEEK_HOLE && defined SEEK_DATA
+    return ret;
+#else
+    return -ENOTSUP;
+#endif
+}
 
+static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t end,
+                             off_t *data, off_t *hole, int *pnum)
+{
+#if defined SEEK_HOLE && defined SEEK_DATA
     BDRVRawState *s = bs->opaque;
 
-    hole = lseek(s->fd, start, SEEK_HOLE);
-    if (hole == -1) {
+    if (s->seek_hole_status == IMPLSTAT_SKIP) {
+        return -ENOTSUP;
+    }
+
+    *hole = lseek(s->fd, start, SEEK_HOLE);
+    if (*hole == -1) {
         /* -ENXIO indicates that sector_num was past the end of the file.
          * There is a virtual hole there.  */
         assert(errno != -ENXIO);
 
-        /* Most likely EINVAL.  Assume everything is allocated.  */
-        *pnum = nb_sectors;
-        return ret;
+        s->seek_hole_status = IMPLSTAT_SKIP;
+        return -errno;
     }
 
-    if (hole > start) {
-        data = start;
+    if (s->seek_hole_status == IMPLSTAT_UNKNOWN) {
+        if (*hole >= end) {
+            /* lseek() returned the EOF, therefore it is unknown whether
+             * SEEK_HOLE actually works; fall back to an alternative
+             * implementation. */
+            return -ENOTSUP;
+        } else {
+            s->seek_hole_status = IMPLSTAT_WORKING;
+        }
+    }
+
+    if (*hole > start) {
+        *data = start;
     } else {
         /* On a hole.  We need another syscall to find its end.  */
-        data = lseek(s->fd, start, SEEK_DATA);
-        if (data == -1) {
-            data = lseek(s->fd, 0, SEEK_END);
+        *data = lseek(s->fd, start, SEEK_DATA);
+        if (*data == -1) {
+            *data = lseek(s->fd, 0, SEEK_END);
         }
     }
+
+    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
 #else
-    data = 0;
-    hole = start + nb_sectors * BDRV_SECTOR_SIZE;
+    return -ENOTSUP;
 #endif
+}
+
+/*
+ * Returns true iff the specified sector is present in the disk image. Drivers
+ * not implementing the functionality are assumed to not support backing files,
+ * hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
+                                                    int64_t sector_num,
+                                                    int nb_sectors, int *pnum)
+{
+    off_t start, end, data = 0, hole = 0;
+    int64_t ret;
+
+    ret = fd_open(bs);
+    if (ret < 0) {
+        return ret;
+    }
+
+    start = sector_num * BDRV_SECTOR_SIZE;
+    end   = bdrv_getlength(bs);
+    if (end < 0) {
+        /* This function is used solely by "file" which does not have
+         * variable-length images; therefore, bdrv_getlength() should use
+         * bs->total_sectors for calculating the length and never return an
+         * error; if it does, it will be -ENOMEDIUM which should be returned
+         * to the caller. */
+        return end;
+    }
+
+    ret = try_seek_hole(bs, start, end, &data, &hole, pnum);
+    if (ret < 0) {
+        ret = try_fiemap(bs, start, end, &data, &hole, nb_sectors, pnum);
+        if (ret < 0) {
+            /* Assume everything is allocated. */
+            data = 0;
+            hole = start + nb_sectors * BDRV_SECTOR_SIZE;
+            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
+        }
+    }
 
     if (data <= start) {
         /* On a data extent, compute sectors to the end of the extent.  */
-- 
1.9.2

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [Qemu-devel] [PATCH v3] block/raw-posix: Try both FIEMAP and SEEK_HOLE
  2014-05-06 22:18 [Qemu-devel] [PATCH v3] block/raw-posix: Try both FIEMAP and SEEK_HOLE Max Reitz
@ 2014-05-07  6:37 ` Paolo Bonzini
  0 siblings, 0 replies; 2+ messages in thread
From: Paolo Bonzini @ 2014-05-07  6:37 UTC (permalink / raw)
  To: Max Reitz, qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi

Il 07/05/2014 00:18, Max Reitz ha scritto:
> The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if
> FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even
> compiled in in this case. However, there may be implementations which
> support the latter but not the former (e.g., NFSv4.2) as well as vice
> versa.
>
> To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will
> probably be covered by POSIX soon) and if that does not work, fall back
> to FIEMAP; and if that does not work either, treat everything as
> allocated.
>
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
> v3:
>  - use a tristate for keeping the information whether or not to use
>    FIEMAP and/or SEEK_HOLE/SEEK_DATA [Eric]
>  - fall through to another implementation (i.e. FIEMAP) if the first
>    (i.e. SEEK_HOLE/SEEK_DATA) reports everything as allocated; this will
>    not result in that implementation being skipped the next time,
>    however [Eric]

You need this if you use SEEK_HOLE/SEEK_DATA first, because it has a 
default implementation that returns a single non-hole for the entire file.

But if you start with FIEMAP, you don't because it will return ENOTSUP 
instead of a single allocated extent.

Paolo

> ---
>  block/raw-posix.c | 182 +++++++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 131 insertions(+), 51 deletions(-)
>
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 3ce026d..fd6bac6 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -123,6 +123,18 @@
>
>  #define MAX_BLOCKSIZE	4096
>
> +/* In case there are multiple implementations for the same feature provided by
> + * the environment, this enumeration may be used to represent the status of
> + * these alternatives. */
> +typedef enum ImplementationAlternativeStatus {
> +    /* The status is (yet) unknown. */
> +    IMPLSTAT_UNKNOWN = 0,
> +    /* This implementation is known to return correct results. */
> +    IMPLSTAT_WORKING,
> +    /* This implementation is known not to return correct results. */
> +    IMPLSTAT_SKIP,
> +} ImplementationAlternativeStatus;
> +
>  typedef struct BDRVRawState {
>      int fd;
>      int type;
> @@ -146,6 +158,12 @@ typedef struct BDRVRawState {
>      bool has_discard:1;
>      bool has_write_zeroes:1;
>      bool discard_zeroes:1;
> +#if defined SEEK_HOLE && defined SEEK_DATA
> +    ImplementationAlternativeStatus seek_hole_status;
> +#endif
> +#ifdef CONFIG_FIEMAP
> +    ImplementationAlternativeStatus fiemap_status;
> +#endif
>  } BDRVRawState;
>
>  typedef struct BDRVRawReopenState {
> @@ -1272,98 +1290,160 @@ static int raw_create(const char *filename, QEMUOptionParameter *options,
>      return result;
>  }
>
> -/*
> - * Returns true iff the specified sector is present in the disk image. Drivers
> - * not implementing the functionality are assumed to not support backing files,
> - * hence all their sectors are reported as allocated.
> - *
> - * If 'sector_num' is beyond the end of the disk image the return value is 0
> - * and 'pnum' is set to 0.
> - *
> - * 'pnum' is set to the number of sectors (including and immediately following
> - * the specified sector) that are known to be in the same
> - * allocated/unallocated state.
> - *
> - * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
> - * beyond the end of the disk image it will be clamped.
> - */
> -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
> -                                            int64_t sector_num,
> -                                            int nb_sectors, int *pnum)
> +static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t end,
> +                          off_t *data, off_t *hole, int nb_sectors, int *pnum)
>  {
> -    off_t start, data, hole;
> -    int64_t ret;
> -
> -    ret = fd_open(bs);
> -    if (ret < 0) {
> -        return ret;
> -    }
> -
> -    start = sector_num * BDRV_SECTOR_SIZE;
> -    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
> -
>  #ifdef CONFIG_FIEMAP
> -
>      BDRVRawState *s = bs->opaque;
> +    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
>      struct {
>          struct fiemap fm;
>          struct fiemap_extent fe;
>      } f;
>
> +    if (s->fiemap_status == IMPLSTAT_SKIP) {
> +        return -ENOTSUP;
> +    }
> +
>      f.fm.fm_start = start;
>      f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
>      f.fm.fm_flags = 0;
>      f.fm.fm_extent_count = 1;
>      f.fm.fm_reserved = 0;
>      if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
> -        /* Assume everything is allocated.  */
> -        *pnum = nb_sectors;
> -        return ret;
> +        s->fiemap_status = IMPLSTAT_SKIP;
> +        return -errno;
> +    }
> +
> +    if (s->fiemap_status == IMPLSTAT_UNKNOWN) {
> +        if (f.fm.fm_extent_count == 1 &&
> +            f.fe.fe_logical == 0 && f.fe.fe_length >= end)
> +        {
> +            /* FIEMAP returned a single extent spanning the entire file; maybe
> +             * this was just a default response and therefore we cannot be sure
> +             * whether it actually works; fall back to an alternative
> +             * implementation. */
> +            return -ENOTSUP;
> +        } else {
> +            s->fiemap_status = IMPLSTAT_WORKING;
> +        }
>      }
>
>      if (f.fm.fm_mapped_extents == 0) {
>          /* No extents found, data is beyond f.fm.fm_start + f.fm.fm_length.
>           * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
>           */
> -        off_t length = lseek(s->fd, 0, SEEK_END);
> -        hole = f.fm.fm_start;
> -        data = MIN(f.fm.fm_start + f.fm.fm_length, length);
> +        *hole = f.fm.fm_start;
> +        *data = MIN(f.fm.fm_start + f.fm.fm_length, end);
>      } else {
> -        data = f.fe.fe_logical;
> -        hole = f.fe.fe_logical + f.fe.fe_length;
> +        *data = f.fe.fe_logical;
> +        *hole = f.fe.fe_logical + f.fe.fe_length;
>          if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
>              ret |= BDRV_BLOCK_ZERO;
>          }
>      }
>
> -#elif defined SEEK_HOLE && defined SEEK_DATA
> +    return ret;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
>
> +static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t end,
> +                             off_t *data, off_t *hole, int *pnum)
> +{
> +#if defined SEEK_HOLE && defined SEEK_DATA
>      BDRVRawState *s = bs->opaque;
>
> -    hole = lseek(s->fd, start, SEEK_HOLE);
> -    if (hole == -1) {
> +    if (s->seek_hole_status == IMPLSTAT_SKIP) {
> +        return -ENOTSUP;
> +    }
> +
> +    *hole = lseek(s->fd, start, SEEK_HOLE);
> +    if (*hole == -1) {
>          /* -ENXIO indicates that sector_num was past the end of the file.
>           * There is a virtual hole there.  */
>          assert(errno != -ENXIO);
>
> -        /* Most likely EINVAL.  Assume everything is allocated.  */
> -        *pnum = nb_sectors;
> -        return ret;
> +        s->seek_hole_status = IMPLSTAT_SKIP;
> +        return -errno;
>      }
>
> -    if (hole > start) {
> -        data = start;
> +    if (s->seek_hole_status == IMPLSTAT_UNKNOWN) {
> +        if (*hole >= end) {
> +            /* lseek() returned the EOF, therefore it is unknown whether
> +             * SEEK_HOLE actually works; fall back to an alternative
> +             * implementation. */
> +            return -ENOTSUP;
> +        } else {
> +            s->seek_hole_status = IMPLSTAT_WORKING;
> +        }
> +    }
> +
> +    if (*hole > start) {
> +        *data = start;
>      } else {
>          /* On a hole.  We need another syscall to find its end.  */
> -        data = lseek(s->fd, start, SEEK_DATA);
> -        if (data == -1) {
> -            data = lseek(s->fd, 0, SEEK_END);
> +        *data = lseek(s->fd, start, SEEK_DATA);
> +        if (*data == -1) {
> +            *data = lseek(s->fd, 0, SEEK_END);
>          }
>      }
> +
> +    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
>  #else
> -    data = 0;
> -    hole = start + nb_sectors * BDRV_SECTOR_SIZE;
> +    return -ENOTSUP;
>  #endif
> +}
> +
> +/*
> + * Returns true iff the specified sector is present in the disk image. Drivers
> + * not implementing the functionality are assumed to not support backing files,
> + * hence all their sectors are reported as allocated.
> + *
> + * If 'sector_num' is beyond the end of the disk image the return value is 0
> + * and 'pnum' is set to 0.
> + *
> + * 'pnum' is set to the number of sectors (including and immediately following
> + * the specified sector) that are known to be in the same
> + * allocated/unallocated state.
> + *
> + * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
> + * beyond the end of the disk image it will be clamped.
> + */
> +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
> +                                                    int64_t sector_num,
> +                                                    int nb_sectors, int *pnum)
> +{
> +    off_t start, end, data = 0, hole = 0;
> +    int64_t ret;
> +
> +    ret = fd_open(bs);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    start = sector_num * BDRV_SECTOR_SIZE;
> +    end   = bdrv_getlength(bs);
> +    if (end < 0) {
> +        /* This function is used solely by "file" which does not have
> +         * variable-length images; therefore, bdrv_getlength() should use
> +         * bs->total_sectors for calculating the length and never return an
> +         * error; if it does, it will be -ENOMEDIUM which should be returned
> +         * to the caller. */
> +        return end;
> +    }
> +
> +    ret = try_seek_hole(bs, start, end, &data, &hole, pnum);
> +    if (ret < 0) {
> +        ret = try_fiemap(bs, start, end, &data, &hole, nb_sectors, pnum);
> +        if (ret < 0) {
> +            /* Assume everything is allocated. */
> +            data = 0;
> +            hole = start + nb_sectors * BDRV_SECTOR_SIZE;
> +            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
> +        }
> +    }
>
>      if (data <= start) {
>          /* On a data extent, compute sectors to the end of the extent.  */
>

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2014-05-07  6:38 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-05-06 22:18 [Qemu-devel] [PATCH v3] block/raw-posix: Try both FIEMAP and SEEK_HOLE Max Reitz
2014-05-07  6:37 ` Paolo Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).