* [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
@ 2014-05-06 19:00 Max Reitz
2014-05-06 21:18 ` Eric Blake
` (2 more replies)
0 siblings, 3 replies; 10+ messages in thread
From: Max Reitz @ 2014-05-06 19:00 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Max Reitz
The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if
FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even
compiled in in this case. However, there may be implementations which
support the latter but not the former (e.g., NFSv4.2) as well as vice
versa.
To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will
probably be covered by POSIX soon) and if that does not work, fall back
to FIEMAP; and if that does not work either, treat everything as
allocated.
Signed-off-by: Max Reitz <mreitz@redhat.com>
---
v2:
- reworked using static functions [Stefan]
- changed order of FIEMAP and SEEK_HOLE [Eric]
---
block/raw-posix.c | 135 ++++++++++++++++++++++++++++++++++--------------------
1 file changed, 85 insertions(+), 50 deletions(-)
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 3ce026d..1b3900d 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -146,6 +146,12 @@ typedef struct BDRVRawState {
bool has_discard:1;
bool has_write_zeroes:1;
bool discard_zeroes:1;
+#if defined SEEK_HOLE && defined SEEK_DATA
+ bool skip_seek_hole;
+#endif
+#ifdef CONFIG_FIEMAP
+ bool skip_fiemap;
+#endif
} BDRVRawState;
typedef struct BDRVRawReopenState {
@@ -1272,53 +1278,29 @@ static int raw_create(const char *filename, QEMUOptionParameter *options,
return result;
}
-/*
- * Returns true iff the specified sector is present in the disk image. Drivers
- * not implementing the functionality are assumed to not support backing files,
- * hence all their sectors are reported as allocated.
- *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
- * and 'pnum' is set to 0.
- *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
- * allocated/unallocated state.
- *
- * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
- * beyond the end of the disk image it will be clamped.
- */
-static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
- int64_t sector_num,
- int nb_sectors, int *pnum)
+static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data,
+ off_t *hole, int nb_sectors, int *pnum)
{
- off_t start, data, hole;
- int64_t ret;
-
- ret = fd_open(bs);
- if (ret < 0) {
- return ret;
- }
-
- start = sector_num * BDRV_SECTOR_SIZE;
- ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
-
#ifdef CONFIG_FIEMAP
-
BDRVRawState *s = bs->opaque;
+ int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
struct {
struct fiemap fm;
struct fiemap_extent fe;
} f;
+ if (s->skip_fiemap) {
+ return -ENOTSUP;
+ }
+
f.fm.fm_start = start;
f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
f.fm.fm_flags = 0;
f.fm.fm_extent_count = 1;
f.fm.fm_reserved = 0;
if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
- /* Assume everything is allocated. */
- *pnum = nb_sectors;
- return ret;
+ s->skip_fiemap = true;
+ return -errno;
}
if (f.fm.fm_mapped_extents == 0) {
@@ -1326,44 +1308,97 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
* f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
*/
off_t length = lseek(s->fd, 0, SEEK_END);
- hole = f.fm.fm_start;
- data = MIN(f.fm.fm_start + f.fm.fm_length, length);
+ *hole = f.fm.fm_start;
+ *data = MIN(f.fm.fm_start + f.fm.fm_length, length);
} else {
- data = f.fe.fe_logical;
- hole = f.fe.fe_logical + f.fe.fe_length;
+ *data = f.fe.fe_logical;
+ *hole = f.fe.fe_logical + f.fe.fe_length;
if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
ret |= BDRV_BLOCK_ZERO;
}
}
-#elif defined SEEK_HOLE && defined SEEK_DATA
+ return ret;
+#else
+ return -ENOTSUP;
+#endif
+}
+static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
+ off_t *hole, int *pnum)
+{
+#if defined SEEK_HOLE && defined SEEK_DATA
BDRVRawState *s = bs->opaque;
- hole = lseek(s->fd, start, SEEK_HOLE);
- if (hole == -1) {
+ if (s->skip_seek_hole) {
+ return -ENOTSUP;
+ }
+
+ *hole = lseek(s->fd, start, SEEK_HOLE);
+ if (*hole == -1) {
/* -ENXIO indicates that sector_num was past the end of the file.
* There is a virtual hole there. */
assert(errno != -ENXIO);
- /* Most likely EINVAL. Assume everything is allocated. */
- *pnum = nb_sectors;
- return ret;
+ s->skip_seek_hole = true;
+ return -errno;
}
- if (hole > start) {
- data = start;
+ if (*hole > start) {
+ *data = start;
} else {
/* On a hole. We need another syscall to find its end. */
- data = lseek(s->fd, start, SEEK_DATA);
- if (data == -1) {
- data = lseek(s->fd, 0, SEEK_END);
+ *data = lseek(s->fd, start, SEEK_DATA);
+ if (*data == -1) {
+ *data = lseek(s->fd, 0, SEEK_END);
}
}
+
+ return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
#else
- data = 0;
- hole = start + nb_sectors * BDRV_SECTOR_SIZE;
+ return -ENOTSUP;
#endif
+}
+
+/*
+ * Returns true iff the specified sector is present in the disk image. Drivers
+ * not implementing the functionality are assumed to not support backing files,
+ * hence all their sectors are reported as allocated.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ */
+static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ off_t start, data = 0, hole = 0;
+ int64_t ret;
+
+ ret = fd_open(bs);
+ if (ret < 0) {
+ return ret;
+ }
+
+ start = sector_num * BDRV_SECTOR_SIZE;
+
+ ret = try_seek_hole(bs, start, &data, &hole, pnum);
+ if (ret < 0) {
+ ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum);
+ if (ret < 0) {
+ /* Assume everything is allocated. */
+ data = 0;
+ hole = start + nb_sectors * BDRV_SECTOR_SIZE;
+ ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
+ }
+ }
if (data <= start) {
/* On a data extent, compute sectors to the end of the extent. */
--
1.9.2
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-06 19:00 [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE Max Reitz
@ 2014-05-06 21:18 ` Eric Blake
2014-05-06 21:35 ` Max Reitz
2014-05-06 21:47 ` Eric Blake
2014-05-09 8:03 ` Paolo Bonzini
2014-05-11 17:26 ` Christoph Hellwig
2 siblings, 2 replies; 10+ messages in thread
From: Eric Blake @ 2014-05-06 21:18 UTC (permalink / raw)
To: Max Reitz, qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi
[-- Attachment #1: Type: text/plain, Size: 5952 bytes --]
On 05/06/2014 01:00 PM, Max Reitz wrote:
> The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if
> FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even
> compiled in in this case. However, there may be implementations which
> support the latter but not the former (e.g., NFSv4.2) as well as vice
> versa.
>
> To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will
> probably be covered by POSIX soon) and if that does not work, fall back
> to FIEMAP; and if that does not work either, treat everything as
> allocated.
>
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
> v2:
> - reworked using static functions [Stefan]
> - changed order of FIEMAP and SEEK_HOLE [Eric]
> ---
> block/raw-posix.c | 135 ++++++++++++++++++++++++++++++++++--------------------
> 1 file changed, 85 insertions(+), 50 deletions(-)
>
> +++ b/block/raw-posix.c
> @@ -146,6 +146,12 @@ typedef struct BDRVRawState {
> bool has_discard:1;
> bool has_write_zeroes:1;
> bool discard_zeroes:1;
> +#if defined SEEK_HOLE && defined SEEK_DATA
> + bool skip_seek_hole;
Remember, SEEK_HOLE support requires both support of the kernel and the
filesystem - we may still have cases where there are filesystems that
lack SEEK_HOLE but still have FIEMAP, even when compiled against kernel
headers where SEEK_HOLE is defined - on those filesystems, the kernel
guarantees SEEK_HOLE will report the entire file as allocated, even
though FIEMAP might be able to find holes. On the other hand, the whole
point of this is to optimize cases; where the fallback to treating the
whole file as allocated may be slower but still gives correct behavior
to the guest.
I like how you have a per-struct state to track whether you have
encountered a previous failure, to skip the known-failing probe the next
time around. But I wonder if you need a tweak...
> +static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
> + off_t *hole, int *pnum)
> +{
> +#if defined SEEK_HOLE && defined SEEK_DATA
> BDRVRawState *s = bs->opaque;
>
> - hole = lseek(s->fd, start, SEEK_HOLE);
> - if (hole == -1) {
> + if (s->skip_seek_hole) {
> + return -ENOTSUP;
> + }
> +
> + *hole = lseek(s->fd, start, SEEK_HOLE);
> + if (*hole == -1) {
> /* -ENXIO indicates that sector_num was past the end of the file.
> * There is a virtual hole there. */
> assert(errno != -ENXIO);
>
> - /* Most likely EINVAL. Assume everything is allocated. */
> - *pnum = nb_sectors;
> - return ret;
> + s->skip_seek_hole = true;
> + return -errno;
> }
...if you are on a file system where SEEK_HOLE triggers the kernel
fallback of "entire file is allocated", but where FIEMAP is wired up for
that file system, would it make sense to have try_seek_hole return -1 in
situations where lseek(s->fd, 0, SEEK_HOLE) returns the end of the file?
Even more, should skip_seek_hole be a tri-state?
state: "unknown" - if lseek(SEEK_HOLE) returns -1, state changes to
"skip"; if it returns something other than EOF, state changes to "use";
if it returns EOF, state remains "unknown" (as punching a hole or
resizing the image may work to create a hole in what is currently a
fully-allocated image)
state: "skip" - we've had a previous lseek failure, no need to try
seeking for holes on this file
state: "use" - we've had success probing a hole, so we want to always
trust SEEK_HOLE for this file
>
> - if (hole > start) {
> - data = start;
> + if (*hole > start) {
> + *data = start;
> } else {
> /* On a hole. We need another syscall to find its end. */
> - data = lseek(s->fd, start, SEEK_DATA);
> - if (data == -1) {
> - data = lseek(s->fd, 0, SEEK_END);
> + *data = lseek(s->fd, start, SEEK_DATA);
> + if (*data == -1) {
> + *data = lseek(s->fd, 0, SEEK_END);
> }
Pre-existing, and unaffected by this patch, but lseek() changes the file
offset. Does that affect any other code that might do a read()/write(),
or are we safe in always using pread()/pwrite() so that we don't care
about file offset?
> + *
> + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
> + * beyond the end of the disk image it will be clamped.
> + */
> +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
> + int64_t sector_num,
> + int nb_sectors, int *pnum)
Indentation is off.
> +{
> + off_t start, data = 0, hole = 0;
> + int64_t ret;
> +
> + ret = fd_open(bs);
> + if (ret < 0) {
> + return ret;
> + }
> +
> + start = sector_num * BDRV_SECTOR_SIZE;
> +
> + ret = try_seek_hole(bs, start, &data, &hole, pnum);
Again, a tri-state return (-1 for skipped, 0 for unknown, 1 for hole or
EOF found) might make more sense. That way, you hit the fallback of
declaring the whole file as allocated only if both probes reported no
holes. Or hide the tri-state in the helper function, but map both
"skip" and "unknown" into -1 return, and only "use" into 0 return.
> + if (ret < 0) {
> + ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum);
> + if (ret < 0) {
> + /* Assume everything is allocated. */
> + data = 0;
> + hole = start + nb_sectors * BDRV_SECTOR_SIZE;
> + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
> + }
> + }
>
> if (data <= start) {
> /* On a data extent, compute sectors to the end of the extent. */
>
--
Eric Blake eblake redhat com +1-919-301-3266
Libvirt virtualization library http://libvirt.org
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 604 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-06 21:18 ` Eric Blake
@ 2014-05-06 21:35 ` Max Reitz
2014-05-06 21:47 ` Eric Blake
1 sibling, 0 replies; 10+ messages in thread
From: Max Reitz @ 2014-05-06 21:35 UTC (permalink / raw)
To: Eric Blake, qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi
On 06.05.2014 23:18, Eric Blake wrote:
> On 05/06/2014 01:00 PM, Max Reitz wrote:
>> The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if
>> FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even
>> compiled in in this case. However, there may be implementations which
>> support the latter but not the former (e.g., NFSv4.2) as well as vice
>> versa.
>>
>> To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will
>> probably be covered by POSIX soon) and if that does not work, fall back
>> to FIEMAP; and if that does not work either, treat everything as
>> allocated.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>> v2:
>> - reworked using static functions [Stefan]
>> - changed order of FIEMAP and SEEK_HOLE [Eric]
>> ---
>> block/raw-posix.c | 135 ++++++++++++++++++++++++++++++++++--------------------
>> 1 file changed, 85 insertions(+), 50 deletions(-)
>>
>> +++ b/block/raw-posix.c
>> @@ -146,6 +146,12 @@ typedef struct BDRVRawState {
>> bool has_discard:1;
>> bool has_write_zeroes:1;
>> bool discard_zeroes:1;
>> +#if defined SEEK_HOLE && defined SEEK_DATA
>> + bool skip_seek_hole;
> Remember, SEEK_HOLE support requires both support of the kernel and the
> filesystem - we may still have cases where there are filesystems that
> lack SEEK_HOLE but still have FIEMAP, even when compiled against kernel
> headers where SEEK_HOLE is defined - on those filesystems, the kernel
> guarantees SEEK_HOLE will report the entire file as allocated, even
> though FIEMAP might be able to find holes. On the other hand, the whole
> point of this is to optimize cases; where the fallback to treating the
> whole file as allocated may be slower but still gives correct behavior
> to the guest.
>
> I like how you have a per-struct state to track whether you have
> encountered a previous failure, to skip the known-failing probe the next
> time around. But I wonder if you need a tweak...
>
>> +static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
>> + off_t *hole, int *pnum)
>> +{
>> +#if defined SEEK_HOLE && defined SEEK_DATA
>> BDRVRawState *s = bs->opaque;
>>
>> - hole = lseek(s->fd, start, SEEK_HOLE);
>> - if (hole == -1) {
>> + if (s->skip_seek_hole) {
>> + return -ENOTSUP;
>> + }
>> +
>> + *hole = lseek(s->fd, start, SEEK_HOLE);
>> + if (*hole == -1) {
>> /* -ENXIO indicates that sector_num was past the end of the file.
>> * There is a virtual hole there. */
>> assert(errno != -ENXIO);
>>
>> - /* Most likely EINVAL. Assume everything is allocated. */
>> - *pnum = nb_sectors;
>> - return ret;
>> + s->skip_seek_hole = true;
>> + return -errno;
>> }
> ...if you are on a file system where SEEK_HOLE triggers the kernel
> fallback of "entire file is allocated", but where FIEMAP is wired up for
> that file system, would it make sense to have try_seek_hole return -1 in
> situations where lseek(s->fd, 0, SEEK_HOLE) returns the end of the file?
> Even more, should skip_seek_hole be a tri-state?
>
> state: "unknown" - if lseek(SEEK_HOLE) returns -1, state changes to
> "skip"; if it returns something other than EOF, state changes to "use";
> if it returns EOF, state remains "unknown" (as punching a hole or
> resizing the image may work to create a hole in what is currently a
> fully-allocated image)
>
> state: "skip" - we've had a previous lseek failure, no need to try
> seeking for holes on this file
>
> state: "use" - we've had success probing a hole, so we want to always
> trust SEEK_HOLE for this file
Hm, you're probably right. I just hope it won't get too complicated…
>>
>> - if (hole > start) {
>> - data = start;
>> + if (*hole > start) {
>> + *data = start;
>> } else {
>> /* On a hole. We need another syscall to find its end. */
>> - data = lseek(s->fd, start, SEEK_DATA);
>> - if (data == -1) {
>> - data = lseek(s->fd, 0, SEEK_END);
>> + *data = lseek(s->fd, start, SEEK_DATA);
>> + if (*data == -1) {
>> + *data = lseek(s->fd, 0, SEEK_END);
>> }
> Pre-existing, and unaffected by this patch, but lseek() changes the file
> offset. Does that affect any other code that might do a read()/write(),
> or are we safe in always using pread()/pwrite() so that we don't care
> about file offset?
I see multiple lseek(fd, 0, SEEK_END) in raw-posix.c for determining the
file size, so I guess it's fine.
>> + *
>> + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
>> + * beyond the end of the disk image it will be clamped.
>> + */
>> +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
>> + int64_t sector_num,
>> + int nb_sectors, int *pnum)
> Indentation is off.
Well, yes, this is pre-existing, but I'll fix it.
>> +{
>> + off_t start, data = 0, hole = 0;
>> + int64_t ret;
>> +
>> + ret = fd_open(bs);
>> + if (ret < 0) {
>> + return ret;
>> + }
>> +
>> + start = sector_num * BDRV_SECTOR_SIZE;
>> +
>> + ret = try_seek_hole(bs, start, &data, &hole, pnum);
> Again, a tri-state return (-1 for skipped, 0 for unknown, 1 for hole or
> EOF found) might make more sense. That way, you hit the fallback of
> declaring the whole file as allocated only if both probes reported no
> holes. Or hide the tri-state in the helper function, but map both
> "skip" and "unknown" into -1 return, and only "use" into 0 return.
I guess I'll go for the latter.
Thanks for you review,
Max
>> + if (ret < 0) {
>> + ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum);
>> + if (ret < 0) {
>> + /* Assume everything is allocated. */
>> + data = 0;
>> + hole = start + nb_sectors * BDRV_SECTOR_SIZE;
>> + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
>> + }
>> + }
>>
>> if (data <= start) {
>> /* On a data extent, compute sectors to the end of the extent. */
>>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-06 21:18 ` Eric Blake
2014-05-06 21:35 ` Max Reitz
@ 2014-05-06 21:47 ` Eric Blake
2014-05-06 21:48 ` Max Reitz
2014-05-07 5:56 ` Markus Armbruster
1 sibling, 2 replies; 10+ messages in thread
From: Eric Blake @ 2014-05-06 21:47 UTC (permalink / raw)
To: Max Reitz, qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi
[-- Attachment #1: Type: text/plain, Size: 721 bytes --]
On 05/06/2014 03:18 PM, Eric Blake wrote:
> ...if you are on a file system where SEEK_HOLE triggers the kernel
> fallback of "entire file is allocated", but where FIEMAP is wired up for
> that file system, would it make sense to have try_seek_hole return -1 in
> situations where lseek(s->fd, 0, SEEK_HOLE) returns the end of the file?
> Even more, should skip_seek_hole be a tri-state?
On the other hand, such systems are getting vanishingly rare as people
upgrade to newer kernels. Do we care about catering to them, or is it
fair game to just tell people to upgrade if they want performance?
--
Eric Blake eblake redhat com +1-919-301-3266
Libvirt virtualization library http://libvirt.org
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 604 bytes --]
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-06 21:47 ` Eric Blake
@ 2014-05-06 21:48 ` Max Reitz
2014-05-07 5:56 ` Markus Armbruster
1 sibling, 0 replies; 10+ messages in thread
From: Max Reitz @ 2014-05-06 21:48 UTC (permalink / raw)
To: Eric Blake, qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi
On 06.05.2014 23:47, Eric Blake wrote:
> On 05/06/2014 03:18 PM, Eric Blake wrote:
>
>> ...if you are on a file system where SEEK_HOLE triggers the kernel
>> fallback of "entire file is allocated", but where FIEMAP is wired up for
>> that file system, would it make sense to have try_seek_hole return -1 in
>> situations where lseek(s->fd, 0, SEEK_HOLE) returns the end of the file?
>> Even more, should skip_seek_hole be a tri-state?
> On the other hand, such systems are getting vanishingly rare as people
> upgrade to newer kernels. Do we care about catering to them, or is it
> fair game to just tell people to upgrade if they want performance?
How about I'll send v3 and you decide? ;-)
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-06 21:47 ` Eric Blake
2014-05-06 21:48 ` Max Reitz
@ 2014-05-07 5:56 ` Markus Armbruster
2014-05-08 18:35 ` Max Reitz
1 sibling, 1 reply; 10+ messages in thread
From: Markus Armbruster @ 2014-05-07 5:56 UTC (permalink / raw)
To: Eric Blake; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi, Max Reitz
Eric Blake <eblake@redhat.com> writes:
> On 05/06/2014 03:18 PM, Eric Blake wrote:
>
>> ...if you are on a file system where SEEK_HOLE triggers the kernel
>> fallback of "entire file is allocated", but where FIEMAP is wired up for
>> that file system, would it make sense to have try_seek_hole return -1 in
>> situations where lseek(s->fd, 0, SEEK_HOLE) returns the end of the file?
>> Even more, should skip_seek_hole be a tri-state?
>
> On the other hand, such systems are getting vanishingly rare as people
> upgrade to newer kernels. Do we care about catering to them, or is it
> fair game to just tell people to upgrade if they want performance?
My tolerance for code complications to speed up things under old kernels
isn't zero, but close. Leave these games to downstreams sporting such
kernels.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-07 5:56 ` Markus Armbruster
@ 2014-05-08 18:35 ` Max Reitz
0 siblings, 0 replies; 10+ messages in thread
From: Max Reitz @ 2014-05-08 18:35 UTC (permalink / raw)
To: Markus Armbruster, Eric Blake; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi
On 07.05.2014 07:56, Markus Armbruster wrote:
> Eric Blake <eblake@redhat.com> writes:
>
>> On 05/06/2014 03:18 PM, Eric Blake wrote:
>>
>>> ...if you are on a file system where SEEK_HOLE triggers the kernel
>>> fallback of "entire file is allocated", but where FIEMAP is wired up for
>>> that file system, would it make sense to have try_seek_hole return -1 in
>>> situations where lseek(s->fd, 0, SEEK_HOLE) returns the end of the file?
>>> Even more, should skip_seek_hole be a tri-state?
>> On the other hand, such systems are getting vanishingly rare as people
>> upgrade to newer kernels. Do we care about catering to them, or is it
>> fair game to just tell people to upgrade if they want performance?
> My tolerance for code complications to speed up things under old kernels
> isn't zero, but close. Leave these games to downstreams sporting such
> kernels.
As far as I see it, it isn't an optimization for older but for newer
kernels. Including Paolo's comment for v3, we could get away without
these tristate shenanigans if we'd keep the old (current) order, that
is, first try FIEMAP which will return ENOTSUP if it's not supported,
and then fall back to SEEK_HOLE which will always return some result (if
available, otherwise we can't even use it, obviously).
But as SEEK_HOLE is apparently the more "standard" way to acquire the
required information, using it first is an optimization for newer
systems (older Linux systems and/or older drivers will only provide FIEMAP).
I think there is no big performance penality in trying FIEMAP first, as
the first call to raw_co_get_block_status() will detect that it is not
available for the given file and therefore will be skipped from then on
anyway.
Considering this, I guess I'll send a v4 without any tristates and which
keeps the old order (FIEMAP first and then SEEK_HOLE).
Max
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-06 19:00 [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE Max Reitz
2014-05-06 21:18 ` Eric Blake
@ 2014-05-09 8:03 ` Paolo Bonzini
2014-05-11 17:26 ` Christoph Hellwig
2 siblings, 0 replies; 10+ messages in thread
From: Paolo Bonzini @ 2014-05-09 8:03 UTC (permalink / raw)
To: Max Reitz, qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi
Il 06/05/2014 21:00, Max Reitz ha scritto:
> The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if
> FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even
> compiled in in this case. However, there may be implementations which
> support the latter but not the former (e.g., NFSv4.2) as well as vice
> versa.
>
> To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will
> probably be covered by POSIX soon) and if that does not work, fall back
> to FIEMAP; and if that does not work either, treat everything as
> allocated.
I think it's better to test FIEMAP first; if it's implemented, FIEMAP
it's guaranteed to be accurate, and it can also be ruled out on the
first attempt if it's not implemented.
With SEEK_HOLE/SEEK_DATA, the OS could provide a dummy implementation
that returns no holes: this would mean either always falling back to
FIEMAP on fully-allocated files, or producing inaccurate results for
sparse files on filesystems that do not support SEEK_HOLE/SEEK_DATA.
Paolo
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
> v2:
> - reworked using static functions [Stefan]
> - changed order of FIEMAP and SEEK_HOLE [Eric]
> ---
> block/raw-posix.c | 135 ++++++++++++++++++++++++++++++++++--------------------
> 1 file changed, 85 insertions(+), 50 deletions(-)
>
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 3ce026d..1b3900d 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -146,6 +146,12 @@ typedef struct BDRVRawState {
> bool has_discard:1;
> bool has_write_zeroes:1;
> bool discard_zeroes:1;
> +#if defined SEEK_HOLE && defined SEEK_DATA
> + bool skip_seek_hole;
> +#endif
> +#ifdef CONFIG_FIEMAP
> + bool skip_fiemap;
> +#endif
> } BDRVRawState;
>
> typedef struct BDRVRawReopenState {
> @@ -1272,53 +1278,29 @@ static int raw_create(const char *filename, QEMUOptionParameter *options,
> return result;
> }
>
> -/*
> - * Returns true iff the specified sector is present in the disk image. Drivers
> - * not implementing the functionality are assumed to not support backing files,
> - * hence all their sectors are reported as allocated.
> - *
> - * If 'sector_num' is beyond the end of the disk image the return value is 0
> - * and 'pnum' is set to 0.
> - *
> - * 'pnum' is set to the number of sectors (including and immediately following
> - * the specified sector) that are known to be in the same
> - * allocated/unallocated state.
> - *
> - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
> - * beyond the end of the disk image it will be clamped.
> - */
> -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
> - int64_t sector_num,
> - int nb_sectors, int *pnum)
> +static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data,
> + off_t *hole, int nb_sectors, int *pnum)
> {
> - off_t start, data, hole;
> - int64_t ret;
> -
> - ret = fd_open(bs);
> - if (ret < 0) {
> - return ret;
> - }
> -
> - start = sector_num * BDRV_SECTOR_SIZE;
> - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
> -
> #ifdef CONFIG_FIEMAP
> -
> BDRVRawState *s = bs->opaque;
> + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
> struct {
> struct fiemap fm;
> struct fiemap_extent fe;
> } f;
>
> + if (s->skip_fiemap) {
> + return -ENOTSUP;
> + }
> +
> f.fm.fm_start = start;
> f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE;
> f.fm.fm_flags = 0;
> f.fm.fm_extent_count = 1;
> f.fm.fm_reserved = 0;
> if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) {
> - /* Assume everything is allocated. */
> - *pnum = nb_sectors;
> - return ret;
> + s->skip_fiemap = true;
> + return -errno;
> }
>
> if (f.fm.fm_mapped_extents == 0) {
> @@ -1326,44 +1308,97 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
> * f.fm.fm_start + f.fm.fm_length must be clamped to the file size!
> */
> off_t length = lseek(s->fd, 0, SEEK_END);
> - hole = f.fm.fm_start;
> - data = MIN(f.fm.fm_start + f.fm.fm_length, length);
> + *hole = f.fm.fm_start;
> + *data = MIN(f.fm.fm_start + f.fm.fm_length, length);
> } else {
> - data = f.fe.fe_logical;
> - hole = f.fe.fe_logical + f.fe.fe_length;
> + *data = f.fe.fe_logical;
> + *hole = f.fe.fe_logical + f.fe.fe_length;
> if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
> ret |= BDRV_BLOCK_ZERO;
> }
> }
>
> -#elif defined SEEK_HOLE && defined SEEK_DATA
> + return ret;
> +#else
> + return -ENOTSUP;
> +#endif
> +}
>
> +static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data,
> + off_t *hole, int *pnum)
> +{
> +#if defined SEEK_HOLE && defined SEEK_DATA
> BDRVRawState *s = bs->opaque;
>
> - hole = lseek(s->fd, start, SEEK_HOLE);
> - if (hole == -1) {
> + if (s->skip_seek_hole) {
> + return -ENOTSUP;
> + }
> +
> + *hole = lseek(s->fd, start, SEEK_HOLE);
> + if (*hole == -1) {
> /* -ENXIO indicates that sector_num was past the end of the file.
> * There is a virtual hole there. */
> assert(errno != -ENXIO);
>
> - /* Most likely EINVAL. Assume everything is allocated. */
> - *pnum = nb_sectors;
> - return ret;
> + s->skip_seek_hole = true;
> + return -errno;
> }
>
> - if (hole > start) {
> - data = start;
> + if (*hole > start) {
> + *data = start;
> } else {
> /* On a hole. We need another syscall to find its end. */
> - data = lseek(s->fd, start, SEEK_DATA);
> - if (data == -1) {
> - data = lseek(s->fd, 0, SEEK_END);
> + *data = lseek(s->fd, start, SEEK_DATA);
> + if (*data == -1) {
> + *data = lseek(s->fd, 0, SEEK_END);
> }
> }
> +
> + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
> #else
> - data = 0;
> - hole = start + nb_sectors * BDRV_SECTOR_SIZE;
> + return -ENOTSUP;
> #endif
> +}
> +
> +/*
> + * Returns true iff the specified sector is present in the disk image. Drivers
> + * not implementing the functionality are assumed to not support backing files,
> + * hence all their sectors are reported as allocated.
> + *
> + * If 'sector_num' is beyond the end of the disk image the return value is 0
> + * and 'pnum' is set to 0.
> + *
> + * 'pnum' is set to the number of sectors (including and immediately following
> + * the specified sector) that are known to be in the same
> + * allocated/unallocated state.
> + *
> + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes
> + * beyond the end of the disk image it will be clamped.
> + */
> +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
> + int64_t sector_num,
> + int nb_sectors, int *pnum)
> +{
> + off_t start, data = 0, hole = 0;
> + int64_t ret;
> +
> + ret = fd_open(bs);
> + if (ret < 0) {
> + return ret;
> + }
> +
> + start = sector_num * BDRV_SECTOR_SIZE;
> +
> + ret = try_seek_hole(bs, start, &data, &hole, pnum);
> + if (ret < 0) {
> + ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum);
> + if (ret < 0) {
> + /* Assume everything is allocated. */
> + data = 0;
> + hole = start + nb_sectors * BDRV_SECTOR_SIZE;
> + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
> + }
> + }
>
> if (data <= start) {
> /* On a data extent, compute sectors to the end of the extent. */
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-06 19:00 [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE Max Reitz
2014-05-06 21:18 ` Eric Blake
2014-05-09 8:03 ` Paolo Bonzini
@ 2014-05-11 17:26 ` Christoph Hellwig
2014-05-14 23:13 ` Max Reitz
2 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2014-05-11 17:26 UTC (permalink / raw)
To: Max Reitz; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi
On Tue, May 06, 2014 at 09:00:54PM +0200, Max Reitz wrote:
> The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if
> FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even
> compiled in in this case. However, there may be implementations which
> support the latter but not the former (e.g., NFSv4.2) as well as vice
> versa.
>
> To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will
> probably be covered by POSIX soon) and if that does not work, fall back
> to FIEMAP; and if that does not work either, treat everything as
> allocated.
Btw, while I think that SEEK_HOLE/SEEK_DATA generally is the better API
for qemu, the NFS 4.2 SEEK operation will be sufficient for a proper FIEMAP
implementation, and we'll implement it for the Linux NFS client.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE
2014-05-11 17:26 ` Christoph Hellwig
@ 2014-05-14 23:13 ` Max Reitz
0 siblings, 0 replies; 10+ messages in thread
From: Max Reitz @ 2014-05-14 23:13 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Kevin Wolf, qemu-devel, Stefan Hajnoczi
On 11.05.2014 19:26, Christoph Hellwig wrote:
> On Tue, May 06, 2014 at 09:00:54PM +0200, Max Reitz wrote:
>> The current version of raw-posix always uses ioctl(FS_IOC_FIEMAP) if
>> FIEMAP is available; lseek with SEEK_HOLE/SEEK_DATA are not even
>> compiled in in this case. However, there may be implementations which
>> support the latter but not the former (e.g., NFSv4.2) as well as vice
>> versa.
>>
>> To cover both cases, always try SEEK_HOLE/SEEK_DATA (as this will
>> probably be covered by POSIX soon) and if that does not work, fall back
>> to FIEMAP; and if that does not work either, treat everything as
>> allocated.
> Btw, while I think that SEEK_HOLE/SEEK_DATA generally is the better API
> for qemu, the NFS 4.2 SEEK operation will be sufficient for a proper FIEMAP
> implementation, and we'll implement it for the Linux NFS client.
Hm, great, in that case this patch will probably never be put to the
test. *g*
Max
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2014-05-14 23:13 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-05-06 19:00 [Qemu-devel] [PATCH v2] block/raw-posix: Try both FIEMAP and SEEK_HOLE Max Reitz
2014-05-06 21:18 ` Eric Blake
2014-05-06 21:35 ` Max Reitz
2014-05-06 21:47 ` Eric Blake
2014-05-06 21:48 ` Max Reitz
2014-05-07 5:56 ` Markus Armbruster
2014-05-08 18:35 ` Max Reitz
2014-05-09 8:03 ` Paolo Bonzini
2014-05-11 17:26 ` Christoph Hellwig
2014-05-14 23:13 ` Max Reitz
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).