[Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
@ 2011-06-08 14:50 Dmitry Konishchev
  2011-06-13  8:26 ` Stefan Hajnoczi
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-08 14:50 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Dmitry Konishchev

This patch optimizes 'qemu-img convert' operation for volumes which are
almost fully unallocated. Here are the results of simple tests:

We have a snapshot of a volume:
$ qemu-img info snapshot.qcow2
image: snapshot.qcow2
file format: qcow2
virtual size: 5.0G (5372805120 bytes)
disk size: 4.0G
cluster_size: 65536

Create a volume from the snapshot and use it a little:
$ qemu-img create -f qcow2 -o backing_file=snapshot.qcow2 volume.qcow2

For volumes which are almost fully allocated we have a little regression:
$ time qemu-img convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real  2m43.864s
user  0m9.257s
sys   0m40.559s
$ time qemu-img-patched convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real  2m46.899s
user  0m9.749s
sys	  0m40.471s

But now create a volume which is almost fully unallocated:
$ qemu-img create -f qcow2 -o backing_file=snapshot.qcow2 volume.qcow2 1T

And now we have more than twice decreased CPU consumption:
$ time qemu-img convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real  6m40.985s
user  4m13.832s
sys   0m33.738s
$ time qemu-img-patched convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real  4m28.448s
user  1m43.882s
sys   0m33.894s

Signed-off-by: Dmitry Konishchev <konishchev@gmail.com>
---
 qemu-img.c |  168 +++++++++++++++++++++++++++++++++++++++++++++---------------
 1 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 4f162d1..9d905ed 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -38,6 +38,8 @@ typedef struct img_cmd_t {
     int (*handler)(int argc, char **argv);
 } img_cmd_t;
 
+static const int SECTOR_SIZE = 512;
+
 /* Default to cache=writeback as data integrity is not important for qemu-tcg. */
 #define BDRV_O_FLAGS BDRV_O_CACHE_WB
 
@@ -531,7 +533,7 @@ static int is_not_zero(const uint8_t *sector, int len)
 }
 
 /*
- * Returns true iff the first sector pointed to by 'buf' contains at least
+ * Returns true if the first sector pointed to by 'buf' contains at least
  * a non-NUL byte.
  *
  * 'pnum' is set to the number of sectors (including and immediately following
@@ -590,15 +592,15 @@ static int compare_sectors(const uint8_t *buf1, const uint8_t *buf2, int n,
 
 static int img_convert(int argc, char **argv)
 {
-    int c, ret = 0, n, n1, bs_n, bs_i, compress, cluster_size, cluster_sectors;
+    int c, ret = 0, n, cur_n, bs_n, bs_i, compress, cluster_size, cluster_sectors;
     int progress = 0;
     const char *fmt, *out_fmt, *out_baseimg, *out_filename;
     BlockDriver *drv, *proto_drv;
     BlockDriverState **bs = NULL, *out_bs = NULL;
     int64_t total_sectors, nb_sectors, sector_num, bs_offset;
     uint64_t bs_sectors;
+    uint64_t *bs_geometry = NULL;
     uint8_t * buf = NULL;
-    const uint8_t *buf1;
     BlockDriverInfo bdi;
     QEMUOptionParameter *param = NULL, *create_options = NULL;
     QEMUOptionParameter *out_baseimg_param;
@@ -874,14 +876,21 @@ static int img_convert(int argc, char **argv)
         /* signal EOF to align */
         bdrv_write_compressed(out_bs, 0, NULL, 0);
     } else {
+        int backing_depth;
+        int bs_i_prev = -1;
+        float progress = 100;
+        BlockDriverState *cur_bs;
         int has_zero_init = bdrv_has_zero_init(out_bs);
 
         sector_num = 0; // total number of sectors converted so far
         nb_sectors = total_sectors - sector_num;
-        local_progress = (float)100 /
-            (nb_sectors / MIN(nb_sectors, IO_BUF_SIZE / 512));
 
         for(;;) {
+            if (total_sectors) {
+                progress = (long double) sector_num / total_sectors * 100;
+            }
+            qemu_progress_print(progress, 0);
+
             nb_sectors = total_sectors - sector_num;
             if (nb_sectors <= 0) {
                 break;
@@ -893,15 +902,38 @@ static int img_convert(int argc, char **argv)
             }
 
             while (sector_num - bs_offset >= bs_sectors) {
-                bs_i ++;
-                assert (bs_i < bs_n);
+                bs_i++;
+                assert(bs_i < bs_n);
                 bs_offset += bs_sectors;
                 bdrv_get_geometry(bs[bs_i], &bs_sectors);
+
                 /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, "
                   "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n",
                    sector_num, bs_i, bs_offset, bs_sectors); */
             }
 
+            if (bs_i != bs_i_prev) {
+                /* Getting geometry of the image and all its backing images */
+
+                backing_depth = 1;
+                cur_bs = bs[bs_i];
+                while (( cur_bs = cur_bs->backing_hd )) {
+                    backing_depth++;
+                }
+
+                bs_geometry = (uint64_t *) qemu_realloc(
+                    bs_geometry, backing_depth * sizeof(uint64_t));
+
+                backing_depth = 1;
+                cur_bs = bs[bs_i];
+                *bs_geometry = bs_sectors;
+                while (( cur_bs = cur_bs->backing_hd )) {
+                    bdrv_get_geometry(cur_bs, bs_geometry + backing_depth++);
+                }
+
+                bs_i_prev = bs_i;
+            }
+
             if (n > bs_offset + bs_sectors - sector_num) {
                 n = bs_offset + bs_sectors - sector_num;
             }
@@ -912,55 +944,109 @@ static int img_convert(int argc, char **argv)
                    are present in both the output's and input's base images (no
                    need to copy them). */
                 if (out_baseimg) {
-                    if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
-                                           n, &n1)) {
-                        sector_num += n1;
+                    if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset, n, &cur_n)) {
+                        sector_num += cur_n;
                         continue;
                     }
-                    /* The next 'n1' sectors are allocated in the input image. Copy
+                    /* The next 'cur_n' sectors are allocated in the input image. Copy
                        only those as they may be followed by unallocated sectors. */
-                    n = n1;
+                    n = cur_n;
                 }
-            } else {
-                n1 = n;
             }
 
-            ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
-            if (ret < 0) {
-                error_report("error while reading");
-                goto out;
-            }
-            /* NOTE: at the same time we convert, we do not write zero
-               sectors to have a chance to compress the image. Ideally, we
-               should add a specific call to have the info to go faster */
-            buf1 = buf;
-            while (n > 0) {
-                /* If the output image is being created as a copy on write image,
-                   copy all sectors even the ones containing only NUL bytes,
-                   because they may differ from the sectors in the base image.
-
-                   If the output is to a host device, we also write out
-                   sectors that are entirely 0, since whatever data was
-                   already there is garbage, not 0s. */
-                if (!has_zero_init || out_baseimg ||
-                    is_allocated_sectors(buf1, n, &n1)) {
-                    ret = bdrv_write(out_bs, sector_num, buf1, n1);
-                    if (ret < 0) {
-                        error_report("error while writing");
-                        goto out;
+            /* If the output image is being created as a copy on write image,
+               copy all sectors even the ones containing only zero bytes,
+               because they may differ from the sectors in the base image.
+
+               If the output is to a host device, we also write out
+               sectors that are entirely 0, since whatever data was
+               already there is garbage, not 0s. */
+            if (!has_zero_init || out_baseimg) {
+                ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
+                if (ret < 0) {
+                    error_report("error while reading");
+                    goto out;
+                }
+
+                ret = bdrv_write(out_bs, sector_num, buf, n);
+                if (ret < 0) {
+                    error_report("error while writing");
+                    goto out;
+                }
+
+                sector_num += n;
+            } else {
+                /* Look for the sectors in the image and if they are not
+                   allocated - sequentially in all its backing images.
+
+                   Write only non-zero bytes to the output image. */
+
+                uint64_t cur_sectors;
+                uint64_t bs_sector;
+                int allocated_num;
+                int sector_found;
+
+                while (n > 0) {
+                    cur_bs = bs[bs_i];
+                    bs_sector = sector_num - bs_offset;
+                    backing_depth = 0;
+                    sector_found = 0;
+
+                    do {
+                        cur_sectors = bs_geometry[backing_depth++];
+
+                        if (bs_sector >= cur_sectors) {
+                            continue;
+                        }
+
+                        if (bs_sector + n <= cur_sectors) {
+                            cur_n = n;
+                        } else {
+                            cur_n = cur_sectors - bs_sector;
+                        }
+
+                        if (bdrv_is_allocated(cur_bs, bs_sector, cur_n, &allocated_num)) {
+                            const uint8_t *cur_buf = buf;
+                            sector_found = 1;
+
+                            ret = bdrv_read(cur_bs, bs_sector, buf, allocated_num);
+                            if (ret < 0) {
+                                error_report("error while reading");
+                                goto out;
+                            }
+
+                            while (allocated_num > 0) {
+                                if (is_allocated_sectors(cur_buf, allocated_num, &cur_n)) {
+                                    ret = bdrv_write(out_bs, sector_num, cur_buf, cur_n);
+                                    if (ret < 0) {
+                                        error_report("error while writing");
+                                        goto out;
+                                    }
+                                }
+
+                                n -= cur_n;
+                                sector_num += cur_n;
+                                allocated_num -= cur_n;
+                                cur_buf += cur_n * SECTOR_SIZE;
+                            }
+
+                            break;
+                        }
+                    } while(( cur_bs = cur_bs->backing_hd ));
+
+                    if (!sector_found) {
+                        sector_num++;
+                        n--;
                     }
                 }
-                sector_num += n1;
-                n -= n1;
-                buf1 += n1 * 512;
             }
-            qemu_progress_print(local_progress, 100);
         }
     }
 out:
     qemu_progress_end();
     free_option_parameters(create_options);
     free_option_parameters(param);
+    qemu_free(bs_geometry);
     qemu_free(buf);
     if (out_bs) {
         bdrv_delete(out_bs);
-- 
1.7.4.1

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-08 14:50 [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated() Dmitry Konishchev
@ 2011-06-13  8:26 ` Stefan Hajnoczi
  2011-06-13  9:13   ` Dmitry Konishchev
  2011-06-15 10:14   ` Dmitry Konishchev
  0 siblings, 2 replies; 16+ messages in thread
From: Stefan Hajnoczi @ 2011-06-13  8:26 UTC (permalink / raw)
  To: Dmitry Konishchev; +Cc: Kevin Wolf, qemu-devel

On Wed, Jun 08, 2011 at 06:50:25PM +0400, Dmitry Konishchev wrote:
> This patch optimizes 'qemu-img convert' operation for volumes which are
> almost fully unallocated. Here are the results of simple tests:

The optimization is to check allocation metadata instead of
unconditionally reading and then checking for all zeroes?

> diff --git a/qemu-img.c b/qemu-img.c
> index 4f162d1..9d905ed 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -38,6 +38,8 @@ typedef struct img_cmd_t {
>      int (*handler)(int argc, char **argv);
>  } img_cmd_t;
>  
> +static const int SECTOR_SIZE = 512;

Why introduce a new constant instead of using BDRV_SECTOR_SIZE?

> +
>  /* Default to cache=writeback as data integrity is not important for qemu-tcg. */
>  #define BDRV_O_FLAGS BDRV_O_CACHE_WB
>  
> @@ -531,7 +533,7 @@ static int is_not_zero(const uint8_t *sector, int len)
>  }
>  
>  /*
> - * Returns true iff the first sector pointed to by 'buf' contains at least
> + * Returns true if the first sector pointed to by 'buf' contains at least

"iff" is not a typo.  It means "if and only if".

> @@ -912,55 +944,109 @@ static int img_convert(int argc, char **argv)
>                     are present in both the output's and input's base images (no
>                     need to copy them). */
>                  if (out_baseimg) {
> -                    if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
> -                                           n, &n1)) {
> -                        sector_num += n1;
> +                    if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset, n, &cur_n)) {
> +                        sector_num += cur_n;
>                          continue;
>                      }
> -                    /* The next 'n1' sectors are allocated in the input image. Copy
> +                    /* The next 'cur_n' sectors are allocated in the input image. Copy
>                         only those as they may be followed by unallocated sectors. */
> -                    n = n1;
> +                    n = cur_n;
>                  }
> -            } else {
> -                n1 = n;
>              }
>  
> -            ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
> -            if (ret < 0) {
> -                error_report("error while reading");
> -                goto out;
> -            }
> -            /* NOTE: at the same time we convert, we do not write zero
> -               sectors to have a chance to compress the image. Ideally, we
> -               should add a specific call to have the info to go faster */
> -            buf1 = buf;
> -            while (n > 0) {
> -                /* If the output image is being created as a copy on write image,
> -                   copy all sectors even the ones containing only NUL bytes,
> -                   because they may differ from the sectors in the base image.
> -
> -                   If the output is to a host device, we also write out
> -                   sectors that are entirely 0, since whatever data was
> -                   already there is garbage, not 0s. */
> -                if (!has_zero_init || out_baseimg ||
> -                    is_allocated_sectors(buf1, n, &n1)) {
> -                    ret = bdrv_write(out_bs, sector_num, buf1, n1);
> -                    if (ret < 0) {
> -                        error_report("error while writing");
> -                        goto out;
> +            /* If the output image is being created as a copy on write image,
> +               copy all sectors even the ones containing only zero bytes,
> +               because they may differ from the sectors in the base image.
> +
> +               If the output is to a host device, we also write out
> +               sectors that are entirely 0, since whatever data was
> +               already there is garbage, not 0s. */
> +            if (!has_zero_init || out_baseimg) {
> +                ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
> +                if (ret < 0) {
> +                    error_report("error while reading");
> +                    goto out;
> +                }
> +
> +                ret = bdrv_write(out_bs, sector_num, buf, n);
> +                if (ret < 0) {
> +                    error_report("error while writing");
> +                    goto out;
> +                }
> +
> +                sector_num += n;
> +            } else {
> +                /* Look for the sectors in the image and if they are not
> +                   allocated - sequentially in all its backing images.
> +
> +                   Write only non-zero bytes to the output image. */

I think the recursive is_allocated() needs its own function.  This
function is already long/complex enough :).

Stefan

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-13  8:26 ` Stefan Hajnoczi
@ 2011-06-13  9:13   ` Dmitry Konishchev
  2011-06-14  7:43     ` Dmitry Konishchev
  2011-06-15 10:14   ` Dmitry Konishchev
  1 sibling, 1 reply; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-13  9:13 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, qemu-devel

On Mon, Jun 13, 2011 at 12:26 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> The optimization is to check allocation metadata instead of
> unconditionally reading and then checking for all zeroes?
Yeah, exactly.

On Mon, Jun 13, 2011 at 12:26 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> Why introduce a new constant instead of using BDRV_SECTOR_SIZE?
OK, I'll fix this.

On Mon, Jun 13, 2011 at 12:26 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> "iff" is not a typo.  It means "if and only if".
Sorry, I don't know English so good. :) Will revert this.

On Mon, Jun 13, 2011 at 12:26 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> I think the recursive is_allocated() needs its own function.  This
> function is already long/complex enough :).
I haven't done this because in this case I have to pass too lot of
local variables to this function. Just not sure that it'll look
better. But if you mind I surely can do this.

-- 
Dmitry Konishchev
mailto:konishchev@gmail.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-13  9:13   ` Dmitry Konishchev
@ 2011-06-14  7:43     ` Dmitry Konishchev
  2011-06-14 15:58       ` Stefan Hajnoczi
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-14  7:43 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, qemu-devel

On Mon, Jun 13, 2011 at 1:13 PM, Dmitry Konishchev <konishchev@gmail.com> wrote:
> I haven't done this because in this case I have to pass too lot of
> local variables to this function. Just not sure that it'll look
> better. But if you mind I surely can do this.
Should I?

-- 
Dmitry Konishchev
mailto:konishchev@gmail.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-14  7:43     ` Dmitry Konishchev
@ 2011-06-14 15:58       ` Stefan Hajnoczi
  2011-06-15  7:38         ` Dmitry Konishchev
  0 siblings, 1 reply; 16+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 15:58 UTC (permalink / raw)
  To: Dmitry Konishchev; +Cc: Kevin Wolf, qemu-devel

On Tue, Jun 14, 2011 at 8:43 AM, Dmitry Konishchev <konishchev@gmail.com> wrote:
> On Mon, Jun 13, 2011 at 1:13 PM, Dmitry Konishchev <konishchev@gmail.com> wrote:
>> I haven't done this because in this case I have to pass too lot of
>> local variables to this function. Just not sure that it'll look
>> better. But if you mind I surely can do this.
> Should I?

Yes, please.  For image files the block layer should be caching the
device capacity (size) anyway, so you probably don't need to allocate
the array, just call bdrv_get_geometry().  That might make it easier
to write a self-contained function.

Stefan

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-14 15:58       ` Stefan Hajnoczi
@ 2011-06-15  7:38         ` Dmitry Konishchev
  2011-06-15  8:39           ` Stefan Hajnoczi
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-15  7:38 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, qemu-devel

On Tue, Jun 14, 2011 at 7:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> Yes, please.

OK, I'll do it as soon I'll find time for it.


On Tue, Jun 14, 2011 at 7:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> For image files the block layer should be caching the device capacity (size)
> anyway, so you probably don't need to allocate the array, just call
> bdrv_get_geometry().  That might make it easier to write a self-contained
> function.

I've tried to not cache, but it turned out that bdrv_get_geometry()
calls are quite noticeable in profiler without caching.

-- 
Дмитрий Конищев (Dmitry Konishchev)
mailto:konishchev@gmail.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-15  7:38         ` Dmitry Konishchev
@ 2011-06-15  8:39           ` Stefan Hajnoczi
  2011-06-15  9:50             ` Dmitry Konishchev
  0 siblings, 1 reply; 16+ messages in thread
From: Stefan Hajnoczi @ 2011-06-15  8:39 UTC (permalink / raw)
  To: Dmitry Konishchev; +Cc: Kevin Wolf, qemu-devel

On Wed, Jun 15, 2011 at 8:38 AM, Dmitry Konishchev <konishchev@gmail.com> wrote:
> On Tue, Jun 14, 2011 at 7:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> Yes, please.
>
> OK, I'll do it as soon I'll find time for it.
>
>
> On Tue, Jun 14, 2011 at 7:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> For image files the block layer should be caching the device capacity (size)
>> anyway, so you probably don't need to allocate the array, just call
>> bdrv_get_geometry().  That might make it easier to write a self-contained
>> function.
>
> I've tried to not cache, but it turned out that bdrv_get_geometry()
> calls are quite noticeable in profiler without caching.

Why is bdrv_get_geometry() slow?

Stefan

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-15  8:39           ` Stefan Hajnoczi
@ 2011-06-15  9:50             ` Dmitry Konishchev
  2011-06-15 12:02               ` Stefan Hajnoczi
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-15  9:50 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, qemu-devel

On Wed, Jun 15, 2011 at 12:39 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> Why is bdrv_get_geometry() slow?

Mmm.. Frankly, I haven't looked so deep, but it is going to be slow at
least for raw images due to using lseek().

-- 
Dmitry Konishchev
mailto:konishchev@gmail.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-15  9:50             ` Dmitry Konishchev
@ 2011-06-15 12:02               ` Stefan Hajnoczi
  2011-06-15 13:14                 ` Dmitry Konishchev
  0 siblings, 1 reply; 16+ messages in thread
From: Stefan Hajnoczi @ 2011-06-15 12:02 UTC (permalink / raw)
  To: Dmitry Konishchev; +Cc: Kevin Wolf, qemu-devel

On Wed, Jun 15, 2011 at 10:50 AM, Dmitry Konishchev
<konishchev@gmail.com> wrote:
> On Wed, Jun 15, 2011 at 12:39 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> Why is bdrv_get_geometry() slow?
>
> Mmm.. Frankly, I haven't looked so deep, but it is going to be slow at
> least for raw images due to using lseek().

We need to fully understand performance before applying optimizations
on top.  Otherwise it is possible to paper over a problem while
leaving the root cause unsolved.  Avoiding lseek(2) is very important,
not just for qemu-img but also for running VMs.  lseek(2) should only
be invoked if the image is growable/removable.

When I run a VM from a virtio-blk raw image I see no lseek(2) calls.

On the host:
strace -p $pid_of_qemu -f

Inside the guest:
dd if=/dev/vda of=/dev/null iflag=direct

I see pread(2) from the posix-aio-compat.c worker threads but no
lseek(2).  The total_sectors cached value is being used.

Does strace(1) show lseek(2) on your host?

Stefan

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-15 12:02               ` Stefan Hajnoczi
@ 2011-06-15 13:14                 ` Dmitry Konishchev
  2011-06-15 13:33                   ` Stefan Hajnoczi
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-15 13:14 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, qemu-devel

On Wed, Jun 15, 2011 at 4:02 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> We need to fully understand performance before applying optimizations
> on top.  Otherwise it is possible to paper over a problem while
> leaving the root cause unsolved.  Avoiding lseek(2) is very important,
> not just for qemu-img but also for running VMs.  lseek(2) should only
> be invoked if the image is growable/removable.
>
> When I run a VM from a virtio-blk raw image I see no lseek(2) calls.
>
> On the host:
> strace -p $pid_of_qemu -f
>
> Inside the guest:
> dd if=/dev/vda of=/dev/null iflag=direct
>
> I see pread(2) from the posix-aio-compat.c worker threads but no
> lseek(2).  The total_sectors cached value is being used.
>
> Does strace(1) show lseek(2) on your host?

No, I don't see lseek() in the strace output. But in the other hand I
see that bdrv_get_geometry() uses bdrv_getlength() which is
implemented using lseek() in block/raw-posix.c...
May be I'am mistaken about lseek(), but I get 9% slower version if
disable caching.

-- 
Дмитрий Конищев (Dmitry Konishchev)
mailto:konishchev@gmail.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-15 13:14                 ` Dmitry Konishchev
@ 2011-06-15 13:33                   ` Stefan Hajnoczi
  2011-06-15 13:37                     ` Dmitry Konishchev
  0 siblings, 1 reply; 16+ messages in thread
From: Stefan Hajnoczi @ 2011-06-15 13:33 UTC (permalink / raw)
  To: Dmitry Konishchev; +Cc: Kevin Wolf, qemu-devel

On Wed, Jun 15, 2011 at 2:14 PM, Dmitry Konishchev <konishchev@gmail.com> wrote:
> On Wed, Jun 15, 2011 at 4:02 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> We need to fully understand performance before applying optimizations
>> on top.  Otherwise it is possible to paper over a problem while
>> leaving the root cause unsolved.  Avoiding lseek(2) is very important,
>> not just for qemu-img but also for running VMs.  lseek(2) should only
>> be invoked if the image is growable/removable.
>>
>> When I run a VM from a virtio-blk raw image I see no lseek(2) calls.
>>
>> On the host:
>> strace -p $pid_of_qemu -f
>>
>> Inside the guest:
>> dd if=/dev/vda of=/dev/null iflag=direct
>>
>> I see pread(2) from the posix-aio-compat.c worker threads but no
>> lseek(2).  The total_sectors cached value is being used.
>>
>> Does strace(1) show lseek(2) on your host?
>
> No, I don't see lseek() in the strace output. But in the other hand I
> see that bdrv_get_geometry() uses bdrv_getlength() which is
> implemented using lseek() in block/raw-posix.c...
> May be I'am mistaken about lseek(), but I get 9% slower version if
> disable caching.

"disable caching"?

Stefan

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-15 13:33                   ` Stefan Hajnoczi
@ 2011-06-15 13:37                     ` Dmitry Konishchev
  2011-06-15 13:57                       ` Stefan Hajnoczi
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-15 13:37 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, qemu-devel

On Wed, Jun 15, 2011 at 5:33 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> "disable caching"?

Image geometry caching. I meant If I call bdrv_get_geometry() every
time I need image geometry instead of obtaining it from bs_geometry
variable.

-- 
Дмитрий Конищев (Dmitry Konishchev)
mailto:konishchev@gmail.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-15 13:37                     ` Dmitry Konishchev
@ 2011-06-15 13:57                       ` Stefan Hajnoczi
  2011-06-16  8:10                         ` Dmitry Konishchev
  0 siblings, 1 reply; 16+ messages in thread
From: Stefan Hajnoczi @ 2011-06-15 13:57 UTC (permalink / raw)
  To: Dmitry Konishchev; +Cc: Kevin Wolf, qemu-devel

2011/6/15 Dmitry Konishchev <konishchev@gmail.com>:
> On Wed, Jun 15, 2011 at 5:33 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> "disable caching"?
>
> Image geometry caching. I meant If I call bdrv_get_geometry() every
> time I need image geometry instead of obtaining it from bs_geometry
> variable.

Haha, sorry.  Too much caching: -drive cache=none, total_sectors
cached value, bdrv_get_geometry() cached values :).

Anyway, bdrv_getlength() will return the total_sectors value instead
of calling into raw-posix.c .bdrv_getlength().  That's why it should
be cheap.

Stefan

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-15 13:57                       ` Stefan Hajnoczi
@ 2011-06-16  8:10                         ` Dmitry Konishchev
  2011-06-17  7:25                           ` Stefan Hajnoczi
  0 siblings, 1 reply; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-16  8:10 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: Kevin Wolf, qemu-devel

On Wed, Jun 15, 2011 at 5:57 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> Anyway, bdrv_getlength() will return the total_sectors value instead
> of calling into raw-posix.c .bdrv_getlength().  That's why it should
> be cheap.

Yeah, I see it now after a closer look in the drivers code. It looks
like I get this 9% slowdown just because for my particular test
bdrv_get_geometry() is called 37,348,536 times, which is a big number
even for function which calls another function which checks a few IFs
and returns a multiplication of two numbers.

Anyway, I think that caching the geometry is a good thing to do, so I
believe that the second version of the patch is good enough.

-- 
Dmitry Konishchev
mailto:konishchev@gmail.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-16  8:10                         ` Dmitry Konishchev
@ 2011-06-17  7:25                           ` Stefan Hajnoczi
  0 siblings, 0 replies; 16+ messages in thread
From: Stefan Hajnoczi @ 2011-06-17  7:25 UTC (permalink / raw)
  To: Dmitry Konishchev; +Cc: Kevin Wolf, qemu-devel

On Thu, Jun 16, 2011 at 9:10 AM, Dmitry Konishchev <konishchev@gmail.com> wrote:
> On Wed, Jun 15, 2011 at 5:57 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> Anyway, bdrv_getlength() will return the total_sectors value instead
>> of calling into raw-posix.c .bdrv_getlength().  That's why it should
>> be cheap.
>
> Yeah, I see it now after a closer look in the drivers code. It looks
> like I get this 9% slowdown just because for my particular test
> bdrv_get_geometry() is called 37,348,536 times, which is a big number
> even for function which calls another function which checks a few IFs
> and returns a multiplication of two numbers.

Ultimately the QEMU block layer should make total_sectors always
up-to-date so it can be fetched cheaply.  This requires looking at all
the block drivers and considering when device size may change at
runtime.

Your patch improves backing file convert cases without regressing
other cases too much.

If you continue optimizing qemu-img, consider profiling the block
operations and improving the core block driver performance instead of
the outer loop of the qemu-img command.  That way running VMs will
benefit from your optimizations too.  And the simple outer loop may be
acceptable if the operation inside it is fast enough.

Kevin: Are you happy with this patch?

Stefan

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
  2011-06-13  8:26 ` Stefan Hajnoczi
  2011-06-13  9:13   ` Dmitry Konishchev
@ 2011-06-15 10:14   ` Dmitry Konishchev
  1 sibling, 0 replies; 16+ messages in thread
From: Dmitry Konishchev @ 2011-06-15 10:14 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Dmitry Konishchev

This patch optimizes 'qemu-img convert' operation for volumes which are
almost fully unallocated. Here are the results of simple tests:

We have a snapshot of a volume:
$ qemu-img info snapshot.qcow2
image: snapshot.qcow2
file format: qcow2
virtual size: 5.0G (5372805120 bytes)
disk size: 4.0G
cluster_size: 65536

Create a volume from the snapshot and use it a little:
$ qemu-img create -f qcow2 -o backing_file=snapshot.qcow2 volume.qcow2

For volumes which are almost fully allocated we have a little regression:
$ time qemu-img convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real  2m43.864s
user  0m9.257s
sys   0m40.559s
$ time qemu-img-patched convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real  2m46.899s
user  0m9.749s
sys	  0m40.471s

But now create a volume which is almost fully unallocated:
$ qemu-img create -f qcow2 -o backing_file=snapshot.qcow2 volume.qcow2 1T

And now we have more than twice decreased CPU consumption:
$ time qemu-img convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real  6m40.985s
user  4m13.832s
sys   0m33.738s
$ time qemu-img-patched convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real  4m28.448s
user  1m43.882s
sys   0m33.894s

Signed-off-by: Dmitry Konishchev <konishchev@gmail.com>
---
 qemu-img.c |  184 ++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 143 insertions(+), 41 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 4f162d1..7f3d853 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -27,6 +27,7 @@
 #include "osdep.h"
 #include "sysemu.h"
 #include "block_int.h"
+#include "block.h"
 #include <stdio.h>
 
 #ifdef _WIN32
@@ -586,19 +587,95 @@ static int compare_sectors(const uint8_t *buf1, const uint8_t *buf2, int n,
     return res;
 }
 
+/*
+ * Copies sectors from one image to another.
+ * Writes only non-zero bytes to the output image.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int copy_allocated_sectors(
+    BlockDriverState *out_bs, BlockDriverState *bs, int n,
+    int64_t sector_num, int64_t bs_offset, const uint64_t *bs_geometry, uint8_t *buf)
+{
+    BlockDriverState *cur_bs;
+    uint64_t cur_sectors;
+    uint64_t bs_sector;
+    int backing_depth;
+    int allocated_num;
+    int sector_found;
+    int cur_n;
+
+    while (n > 0) {
+        /* Look for the sectors in the source image and if they are not
+           allocated - sequentially in all its backing images. */
+
+        cur_bs = bs;
+        bs_sector = sector_num - bs_offset;
+        backing_depth = 0;
+        sector_found = 0;
+
+        do {
+            cur_sectors = bs_geometry[backing_depth++];
+
+            if (bs_sector >= cur_sectors) {
+                continue;
+            }
+
+            if (bs_sector + n <= cur_sectors) {
+                cur_n = n;
+            } else {
+                cur_n = cur_sectors - bs_sector;
+            }
+
+            if (bdrv_is_allocated(cur_bs, bs_sector, cur_n, &allocated_num)) {
+                const uint8_t *cur_buf = buf;
+                sector_found = 1;
+
+                if (bdrv_read(cur_bs, bs_sector, buf, allocated_num) < 0) {
+                    error_report("error while reading");
+                    return -1;
+                }
+
+                while (allocated_num > 0) {
+                    if (is_allocated_sectors(cur_buf, allocated_num, &cur_n)) {
+                        if (bdrv_write(out_bs, sector_num, cur_buf, cur_n) < 0) {
+                            error_report("error while writing");
+                            return -1;
+                        }
+                    }
+
+                    n -= cur_n;
+                    sector_num += cur_n;
+                    allocated_num -= cur_n;
+                    cur_buf += cur_n * BDRV_SECTOR_SIZE;
+                }
+
+                break;
+            }
+        } while(( cur_bs = cur_bs->backing_hd ));
+
+        if (!sector_found) {
+            sector_num++;
+            n--;
+        }
+    }
+
+    return 0;
+}
+
 #define IO_BUF_SIZE (2 * 1024 * 1024)
 
 static int img_convert(int argc, char **argv)
 {
-    int c, ret = 0, n, n1, bs_n, bs_i, compress, cluster_size, cluster_sectors;
+    int c, ret = 0, n, cur_n, bs_n, bs_i, compress, cluster_size, cluster_sectors;
     int progress = 0;
     const char *fmt, *out_fmt, *out_baseimg, *out_filename;
     BlockDriver *drv, *proto_drv;
     BlockDriverState **bs = NULL, *out_bs = NULL;
     int64_t total_sectors, nb_sectors, sector_num, bs_offset;
     uint64_t bs_sectors;
+    uint64_t *bs_geometry = NULL;
     uint8_t * buf = NULL;
-    const uint8_t *buf1;
     BlockDriverInfo bdi;
     QEMUOptionParameter *param = NULL, *create_options = NULL;
     QEMUOptionParameter *out_baseimg_param;
@@ -874,14 +951,20 @@ static int img_convert(int argc, char **argv)
         /* signal EOF to align */
         bdrv_write_compressed(out_bs, 0, NULL, 0);
     } else {
+        int bs_i_prev = -1;
+        float progress = 100;
+        BlockDriverState *cur_bs;
         int has_zero_init = bdrv_has_zero_init(out_bs);
 
         sector_num = 0; // total number of sectors converted so far
         nb_sectors = total_sectors - sector_num;
-        local_progress = (float)100 /
-            (nb_sectors / MIN(nb_sectors, IO_BUF_SIZE / 512));
 
         for(;;) {
+            if (total_sectors) {
+                progress = (long double) sector_num / total_sectors * 100;
+            }
+            qemu_progress_print(progress, 0);
+
             nb_sectors = total_sectors - sector_num;
             if (nb_sectors <= 0) {
                 break;
@@ -893,15 +976,38 @@ static int img_convert(int argc, char **argv)
             }
 
             while (sector_num - bs_offset >= bs_sectors) {
-                bs_i ++;
-                assert (bs_i < bs_n);
+                bs_i++;
+                assert(bs_i < bs_n);
                 bs_offset += bs_sectors;
                 bdrv_get_geometry(bs[bs_i], &bs_sectors);
+
                 /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, "
                   "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n",
                    sector_num, bs_i, bs_offset, bs_sectors); */
             }
 
+            if (bs_i != bs_i_prev) {
+                /* Getting geometry of the image and all its backing images */
+
+                int backing_depth = 1;
+                cur_bs = bs[bs_i];
+                while (( cur_bs = cur_bs->backing_hd )) {
+                    backing_depth++;
+                }
+
+                bs_geometry = (uint64_t *) qemu_realloc(
+                    bs_geometry, backing_depth * sizeof(uint64_t));
+
+                backing_depth = 1;
+                cur_bs = bs[bs_i];
+                *bs_geometry = bs_sectors;
+                while (( cur_bs = cur_bs->backing_hd )) {
+                    bdrv_get_geometry(cur_bs, bs_geometry + backing_depth++);
+                }
+
+                bs_i_prev = bs_i;
+            }
+
             if (n > bs_offset + bs_sectors - sector_num) {
                 n = bs_offset + bs_sectors - sector_num;
             }
@@ -912,55 +1018,51 @@ static int img_convert(int argc, char **argv)
                    are present in both the output's and input's base images (no
                    need to copy them). */
                 if (out_baseimg) {
-                    if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
-                                           n, &n1)) {
-                        sector_num += n1;
+                    if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset, n, &cur_n)) {
+                        sector_num += cur_n;
                         continue;
                     }
-                    /* The next 'n1' sectors are allocated in the input image. Copy
+                    /* The next 'cur_n' sectors are allocated in the input image. Copy
                        only those as they may be followed by unallocated sectors. */
-                    n = n1;
+                    n = cur_n;
                 }
-            } else {
-                n1 = n;
             }
 
-            ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
-            if (ret < 0) {
-                error_report("error while reading");
-                goto out;
-            }
-            /* NOTE: at the same time we convert, we do not write zero
-               sectors to have a chance to compress the image. Ideally, we
-               should add a specific call to have the info to go faster */
-            buf1 = buf;
-            while (n > 0) {
-                /* If the output image is being created as a copy on write image,
-                   copy all sectors even the ones containing only NUL bytes,
-                   because they may differ from the sectors in the base image.
-
-                   If the output is to a host device, we also write out
-                   sectors that are entirely 0, since whatever data was
-                   already there is garbage, not 0s. */
-                if (!has_zero_init || out_baseimg ||
-                    is_allocated_sectors(buf1, n, &n1)) {
-                    ret = bdrv_write(out_bs, sector_num, buf1, n1);
-                    if (ret < 0) {
-                        error_report("error while writing");
-                        goto out;
-                    }
+            /* If the output image is being created as a copy on write image,
+               copy all sectors even the ones containing only zero bytes,
+               because they may differ from the sectors in the base image.
+
+               If the output is to a host device, we also write out
+               sectors that are entirely 0, since whatever data was
+               already there is garbage, not 0s. */
+            if (!has_zero_init || out_baseimg) {
+                ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
+                if (ret < 0) {
+                    error_report("error while reading");
+                    goto out;
+                }
+
+                ret = bdrv_write(out_bs, sector_num, buf, n);
+                if (ret < 0) {
+                    error_report("error while writing");
+                    goto out;
+                }
+            } else {
+                ret = copy_allocated_sectors(out_bs, bs[bs_i], n,
+                    sector_num, bs_offset, bs_geometry, buf);
+                if (ret < 0) {
+                    goto out;
                 }
-                sector_num += n1;
-                n -= n1;
-                buf1 += n1 * 512;
             }
-            qemu_progress_print(local_progress, 100);
+
+            sector_num += n;
         }
     }
 out:
     qemu_progress_end();
     free_option_parameters(create_options);
     free_option_parameters(param);
+    qemu_free(bs_geometry);
     qemu_free(buf);
     if (out_bs) {
         bdrv_delete(out_bs);
-- 
1.7.4.1

^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2011-06-17  7:25 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-06-08 14:50 [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated() Dmitry Konishchev
2011-06-13  8:26 ` Stefan Hajnoczi
2011-06-13  9:13   ` Dmitry Konishchev
2011-06-14  7:43     ` Dmitry Konishchev
2011-06-14 15:58       ` Stefan Hajnoczi
2011-06-15  7:38         ` Dmitry Konishchev
2011-06-15  8:39           ` Stefan Hajnoczi
2011-06-15  9:50             ` Dmitry Konishchev
2011-06-15 12:02               ` Stefan Hajnoczi
2011-06-15 13:14                 ` Dmitry Konishchev
2011-06-15 13:33                   ` Stefan Hajnoczi
2011-06-15 13:37                     ` Dmitry Konishchev
2011-06-15 13:57                       ` Stefan Hajnoczi
2011-06-16  8:10                         ` Dmitry Konishchev
2011-06-17  7:25                           ` Stefan Hajnoczi
2011-06-15 10:14   ` Dmitry Konishchev

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).