From: Dmitry Konishchev <konishchev@gmail.com>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>, Dmitry Konishchev <konishchev@gmail.com>
Subject: [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated()
Date: Wed, 8 Jun 2011 18:50:25 +0400 [thread overview]
Message-ID: <1307544625-22907-1-git-send-email-konishchev@gmail.com> (raw)
This patch optimizes the 'qemu-img convert' operation for volumes that are
almost fully unallocated. Here are the results of some simple tests:
We have a snapshot of a volume:
$ qemu-img info snapshot.qcow2
image: snapshot.qcow2
file format: qcow2
virtual size: 5.0G (5372805120 bytes)
disk size: 4.0G
cluster_size: 65536
Create a volume from the snapshot and use it a little:
$ qemu-img create -f qcow2 -o backing_file=snapshot.qcow2 volume.qcow2
For volumes that are almost fully allocated there is a slight regression:
$ time qemu-img convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real 2m43.864s
user 0m9.257s
sys 0m40.559s
$ time qemu-img-patched convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real 2m46.899s
user 0m9.749s
sys 0m40.471s
Now create a volume that is almost fully unallocated:
$ qemu-img create -f qcow2 -o backing_file=snapshot.qcow2 volume.qcow2 1T
In this case user CPU time is reduced by more than half (4m13s -> 1m43s):
$ time qemu-img convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real 6m40.985s
user 4m13.832s
sys 0m33.738s
$ time qemu-img-patched convert -O qcow2 volume.qcow2 volume_snapshot.qcow2
real 4m28.448s
user 1m43.882s
sys 0m33.894s
Signed-off-by: Dmitry Konishchev <konishchev@gmail.com>
---
qemu-img.c | 168 +++++++++++++++++++++++++++++++++++++++++++++---------------
1 files changed, 127 insertions(+), 41 deletions(-)
diff --git a/qemu-img.c b/qemu-img.c
index 4f162d1..9d905ed 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -38,6 +38,8 @@ typedef struct img_cmd_t {
int (*handler)(int argc, char **argv);
} img_cmd_t;
+static const int SECTOR_SIZE = 512;
+
/* Default to cache=writeback as data integrity is not important for qemu-tcg. */
#define BDRV_O_FLAGS BDRV_O_CACHE_WB
@@ -531,7 +533,7 @@ static int is_not_zero(const uint8_t *sector, int len)
}
/*
- * Returns true iff the first sector pointed to by 'buf' contains at least
+ * Returns true if the first sector pointed to by 'buf' contains at least
* a non-NUL byte.
*
* 'pnum' is set to the number of sectors (including and immediately following
@@ -590,15 +592,15 @@ static int compare_sectors(const uint8_t *buf1, const uint8_t *buf2, int n,
static int img_convert(int argc, char **argv)
{
- int c, ret = 0, n, n1, bs_n, bs_i, compress, cluster_size, cluster_sectors;
+ int c, ret = 0, n, cur_n, bs_n, bs_i, compress, cluster_size, cluster_sectors;
int progress = 0;
const char *fmt, *out_fmt, *out_baseimg, *out_filename;
BlockDriver *drv, *proto_drv;
BlockDriverState **bs = NULL, *out_bs = NULL;
int64_t total_sectors, nb_sectors, sector_num, bs_offset;
uint64_t bs_sectors;
+ uint64_t *bs_geometry = NULL;
uint8_t * buf = NULL;
- const uint8_t *buf1;
BlockDriverInfo bdi;
QEMUOptionParameter *param = NULL, *create_options = NULL;
QEMUOptionParameter *out_baseimg_param;
@@ -874,14 +876,21 @@ static int img_convert(int argc, char **argv)
/* signal EOF to align */
bdrv_write_compressed(out_bs, 0, NULL, 0);
} else {
+ int backing_depth;
+ int bs_i_prev = -1;
+ float progress = 100;
+ BlockDriverState *cur_bs;
int has_zero_init = bdrv_has_zero_init(out_bs);
sector_num = 0; // total number of sectors converted so far
nb_sectors = total_sectors - sector_num;
- local_progress = (float)100 /
- (nb_sectors / MIN(nb_sectors, IO_BUF_SIZE / 512));
for(;;) {
+ if (total_sectors) {
+ progress = (long double) sector_num / total_sectors * 100;
+ }
+ qemu_progress_print(progress, 0);
+
nb_sectors = total_sectors - sector_num;
if (nb_sectors <= 0) {
break;
@@ -893,15 +902,38 @@ static int img_convert(int argc, char **argv)
}
while (sector_num - bs_offset >= bs_sectors) {
- bs_i ++;
- assert (bs_i < bs_n);
+ bs_i++;
+ assert(bs_i < bs_n);
bs_offset += bs_sectors;
bdrv_get_geometry(bs[bs_i], &bs_sectors);
+
/* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, "
"bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n",
sector_num, bs_i, bs_offset, bs_sectors); */
}
+ if (bs_i != bs_i_prev) {
+ /* Getting geometry of the image and all its backing images */
+
+ backing_depth = 1;
+ cur_bs = bs[bs_i];
+ while (( cur_bs = cur_bs->backing_hd )) {
+ backing_depth++;
+ }
+
+ bs_geometry = (uint64_t *) qemu_realloc(
+ bs_geometry, backing_depth * sizeof(uint64_t));
+
+ backing_depth = 1;
+ cur_bs = bs[bs_i];
+ *bs_geometry = bs_sectors;
+ while (( cur_bs = cur_bs->backing_hd )) {
+ bdrv_get_geometry(cur_bs, bs_geometry + backing_depth++);
+ }
+
+ bs_i_prev = bs_i;
+ }
+
if (n > bs_offset + bs_sectors - sector_num) {
n = bs_offset + bs_sectors - sector_num;
}
@@ -912,55 +944,109 @@ static int img_convert(int argc, char **argv)
are present in both the output's and input's base images (no
need to copy them). */
if (out_baseimg) {
- if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
- n, &n1)) {
- sector_num += n1;
+ if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset, n, &cur_n)) {
+ sector_num += cur_n;
continue;
}
- /* The next 'n1' sectors are allocated in the input image. Copy
+ /* The next 'cur_n' sectors are allocated in the input image. Copy
only those as they may be followed by unallocated sectors. */
- n = n1;
+ n = cur_n;
}
- } else {
- n1 = n;
}
- ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
- if (ret < 0) {
- error_report("error while reading");
- goto out;
- }
- /* NOTE: at the same time we convert, we do not write zero
- sectors to have a chance to compress the image. Ideally, we
- should add a specific call to have the info to go faster */
- buf1 = buf;
- while (n > 0) {
- /* If the output image is being created as a copy on write image,
- copy all sectors even the ones containing only NUL bytes,
- because they may differ from the sectors in the base image.
-
- If the output is to a host device, we also write out
- sectors that are entirely 0, since whatever data was
- already there is garbage, not 0s. */
- if (!has_zero_init || out_baseimg ||
- is_allocated_sectors(buf1, n, &n1)) {
- ret = bdrv_write(out_bs, sector_num, buf1, n1);
- if (ret < 0) {
- error_report("error while writing");
- goto out;
+ /* If the output image is being created as a copy on write image,
+ copy all sectors even the ones containing only zero bytes,
+ because they may differ from the sectors in the base image.
+
+ If the output is to a host device, we also write out
+ sectors that are entirely 0, since whatever data was
+ already there is garbage, not 0s. */
+ if (!has_zero_init || out_baseimg) {
+ ret = bdrv_read(bs[bs_i], sector_num - bs_offset, buf, n);
+ if (ret < 0) {
+ error_report("error while reading");
+ goto out;
+ }
+
+ ret = bdrv_write(out_bs, sector_num, buf, n);
+ if (ret < 0) {
+ error_report("error while writing");
+ goto out;
+ }
+
+ sector_num += n;
+ } else {
+ /* Look for the sectors in the image and if they are not
+ allocated - sequentially in all its backing images.
+
+ Write only non-zero bytes to the output image. */
+
+ uint64_t cur_sectors;
+ uint64_t bs_sector;
+ int allocated_num;
+ int sector_found;
+
+ while (n > 0) {
+ cur_bs = bs[bs_i];
+ bs_sector = sector_num - bs_offset;
+ backing_depth = 0;
+ sector_found = 0;
+
+ do {
+ cur_sectors = bs_geometry[backing_depth++];
+
+ if (bs_sector >= cur_sectors) {
+ continue;
+ }
+
+ if (bs_sector + n <= cur_sectors) {
+ cur_n = n;
+ } else {
+ cur_n = cur_sectors - bs_sector;
+ }
+
+ if (bdrv_is_allocated(cur_bs, bs_sector, cur_n, &allocated_num)) {
+ const uint8_t *cur_buf = buf;
+ sector_found = 1;
+
+ ret = bdrv_read(cur_bs, bs_sector, buf, allocated_num);
+ if (ret < 0) {
+ error_report("error while reading");
+ goto out;
+ }
+
+ while (allocated_num > 0) {
+ if (is_allocated_sectors(cur_buf, allocated_num, &cur_n)) {
+ ret = bdrv_write(out_bs, sector_num, cur_buf, cur_n);
+ if (ret < 0) {
+ error_report("error while writing");
+ goto out;
+ }
+ }
+
+ n -= cur_n;
+ sector_num += cur_n;
+ allocated_num -= cur_n;
+ cur_buf += cur_n * SECTOR_SIZE;
+ }
+
+ break;
+ }
+ } while(( cur_bs = cur_bs->backing_hd ));
+
+ if (!sector_found) {
+ sector_num++;
+ n--;
}
}
- sector_num += n1;
- n -= n1;
- buf1 += n1 * 512;
}
- qemu_progress_print(local_progress, 100);
}
}
out:
qemu_progress_end();
free_option_parameters(create_options);
free_option_parameters(param);
+ qemu_free(bs_geometry);
qemu_free(buf);
if (out_bs) {
bdrv_delete(out_bs);
--
1.7.4.1
next reply other threads:[~2011-06-08 14:51 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-06-08 14:50 Dmitry Konishchev [this message]
2011-06-13 8:26 ` [Qemu-devel] [PATCH] CPU consumption optimization of 'qemu-img convert' using bdrv_is_allocated() Stefan Hajnoczi
2011-06-13 9:13 ` Dmitry Konishchev
2011-06-14 7:43 ` Dmitry Konishchev
2011-06-14 15:58 ` Stefan Hajnoczi
2011-06-15 7:38 ` Dmitry Konishchev
2011-06-15 8:39 ` Stefan Hajnoczi
2011-06-15 9:50 ` Dmitry Konishchev
2011-06-15 12:02 ` Stefan Hajnoczi
2011-06-15 13:14 ` Dmitry Konishchev
2011-06-15 13:33 ` Stefan Hajnoczi
2011-06-15 13:37 ` Dmitry Konishchev
2011-06-15 13:57 ` Stefan Hajnoczi
2011-06-16 8:10 ` Dmitry Konishchev
2011-06-17 7:25 ` Stefan Hajnoczi
2011-06-15 10:14 ` Dmitry Konishchev
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1307544625-22907-1-git-send-email-konishchev@gmail.com \
--to=konishchev@gmail.com \
--cc=kwolf@redhat.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).