qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: "Denis V. Lunev" <den@openvz.org>
To: qemu-block@nongnu.org, qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>, Fam Zheng <fam@euphon.net>,
	Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>,
	Juan Quintela <quintela@redhat.com>,
	"Dr. David Alan Gilbert" <dgilbert@redhat.com>,
	Max Reitz <mreitz@redhat.com>,
	Denis Plotnikov <dplotnikov@virtuozzo.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	"Denis V. Lunev" <den@openvz.org>
Subject: [PATCH 4/4] block/io: improve savevm performance
Date: Thu, 11 Jun 2020 20:11:43 +0300	[thread overview]
Message-ID: <20200611171143.21589-5-den@openvz.org> (raw)
In-Reply-To: <20200611171143.21589-1-den@openvz.org>

This patch does 2 standard basic things:
- it creates intermediate buffer for all writes from QEMU migration code
  to block driver,
- this buffer is sent to disk asynchronously, allowing several writes to
  run in parallel.

Thus bdrv_vmstate_write() is becoming asynchronous. All pending operations
completion are performed in newly invented bdrv_flush_vmstate().

In general, migration code is fantastically inefficent (by observation),
buffers are not aligned and sent with arbitrary pieces, a lot of time
less than 100 bytes at a chunk, which results in read-modify-write
operations if target file descriptor is opened with O_DIRECT. It should
also be noted that all operations are performed into unallocated image
blocks, which also suffer due to partial writes to such new clusters
even on cached file descriptors.

Snapshot creation time (2 GB Fedora-31 VM running over NVME storage):
                original     fixed
cached:          1.79s       1.27s
non-cached:      3.29s       0.81s

The difference over HDD would be more significant :)

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Max Reitz <mreitz@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Fam Zheng <fam@euphon.net>
CC: Juan Quintela <quintela@redhat.com>
CC: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
CC: Denis Plotnikov <dplotnikov@virtuozzo.com>
---
 block/io.c                | 121 +++++++++++++++++++++++++++++++++++++-
 include/block/block_int.h |   8 +++
 2 files changed, 127 insertions(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index fbf352f39d..698f1eef76 100644
--- a/block/io.c
+++ b/block/io.c
@@ -26,6 +26,7 @@
 #include "trace.h"
 #include "sysemu/block-backend.h"
 #include "block/aio-wait.h"
+#include "block/aio_task.h"
 #include "block/blockjob.h"
 #include "block/blockjob_int.h"
 #include "block/block_int.h"
@@ -2633,6 +2634,102 @@ typedef struct BdrvVmstateCo {
     int                 ret;
 } BdrvVmstateCo;
 
+typedef struct BdrvVMStateTask {
+    AioTask task;
+
+    BlockDriverState *bs;
+    int64_t offset;
+    void *buf;
+    size_t bytes;
+} BdrvVMStateTask;
+
+typedef struct BdrvSaveVMState {
+    AioTaskPool *pool;
+    BdrvVMStateTask *t;
+} BdrvSaveVMState;
+
+
+static coroutine_fn int bdrv_co_vmstate_save_task_entry(AioTask *task)
+{
+    int err = 0;
+    BdrvVMStateTask *t = container_of(task, BdrvVMStateTask, task);
+
+    if (t->bytes != 0) {
+        QEMUIOVector local_qiov;
+        qemu_iovec_init_buf(&local_qiov, t->buf, t->bytes);
+
+        bdrv_inc_in_flight(t->bs);
+        err = t->bs->drv->bdrv_save_vmstate(t->bs, &local_qiov, t->offset);
+        bdrv_dec_in_flight(t->bs);
+    }
+
+    qemu_vfree(t->buf);
+    return err;
+}
+
+static BdrvVMStateTask *bdrv_vmstate_task_create(BlockDriverState *bs,
+                                                 int64_t pos, size_t size)
+{
+    BdrvVMStateTask *t = g_new(BdrvVMStateTask, 1);
+
+    *t = (BdrvVMStateTask) {
+        .task.func = bdrv_co_vmstate_save_task_entry,
+        .buf = qemu_blockalign(bs, size),
+        .offset = pos,
+        .bs = bs,
+    };
+
+    return t;
+}
+
+static int bdrv_co_do_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
+                                   int64_t pos)
+{
+    BdrvSaveVMState *state = bs->savevm_state;
+    BdrvVMStateTask *t;
+    size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB);
+    size_t to_copy;
+    size_t off;
+
+    if (state == NULL) {
+        state = g_new(BdrvSaveVMState, 1);
+        *state = (BdrvSaveVMState) {
+            .pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX),
+            .t = bdrv_vmstate_task_create(bs, pos, buf_size),
+        };
+
+        bs->savevm_state = state;
+    }
+
+    if (aio_task_pool_status(state->pool) < 0) {
+        return aio_task_pool_status(state->pool);
+    }
+
+    t = state->t;
+    if (t->offset + t->bytes != pos) {
+        /* Normally this branch is not reachable from migration */
+        return bs->drv->bdrv_save_vmstate(bs, qiov, pos);
+    }
+
+    off = 0;
+    while (1) {
+        to_copy = MIN(qiov->size - off, buf_size - t->bytes);
+        qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy);
+        t->bytes += to_copy;
+        if (t->bytes < buf_size) {
+            return qiov->size;
+        }
+
+        aio_task_pool_start_task(state->pool, &t->task);
+
+        pos += to_copy;
+        off += to_copy;
+        state->t = t = bdrv_vmstate_task_create(bs, pos, buf_size);
+    }
+
+    return qiov->size;
+}
+
 static int coroutine_fn
 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
                    bool is_read)
@@ -2648,7 +2745,7 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
         if (is_read) {
             ret = drv->bdrv_load_vmstate(bs, qiov, pos);
         } else {
-            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
+            ret = bdrv_co_do_save_vmstate(bs, qiov, pos);
         }
     } else if (bs->file) {
         ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
@@ -2733,7 +2830,27 @@ typedef struct FlushVMStateCo {
 
 static int coroutine_fn bdrv_co_flush_vmstate(BlockDriverState *bs)
 {
-    return 0;
+    int err;
+    BdrvSaveVMState *state = bs->savevm_state;
+
+    if (bs->drv->bdrv_save_vmstate == NULL && bs->file != NULL) {
+        return bdrv_co_flush_vmstate(bs->file->bs);
+    }
+    if (state == NULL) {
+        return 0;
+    }
+
+    aio_task_pool_start_task(state->pool, &state->t->task);
+
+    aio_task_pool_wait_all(state->pool);
+    err = aio_task_pool_status(state->pool);
+
+    aio_task_pool_free(state->pool);
+    g_free(state);
+
+    bs->savevm_state = NULL;
+
+    return err;
 }
 
 static void coroutine_fn bdrv_flush_vmstate_co_entry(void *opaque)
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 791de6a59c..f90f0e8b6a 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -61,6 +61,8 @@
 
 #define BLOCK_PROBE_BUF_SIZE        512
 
+#define BDRV_VMSTATE_WORKERS_MAX    8
+
 enum BdrvTrackedRequestType {
     BDRV_TRACKED_READ,
     BDRV_TRACKED_WRITE,
@@ -784,6 +786,9 @@ struct BdrvChild {
     QLIST_ENTRY(BdrvChild) next_parent;
 };
 
+
+typedef struct BdrvSaveVMState BdrvSaveVMState;
+
 /*
  * Note: the function bdrv_append() copies and swaps contents of
  * BlockDriverStates, so if you add new fields to this struct, please
@@ -947,6 +952,9 @@ struct BlockDriverState {
 
     /* BdrvChild links to this node may never be frozen */
     bool never_freeze;
+
+    /* Intermediate buffer for VM state saving from snapshot creation code */
+    BdrvSaveVMState *savevm_state;
 };
 
 struct BlockBackendRootState {
-- 
2.17.1



  parent reply	other threads:[~2020-06-11 17:13 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-06-11 17:11 [PATCH v3 0/4] block: seriously improve savevm performance Denis V. Lunev
2020-06-11 17:11 ` [PATCH 1/4] migration/savevm: respect qemu_fclose() error code in save_snapshot() Denis V. Lunev
2020-06-15  7:39   ` Vladimir Sementsov-Ogievskiy
2020-06-15 12:03   ` Dr. David Alan Gilbert
2020-06-11 17:11 ` [PATCH 2/4] block/aio_task: allow start/wait task from any coroutine Denis V. Lunev
2020-06-15  7:47   ` Vladimir Sementsov-Ogievskiy
2020-06-15  9:34     ` Vladimir Sementsov-Ogievskiy
2020-06-16 14:49       ` Denis V. Lunev
2020-06-11 17:11 ` [PATCH 3/4] block, migration: add bdrv_flush_vmstate helper Denis V. Lunev
2020-06-15  7:56   ` Vladimir Sementsov-Ogievskiy
2020-06-11 17:11 ` Denis V. Lunev [this message]
2020-06-15  9:25   ` [PATCH 4/4] block/io: improve savevm performance Vladimir Sementsov-Ogievskiy
2020-06-15 12:17 ` [PATCH v3 0/4] block: seriously " Dr. David Alan Gilbert
2020-06-15 12:36   ` Denis V. Lunev
2020-06-15 12:49     ` Dr. David Alan Gilbert

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200611171143.21589-5-den@openvz.org \
    --to=den@openvz.org \
    --cc=dgilbert@redhat.com \
    --cc=dplotnikov@virtuozzo.com \
    --cc=fam@euphon.net \
    --cc=kwolf@redhat.com \
    --cc=mreitz@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    --cc=quintela@redhat.com \
    --cc=stefanha@redhat.com \
    --cc=vsementsov@virtuozzo.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).