qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Wen Congyang <wency@cn.fujitsu.com>
To: qemu devel <qemu-devel@nongnu.org>, Kevin Wolf <kwolf@redhat.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Paolo Bonzini <pbonzini@redhat.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>,
	Jiang Yunhong <yunhong.jiang@intel.com>,
	Dong Eddie <eddie.dong@intel.com>,
	"Dr. David Alan Gilbert" <dgilbert@redhat.com>,
	Gonglei <arei.gonglei@huawei.com>,
	Yang Hongyang <yanghy@cn.fujitsu.com>,
	zhanghailiang <zhang.zhanghailiang@huawei.com>
Subject: [Qemu-devel] [RFC PATCH 14/14] COLO: implement a new block driver
Date: Thu, 12 Feb 2015 11:07:18 +0800	[thread overview]
Message-ID: <1423710438-14377-15-git-send-email-wency@cn.fujitsu.com> (raw)
In-Reply-To: <1423710438-14377-1-git-send-email-wency@cn.fujitsu.com>

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Gonglei <arei.gonglei@huawei.com>
---
 block/Makefile.objs |   2 +-
 block/blkcolo.c     | 409 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 410 insertions(+), 1 deletion(-)
 create mode 100644 block/blkcolo.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 1b7b458..021e891 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -21,7 +21,7 @@ block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o
 block-obj-y += write-threshold.o
-block-obj-y += blkcolo-buffer.o
+block-obj-y += blkcolo-buffer.o blkcolo.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
diff --git a/block/blkcolo.c b/block/blkcolo.c
new file mode 100644
index 0000000..2f73486
--- /dev/null
+++ b/block/blkcolo.c
@@ -0,0 +1,409 @@
+/*
+ * Block driver for block replication
+ *
+ * Copyright Fujitsu, Corp. 2015
+ * Copyright (c) 2015 Intel Corporation
+ * Copyright (c) 2015 HUAWEI TECHNOLOGIES CO.,LTD.
+ *
+ * Authors:
+ *     Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "block/block_int.h"
+#include "sysemu/block-backend.h"
+#include "block/blkcolo.h"
+#include "block/nbd.h"
+
+#define COLO_OPT_EXPORT         "export"
+
+#define COLO_CLUSTER_BITS 16
+#define COLO_CLUSTER_SIZE (1 << COLO_CLUSTER_BITS)
+#define COLO_SECTORS_PER_CLUSTER (COLO_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
+
+typedef struct BDRVBlkcoloState BDRVBlkcoloState;
+
+struct BDRVBlkcoloState {
+    BlockDriverState *bs;
+    char *export_name;
+    int mode;
+    disk_buffer disk_buffer;
+    NotifierWithReturn before_write;
+    NBDExport *exp;
+    CowJob cow_job;
+    bool error;
+};
+
+static void colo_svm_init(BDRVBlkcoloState *s);
+static void colo_svm_fini(BDRVBlkcoloState *s);
+
+static int switch_mode(BDRVBlkcoloState *s, int new_mode)
+{
+    if (s->mode == new_mode) {
+        return 0;
+    }
+
+    if (s->mode == COLO_SECONDARY_MODE) {
+        colo_svm_fini(s);
+    }
+
+    s->mode = new_mode;
+    if (s->mode == COLO_SECONDARY_MODE) {
+        colo_svm_init(s);
+    }
+
+    return 0;
+}
+
+/*
+ * Secondary mode functions
+ *
+ * All write requests are forwarded to secondary QEMU from primary QEMU.
+ * The secondary QEMU should do the following things:
+ * 1. Use NBD server to receive and handle the forwarded write requests
+ * 2. Buffer the secondary write requests
+ */
+
+static int coroutine_fn
+colo_svm_co_writev(BlockDriverState *bs, int64_t sector_num,
+                   int nb_sectors, QEMUIOVector *qiov)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+
+    /*
+     * Write the request to the disk buffer. How to limit the
+     * write speed?
+     */
+    qiov_write_to_buffer(&s->disk_buffer, qiov, sector_num, nb_sectors, true);
+
+    return 0;
+}
+
+static int coroutine_fn
+colo_svm_co_readv(BlockDriverState *bs, int64_t sector_num,
+                  int nb_sectors, QEMUIOVector *qiov)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+    int ret;
+
+    /*
+     * Read the sector content from secondary disk first. If the sector
+     * content is buffered, use the buffered content.
+     */
+    ret = bdrv_co_readv(bs->backing_hd, sector_num, nb_sectors, qiov);
+    if (ret) {
+        return ret;
+    }
+
+    /* Read from the buffer */
+    qiov_read_from_buffer(&s->disk_buffer, qiov, sector_num, nb_sectors);
+    return 0;
+}
+
+static int coroutine_fn
+colo_do_cow(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+{
+    BDRVBlkcoloState *s = bs->origin_file->opaque;
+    CowRequest cow_request;
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    void *bounce_buffer = NULL;
+    int ret = 0;
+    int64_t start, end;
+    int n;
+
+    start = sector_num / COLO_SECTORS_PER_CLUSTER;
+    end = DIV_ROUND_UP(sector_num + nb_sectors, COLO_SECTORS_PER_CLUSTER);
+
+    wait_for_overlapping_requests(&s->cow_job, start, end);
+    cow_request_begin(&cow_request, &s->cow_job, start, end);
+
+    nb_sectors = COLO_SECTORS_PER_CLUSTER;
+    for (; start < end; start++) {
+        sector_num = start * COLO_SECTORS_PER_CLUSTER;
+        if (!buffer_has_empty_range(&s->disk_buffer, sector_num, nb_sectors)) {
+            continue;
+        }
+
+        n = COLO_SECTORS_PER_CLUSTER; /* TODO */
+
+        if (!bounce_buffer) {
+            bounce_buffer = qemu_blockalign(bs, COLO_CLUSTER_SIZE);
+        }
+        iov.iov_base = bounce_buffer;
+        iov.iov_len = n * BDRV_SECTOR_SIZE;
+        qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+        ret = bdrv_co_readv(bs, sector_num, n, &bounce_qiov);
+        if (ret < 0) {
+            goto out;
+        }
+
+        qiov_write_to_buffer(&s->disk_buffer, &bounce_qiov,
+                             sector_num, n, false);
+    }
+
+out:
+    qemu_vfree(bounce_buffer); /* no-op if nothing was allocated */
+    cow_request_end(&cow_request);
+    return ret;
+}
+
+static int coroutine_fn
+colo_before_write_notify(NotifierWithReturn *notifier, void *opaque)
+{
+    BdrvTrackedRequest *req = opaque;
+    BlockDriverState *bs = req->bs;
+    BDRVBlkcoloState *s = bs->origin_file->opaque;
+    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
+    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;
+    int ret;
+
+    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+
+    ret = colo_do_cow(bs, sector_num, nb_sectors);
+    if (ret) {
+        s->error = true;
+    }
+
+    return ret;
+}
+
+/*
+ * It should be called in the migration/checkpoint thread, and the caller
+ * should hold the io thread lock
+ */
+static int svm_do_checkpoint(BDRVBlkcoloState *s)
+{
+    if (s->error) {
+        /* TODO: we should report the error earlier */
+        return -1;
+    }
+
+    /* clear disk buffer */
+    clear_all_buffered_data(&s->disk_buffer);
+    return 0;
+}
+
+/* It should be called in the migration/checkpoint thread */
+static void svm_stop_replication(BDRVBlkcoloState *s)
+{
+    /* switch to unprotected mode */
+    switch_mode(s, COLO_UNPROTECTED_MODE);
+}
+
+static void colo_svm_init(BDRVBlkcoloState *s)
+{
+    BlockBackend *blk = s->bs->backing_hd->blk;
+
+    /* Init Disk Buffer */
+    init_disk_buffer(&s->disk_buffer);
+
+    s->before_write.notify = colo_before_write_notify;
+    bdrv_add_before_write_notifier(s->bs->backing_hd, &s->before_write);
+
+    /* start NBD server */
+    s->exp = nbd_export_new(blk, 0, -1, 0, NULL);
+    nbd_export_set_name(s->exp, s->export_name);
+
+    s->error = false;
+    QLIST_INIT(&s->cow_job.inflight_reqs);
+}
+
+static void colo_svm_fini(BDRVBlkcoloState *s)
+{
+    /* stop NBD server */
+    nbd_export_close(s->exp);
+    nbd_export_put(s->exp);
+
+    /* remove the before-write notifier registered in colo_svm_init() */
+    notifier_with_return_remove(&s->before_write);
+
+    /* TODO: Have all pvm write requests been completed? */
+
+    /* flush all buffered data to secondary disk */
+    flush_buffered_data_to_disk(&s->disk_buffer, s->bs->backing_hd);
+}
+
+/* block driver interfaces */
+static QemuOptsList colo_runtime_opts = {
+    .name = "colo",
+    .head = QTAILQ_HEAD_INITIALIZER(colo_runtime_opts.head),
+    .desc = {
+        {
+            .name = COLO_OPT_EXPORT,
+            .type = QEMU_OPT_STRING,
+            .help = "The NBD server name",
+        },
+        { /* end of list */ }
+    },
+};
+
+/*
+ * usage: -drive if=xxx,driver=blkcolo,export=xxx,\
+ *        backing.file.filename=1.raw,\
+ *        backing.driver=raw
+ */
+static int blkcolo_open(BlockDriverState *bs, QDict *options, int flags,
+                        Error **errp)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+    Error *local_err = NULL;
+    QemuOpts *opts = NULL;
+    int ret = 0;
+
+    s->bs = bs;
+
+    opts = qemu_opts_create(&colo_runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        ret = -EINVAL;
+        goto exit;
+    }
+
+    s->export_name = g_strdup(qemu_opt_get(opts, COLO_OPT_EXPORT));
+    if (!s->export_name) {
+        error_setg(&local_err, "Missing the option export");
+        ret = -EINVAL;
+        goto exit;
+    }
+
+exit:
+    qemu_opts_del(opts);
+    /* propagate error */
+    if (local_err) {
+        error_propagate(errp, local_err);
+    }
+    return ret;
+}
+
+static void blkcolo_close(BlockDriverState *bs)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+
+    if (s->mode == COLO_SECONDARY_MODE) {
+        switch_mode(s, COLO_UNPROTECTED_MODE);
+    }
+
+    g_free(s->export_name);
+}
+
+static int64_t blkcolo_getlength(BlockDriverState *bs)
+{
+    if (!bs->backing_hd) {
+        return 0;
+    } else {
+        return bdrv_getlength(bs->backing_hd);
+    }
+}
+
+static int blkcolo_co_readv(BlockDriverState *bs, int64_t sector_num,
+                            int nb_sectors, QEMUIOVector *qiov)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+
+    if (s->mode == COLO_SECONDARY_MODE) {
+        return colo_svm_co_readv(bs, sector_num, nb_sectors, qiov);
+    }
+
+    assert(s->mode == COLO_UNPROTECTED_MODE);
+
+    if (!bs->backing_hd) {
+        return -EIO;
+    } else {
+        return bdrv_co_readv(bs->backing_hd, sector_num, nb_sectors, qiov);
+    }
+}
+
+static int blkcolo_co_writev(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, QEMUIOVector *qiov)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+
+    if (s->mode == COLO_SECONDARY_MODE) {
+        return colo_svm_co_writev(bs, sector_num, nb_sectors, qiov);
+    }
+
+    assert(s->mode == COLO_UNPROTECTED_MODE);
+
+    if (!bs->backing_hd) {
+        return -EIO;
+    } else {
+        return bdrv_co_writev(bs->backing_hd, sector_num, nb_sectors, qiov);
+    }
+}
+
+static int blkcolo_start_replication(BlockDriverState *bs, int mode)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+
+    if (mode != COLO_SECONDARY_MODE ||
+        s->mode != COLO_UNPROTECTED_MODE ||
+        !bs->backing_hd) {
+        return -1;
+    }
+
+    if (!blk_is_inserted(bs->backing_hd->blk)) {
+        return -1;
+    }
+
+    if (blk_is_read_only(bs->backing_hd->blk)) {
+        return -1;
+    }
+
+    return switch_mode(s, mode);
+}
+
+static int blkcolo_do_checkpoint(BlockDriverState *bs)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+
+    if (s->mode != COLO_SECONDARY_MODE) {
+        return -1;
+    }
+
+    return svm_do_checkpoint(s);
+}
+
+static int blkcolo_stop_replication(BlockDriverState *bs)
+{
+    BDRVBlkcoloState *s = bs->opaque;
+
+    if (s->mode != COLO_SECONDARY_MODE) {
+        return -1;
+    }
+
+    svm_stop_replication(s);
+    return 0;
+}
+
+static BlockDriver bdrv_blkcolo = {
+    .format_name                = "blkcolo",
+    .protocol_name              = "blkcolo",
+    .instance_size              = sizeof(BDRVBlkcoloState),
+
+    .bdrv_file_open             = blkcolo_open,
+    .bdrv_close                 = blkcolo_close,
+    .bdrv_getlength             = blkcolo_getlength,
+
+    .bdrv_co_readv              = blkcolo_co_readv,
+    .bdrv_co_writev             = blkcolo_co_writev,
+
+    .bdrv_start_replication     = blkcolo_start_replication,
+    .bdrv_do_checkpoint         = blkcolo_do_checkpoint,
+    .bdrv_stop_replication      = blkcolo_stop_replication,
+
+    .supports_backing           = true,
+    .has_variable_length        = true,
+};
+
+static void bdrv_blkcolo_init(void)
+{
+    bdrv_register(&bdrv_blkcolo);
+}
+
+block_init(bdrv_blkcolo_init);
-- 
2.1.0

  parent reply	other threads:[~2015-02-12  3:05 UTC|newest]

Thread overview: 81+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-02-12  3:07 [Qemu-devel] [RFC PATCH 00/14] Block replication for continuous checkpoints Wen Congyang
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 01/14] docs: block replication's description Wen Congyang
2015-02-12  7:21   ` Fam Zheng
2015-02-12  7:40     ` Wen Congyang
2015-02-12  8:44       ` Fam Zheng
2015-02-12  9:33         ` Wen Congyang
2015-02-12  9:44           ` Fam Zheng
2015-02-12 10:11             ` Wen Congyang
2015-02-12 10:26               ` famz
2015-02-13  5:09                 ` Wen Congyang
2015-02-13  7:01                   ` Fam Zheng
2015-02-13 20:29                     ` John Snow
2015-03-03  7:53                 ` Wen Congyang
2015-03-03  7:59                   ` Fam Zheng
2015-03-03 12:12                     ` Wen Congyang
2015-03-11  6:44                     ` Wen Congyang
2015-03-11  6:49                       ` Fam Zheng
2015-03-11  7:01                         ` Wen Congyang
2015-03-11  7:04                           ` Fam Zheng
2015-03-11  7:12                             ` Wen Congyang
2015-03-13  9:01                         ` Wen Congyang
2015-03-13  9:05                           ` Fam Zheng
2015-03-16  6:19                             ` Wen Congyang
2015-03-25 12:41                               ` Paolo Bonzini
2015-02-12  9:36         ` Hongyang Yang
2015-02-12  9:46           ` Fam Zheng
2015-02-24  7:50         ` Wen Congyang
2015-02-25  2:46           ` Fam Zheng
2015-02-25  8:36             ` Wen Congyang
2015-02-25  8:58               ` Fam Zheng
2015-02-25  9:58                 ` Wen Congyang
2015-02-26  6:38             ` Wen Congyang
2015-02-26  8:44               ` Fam Zheng
2015-02-26  9:07                 ` Wen Congyang
2015-02-26 10:02                   ` Fam Zheng
2015-02-27  2:27                     ` Wen Congyang
2015-02-27  2:32                       ` Fam Zheng
2015-02-25  8:11         ` Wen Congyang
2015-02-25  8:18           ` Fam Zheng
2015-02-25  9:10         ` Wen Congyang
2015-02-25  9:45           ` Fam Zheng
2015-03-04 16:35   ` Dr. David Alan Gilbert
2015-03-05  1:03     ` Wen Congyang
2015-03-05 19:04       ` Dr. David Alan Gilbert
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 02/14] quorom: add a new read pattern Wen Congyang
2015-02-12  6:42   ` Gonglei
2015-02-23 20:36   ` Max Reitz
2015-02-23 21:56   ` Eric Blake
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 03/14] quorum: ignore 0-length child Wen Congyang
2015-02-23 20:43   ` Max Reitz
2015-02-24  2:33     ` Wen Congyang
2015-03-18  5:29     ` Wen Congyang
2015-03-18 12:57       ` Max Reitz
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 04/14] Add new block driver interfaces to control disk replication Wen Congyang
2015-02-23 20:57   ` Max Reitz
2015-02-23 21:58     ` Eric Blake
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 05/14] quorom: implement block driver interfaces for block replication Wen Congyang
2015-02-23 21:22   ` Max Reitz
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 06/14] NBD client: connect to nbd server later Wen Congyang
2015-02-23 21:31   ` Max Reitz
2015-02-25  2:23     ` Wen Congyang
2015-02-25 14:22       ` Max Reitz
2015-02-26 14:07         ` Paolo Bonzini
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 07/14] NBD client: implement block driver interfaces for block replication Wen Congyang
2015-02-23 21:41   ` Max Reitz
2015-02-26 14:08     ` Paolo Bonzini
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 08/14] block: add a new API to create a hidden BlockBackend Wen Congyang
2015-02-23 21:48   ` Max Reitz
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 09/14] block: give backing image its own BlockBackend Wen Congyang
2015-02-23 21:53   ` Max Reitz
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 10/14] allow the backing image access the origin BlockDriverState Wen Congyang
2015-02-23 22:01   ` Max Reitz
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 11/14] allow writing to the backing file Wen Congyang
2015-02-23 22:03   ` Max Reitz
2015-02-26 14:15     ` Paolo Bonzini
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 12/14] Add disk buffer for block replication Wen Congyang
2015-02-23 22:27   ` Max Reitz
2015-02-12  3:07 ` [Qemu-devel] [RFC PATCH 13/14] COW: move cow interfaces to a seperate file Wen Congyang
2015-02-12  3:07 ` Wen Congyang [this message]
2015-02-23 22:35   ` [Qemu-devel] [RFC PATCH 14/14] COLO: implement a new block driver Max Reitz
2015-02-18 16:26 ` [Qemu-devel] [RFC PATCH 00/14] Block replication for continuous checkpoints Paolo Bonzini

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1423710438-14377-15-git-send-email-wency@cn.fujitsu.com \
    --to=wency@cn.fujitsu.com \
    --cc=arei.gonglei@huawei.com \
    --cc=dgilbert@redhat.com \
    --cc=eddie.dong@intel.com \
    --cc=kwolf@redhat.com \
    --cc=laijs@cn.fujitsu.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    --cc=yanghy@cn.fujitsu.com \
    --cc=yunhong.jiang@intel.com \
    --cc=zhang.zhanghailiang@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).