From: Stefan Hajnoczi <stefanha@redhat.com>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>,
Peter Maydell <peter.maydell@linaro.org>,
Benoit Canet <benoit@irqsave.net>,
Stefan Hajnoczi <stefanha@redhat.com>,
Liu Yuan <namei.unix@gmail.com>
Subject: [Qemu-devel] [PULL 09/35] block/quorum: add simple read pattern support
Date: Fri, 29 Aug 2014 17:29:37 +0100 [thread overview]
Message-ID: <1409329803-20744-10-git-send-email-stefanha@redhat.com> (raw)
In-Reply-To: <1409329803-20744-1-git-send-email-stefanha@redhat.com>
From: Liu Yuan <namei.unix@gmail.com>
This patch adds a single read pattern to the quorum driver; quorum vote remains
the default pattern.
For now we do a quorum vote on all reads. This is designed for unreliable
underlying storage, such as non-redundant NFS, to ensure data integrity at the
cost of read performance.
Consider a use case such as the following:
        VM
  --------------
  |            |
  v            v
  A            B
Both A and B have hardware RAID storage that ensures data integrity on its own,
so it would help performance to do a single read instead of reading from all
the nodes. Further, if we run the VM on either of the storage nodes, we can make
a local read request for better performance.
This patch generalizes the above 2-node case to N nodes. That is,
vm -> write to all N nodes, read from just one of them. If the single read
fails, we try to read the next node in the FIFO order specified by the startup
command.
The 2-node case is very similar to DRBD[1], though for now it lacks auto-sync
functionality after a single device/node failure. But compared with DRBD we
still have some advantages:
- Suppose we have 20 VMs running on one (say A) of the 2 nodes of DRBD-backed
storage. If A crashes, we need to restart all the VMs on node B. But in
practice we can't, because B might not have enough resources to set up 20 VMs
at once. So if we run our 20 VMs with the quorum driver, and scatter the
replicated images over the data center, we can very likely restart all 20 VMs
without any resource problem.
After all, I think we can build a more powerful replicated image functionality
on quorum and block jobs (block mirror) to meet various High Availability needs.
E.g., enable the single read pattern on 2 children:
-drive driver=quorum,children.0.file.filename=0.qcow2,\
children.1.file.filename=1.qcow2,read-pattern=fifo,vote-threshold=1
[1] http://en.wikipedia.org/wiki/Distributed_Replicated_Block_Device
[Dropped \n from an error_setg() error message
--Stefan]
Cc: Benoit Canet <benoit@irqsave.net>
Cc: Eric Blake <eblake@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Liu Yuan <namei.unix@gmail.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/quorum.c | 177 +++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 129 insertions(+), 48 deletions(-)
diff --git a/block/quorum.c b/block/quorum.c
index 0de07bb..0160fe3 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -29,6 +29,7 @@
#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
#define QUORUM_OPT_BLKVERIFY "blkverify"
#define QUORUM_OPT_REWRITE "rewrite-corrupted"
+#define QUORUM_OPT_READ_PATTERN "read-pattern"
/* This union holds a vote hash value */
typedef union QuorumVoteValue {
@@ -79,6 +80,8 @@ typedef struct BDRVQuorumState {
bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
* block if Quorum is reached.
*/
+
+ QuorumReadPattern read_pattern;
} BDRVQuorumState;
typedef struct QuorumAIOCB QuorumAIOCB;
@@ -122,6 +125,7 @@ struct QuorumAIOCB {
bool is_read;
int vote_ret;
+ int child_iter; /* which child to read in fifo pattern */
};
static bool quorum_vote(QuorumAIOCB *acb);
@@ -148,7 +152,6 @@ static AIOCBInfo quorum_aiocb_info = {
static void quorum_aio_finalize(QuorumAIOCB *acb)
{
- BDRVQuorumState *s = acb->common.bs->opaque;
int i, ret = 0;
if (acb->vote_ret) {
@@ -158,7 +161,8 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
acb->common.cb(acb->common.opaque, ret);
if (acb->is_read) {
- for (i = 0; i < s->num_children; i++) {
+ /* on the quorum case acb->child_iter == s->num_children - 1 */
+ for (i = 0; i <= acb->child_iter; i++) {
qemu_vfree(acb->qcrs[i].buf);
qemu_iovec_destroy(&acb->qcrs[i].qiov);
}
@@ -261,6 +265,21 @@ static void quorum_rewrite_aio_cb(void *opaque, int ret)
quorum_aio_finalize(acb);
}
+static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb);
+
+static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
+{
+ int i;
+ assert(dest->niov == source->niov);
+ assert(dest->size == source->size);
+ for (i = 0; i < source->niov; i++) {
+ assert(dest->iov[i].iov_len == source->iov[i].iov_len);
+ memcpy(dest->iov[i].iov_base,
+ source->iov[i].iov_base,
+ source->iov[i].iov_len);
+ }
+}
+
static void quorum_aio_cb(void *opaque, int ret)
{
QuorumChildRequest *sacb = opaque;
@@ -268,6 +287,21 @@ static void quorum_aio_cb(void *opaque, int ret)
BDRVQuorumState *s = acb->common.bs->opaque;
bool rewrite = false;
+ if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
+ /* We try to read next child in FIFO order if we fail to read */
+ if (ret < 0 && ++acb->child_iter < s->num_children) {
+ read_fifo_child(acb);
+ return;
+ }
+
+ if (ret == 0) {
+ quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov);
+ }
+ acb->vote_ret = ret;
+ quorum_aio_finalize(acb);
+ return;
+ }
+
sacb->ret = ret;
acb->count++;
if (ret == 0) {
@@ -348,19 +382,6 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
return count;
}
-static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
-{
- int i;
- assert(dest->niov == source->niov);
- assert(dest->size == source->size);
- for (i = 0; i < source->niov; i++) {
- assert(dest->iov[i].iov_len == source->iov[i].iov_len);
- memcpy(dest->iov[i].iov_base,
- source->iov[i].iov_base,
- source->iov[i].iov_len);
- }
-}
-
static void quorum_count_vote(QuorumVotes *votes,
QuorumVoteValue *value,
int index)
@@ -620,34 +641,62 @@ free_exit:
return rewrite;
}
-static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BlockDriverCompletionFunc *cb,
- void *opaque)
+static BlockDriverAIOCB *read_quorum_children(QuorumAIOCB *acb)
{
- BDRVQuorumState *s = bs->opaque;
- QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
- nb_sectors, cb, opaque);
+ BDRVQuorumState *s = acb->common.bs->opaque;
int i;
- acb->is_read = true;
-
for (i = 0; i < s->num_children; i++) {
- acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size);
- qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov);
- qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf);
+ acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
+ qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
+ qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
}
for (i = 0; i < s->num_children; i++) {
- bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors,
- quorum_aio_cb, &acb->qcrs[i]);
+ bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov,
+ acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]);
}
return &acb->common;
}
+static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb)
+{
+ BDRVQuorumState *s = acb->common.bs->opaque;
+
+ acb->qcrs[acb->child_iter].buf = qemu_blockalign(s->bs[acb->child_iter],
+ acb->qiov->size);
+ qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
+ qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
+ acb->qcrs[acb->child_iter].buf);
+ bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num,
+ &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
+ quorum_aio_cb, &acb->qcrs[acb->child_iter]);
+
+ return &acb->common;
+}
+
+static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ BDRVQuorumState *s = bs->opaque;
+ QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
+ nb_sectors, cb, opaque);
+ acb->is_read = true;
+
+ if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
+ acb->child_iter = s->num_children - 1;
+ return read_quorum_children(acb);
+ }
+
+ acb->child_iter = 0;
+ return read_fifo_child(acb);
+}
+
static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
int64_t sector_num,
QEMUIOVector *qiov,
@@ -787,10 +836,33 @@ static QemuOptsList quorum_runtime_opts = {
.type = QEMU_OPT_BOOL,
.help = "Rewrite corrupted block on read quorum",
},
+ {
+ .name = QUORUM_OPT_READ_PATTERN,
+ .type = QEMU_OPT_STRING,
+ .help = "Allowed pattern: quorum, fifo. Quorum is default",
+ },
{ /* end of list */ }
},
};
+static int parse_read_pattern(const char *opt)
+{
+ int i;
+
+ if (!opt) {
+ /* Set quorum as default */
+ return QUORUM_READ_PATTERN_QUORUM;
+ }
+
+ for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) {
+ if (!strcmp(opt, QuorumReadPattern_lookup[i])) {
+ return i;
+ }
+ }
+
+ return -EINVAL;
+}
+
static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
Error **errp)
{
@@ -832,28 +904,37 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
}
s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
-
- /* and validate it against s->num_children */
- ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+ ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
if (ret < 0) {
+ error_setg(&local_err, "Please set read-pattern as fifo or quorum");
goto exit;
}
+ s->read_pattern = ret;
- /* is the driver in blkverify mode */
- if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
- s->num_children == 2 && s->threshold == 2) {
- s->is_blkverify = true;
- } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) {
- fprintf(stderr, "blkverify mode is set by setting blkverify=on "
- "and using two files with vote_threshold=2\n");
- }
+ if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
+ /* and validate it against s->num_children */
+ ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+ if (ret < 0) {
+ goto exit;
+ }
- s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
- if (s->rewrite_corrupted && s->is_blkverify) {
- error_setg(&local_err,
- "rewrite-corrupted=on cannot be used with blkverify=on");
- ret = -EINVAL;
- goto exit;
+ /* is the driver in blkverify mode */
+ if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
+ s->num_children == 2 && s->threshold == 2) {
+ s->is_blkverify = true;
+ } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) {
+ fprintf(stderr, "blkverify mode is set by setting blkverify=on "
+ "and using two files with vote_threshold=2\n");
+ }
+
+ s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE,
+ false);
+ if (s->rewrite_corrupted && s->is_blkverify) {
+ error_setg(&local_err,
+ "rewrite-corrupted=on cannot be used with blkverify=on");
+ ret = -EINVAL;
+ goto exit;
+ }
}
/* allocate the children BlockDriverState array */
--
1.9.3
next prev parent reply other threads:[~2014-08-29 16:30 UTC|newest]
Thread overview: 39+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-08-29 16:29 [Qemu-devel] [PULL 00/35] Block patches Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 01/35] ide: Fix bootindex for bus_id > 9 Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 02/35] block.curl: adding 'timeout' option Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 03/35] qemu-img: fix img_commit() error return value Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 04/35] qemu-img: fix img_compare() flags error path Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 05/35] qemu-img: always goto out in img_snapshot() error paths Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 06/35] sheepdog: adopting protocol update for VDI locking Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 07/35] sheepdog: improve error handling for a case of failed lock Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 08/35] qapi: add read-pattern enum for quorum Stefan Hajnoczi
2014-08-29 16:29 ` Stefan Hajnoczi [this message]
2014-08-29 16:47 ` [Qemu-devel] [PULL 09/35] block/quorum: add simple read pattern support Benoît Canet
2014-09-01 15:21 ` Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 10/35] coroutine: Drop co_sleep_ns Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 11/35] blockdev: fix drive-mirror 'granularity' error message Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 12/35] AioContext: take bottom halves into account when computing aio_poll timeout Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 13/35] aio-win32: Evaluate timers after handles Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 14/35] aio-win32: Factor out duplicate code into aio_dispatch_handlers Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 15/35] AioContext: run bottom halves after polling Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 16/35] AioContext: export and use aio_dispatch Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 17/35] test-aio: test timers on Windows too Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 18/35] aio-win32: add aio_set_dispatching optimization Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 19/35] AioContext: introduce aio_prepare Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 20/35] qemu-coroutine-io: fix for Win32 Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 21/35] aio-win32: add support for sockets Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 22/35] sheepdog: fix a core dump while do auto-reconnecting Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 23/35] nbd: Drop nbd_can_read() Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 24/35] block: Add AIO context notifiers Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 25/35] nbd: Follow the BDS' AIO context Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 26/35] block: fix overlapping multiwrite requests Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 27/35] qemu-iotests: add multiwrite test cases Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 28/35] linux-aio: avoid deadlock in nested aio_poll() calls Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 29/35] block: acquire AioContext in do_drive_del() Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 30/35] virtio-blk: allow drive_del with dataplane Stefan Hajnoczi
2014-08-29 16:29 ` [Qemu-devel] [PULL 31/35] curl: Allow a cookie or cookies to be sent with http/https requests Stefan Hajnoczi
2014-08-29 16:30 ` [Qemu-devel] [PULL 32/35] curl: Don't deref NULL pointer in call to aio_poll Stefan Hajnoczi
2014-08-29 16:30 ` [Qemu-devel] [PULL 33/35] nfs: Fix leak of opts in nfs_file_open Stefan Hajnoczi
2014-08-29 16:30 ` [Qemu-devel] [PULL 34/35] blkverify: Fix leak of opts in blkverify_open Stefan Hajnoczi
2014-08-29 16:30 ` [Qemu-devel] [PULL 35/35] quorum: Fix leak of opts in quorum_open Stefan Hajnoczi
2014-09-01 9:49 ` [Qemu-devel] [PULL 00/35] Block patches Peter Maydell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1409329803-20744-10-git-send-email-stefanha@redhat.com \
--to=stefanha@redhat.com \
--cc=benoit@irqsave.net \
--cc=kwolf@redhat.com \
--cc=namei.unix@gmail.com \
--cc=peter.maydell@linaro.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).