* [Qemu-devel] [RFC V5 01/11] quorum: Create quorum.c, add QuorumSingleAIOCB and QuorumAIOCB.
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 02/11] quorum: Create BDRVQuorumState and BlkDriver and do init Benoît Canet
` (9 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/Makefile.objs | 1 +
block/quorum.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 46 insertions(+)
create mode 100644 block/quorum.c
diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d3..66af6dc 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -4,6 +4,7 @@ block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-obj-y += qed-check.o
block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
block-obj-y += stream.o
+block-obj-y += quorum.o
block-obj-$(CONFIG_WIN32) += raw-win32.o
block-obj-$(CONFIG_POSIX) += raw-posix.o
block-obj-$(CONFIG_LIBISCSI) += iscsi.o
diff --git a/block/quorum.c b/block/quorum.c
new file mode 100644
index 0000000..65a6b55
--- /dev/null
+++ b/block/quorum.c
@@ -0,0 +1,45 @@
+/*
+ * Quorum Block filter
+ *
+ * Copyright (C) 2012 Nodalink, SARL.
+ *
+ * Author:
+ * Benoît Canet <benoit.canet@irqsave.net>
+ *
+ * Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp)
+ * and blkmirror.c (Copyright (C) 2011 Red Hat, Inc).
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "block_int.h"
+
+typedef struct QuorumAIOCB QuorumAIOCB;
+
+/*
+ * State of the request issued to one quorum member. QuorumAIOCB.aios points
+ * to an array of these, one entry per member.
+ */
+typedef struct QuorumSingleAIOCB {
+ BlockDriverAIOCB *aiocb; /* handle returned when the member request was submitted */
+ uint8_t *buf; /* per-member data buffer, NULL when none is allocated */
+ int ret; /* completion status reported for this member */
+ QuorumAIOCB *parent; /* request this member operation belongs to */
+} QuorumSingleAIOCB;
+
+/*
+ * One request submitted to the quorum driver. It is fanned out to every
+ * member and completed once all member requests have reported back.
+ */
+struct QuorumAIOCB {
+ BlockDriverAIOCB common; /* generic AIOCB; holds the caller's cb and opaque */
+ QEMUBH *bh; /* bottom half used to complete the request */
+
+ /* Request metadata */
+ int64_t sector_num;
+ int nb_sectors;
+
+ QEMUIOVector *qiov; /* I/O vector supplied by the caller */
+
+ QuorumSingleAIOCB *aios; /* individual AIOs */
+ QEMUIOVector *qiovs; /* individual IOVs */
+ int count; /* number of completed AIOCB */
+ int success_count; /* number of successfully completed AIOCB */
+ bool *finished; /* completion signal for cancel */
+
+ void (*vote)(QuorumAIOCB *acb); /* optional vote hook, NULL when unused */
+ int vote_ret; /* status produced by the vote, 0 by default */
+};
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 02/11] quorum: Create BDRVQuorumState and BlkDriver and do init.
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 01/11] quorum: Create quorum.c, add QuorumSingleAIOCB and QuorumAIOCB Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 03/11] quorum: Add quorum_open() and quorum_close() Benoît Canet
` (8 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/block/quorum.c b/block/quorum.c
index 65a6b55..19a9a44 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -15,6 +15,13 @@
#include "block_int.h"
+/* Per-device driver state of a quorum block device. */
+typedef struct {
+ BlockDriverState **bs; /* array of 'total' member devices */
+ int threshold; /* minimum number of members that must succeed */
+ int total; /* number of members */
+ char **filenames; /* 'total' member filenames; [0] owns the backing buffer */
+} BDRVQuorumState;
+
typedef struct QuorumAIOCB QuorumAIOCB;
typedef struct QuorumSingleAIOCB {
@@ -26,6 +33,7 @@ typedef struct QuorumSingleAIOCB {
struct QuorumAIOCB {
BlockDriverAIOCB common;
+ BDRVQuorumState *bqs;
QEMUBH *bh;
/* Request metadata */
@@ -43,3 +51,17 @@ struct QuorumAIOCB {
void (*vote)(QuorumAIOCB *acb);
int vote_ret;
};
+
+/* Driver description handed to bdrv_register(); selected by the "quorum" name. */
+static BlockDriver bdrv_quorum = {
+ .format_name = "quorum",
+ .protocol_name = "quorum",
+
+ /* size of the per-device state reachable through bs->opaque */
+ .instance_size = sizeof(BDRVQuorumState),
+};
+
+/* Register the quorum driver with the block layer. */
+static void bdrv_quorum_init(void)
+{
+ bdrv_register(&bdrv_quorum);
+}
+
+/* NOTE(review): block_init() presumably runs the hook at start-up; confirm. */
+block_init(bdrv_quorum_init);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 03/11] quorum: Add quorum_open() and quorum_close().
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 01/11] quorum: Create quorum.c, add QuorumSingleAIOCB and QuorumAIOCB Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 02/11] quorum: Create BDRVQuorumState and BlkDriver and do init Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 17:59 ` Eric Blake
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 04/11] quorum: Add quorum_aio_writev and its dependencies Benoît Canet
` (7 subsequent siblings)
10 siblings, 1 reply; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Valid quorum resources look like
quorum:threshold/total:path/to/image_1, ... ,path/to/image_total
',' is used as the separator so that networked paths can be used
'\' is the escape character for filenames containing ','
'\' also escapes itself
On the command line, for the quorum files "img,test.raw", "img2.raw"
and "img3.raw", the invocation looks like:
-drive file=quorum:2/3:img\\,,test.raw,,img2.raw,,img3.raw
(note the double ,, and \\)
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 123 insertions(+)
diff --git a/block/quorum.c b/block/quorum.c
index 19a9a44..b9fb2b9 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -52,11 +52,134 @@ struct QuorumAIOCB {
int vote_ret;
};
+/*
+ * Open a quorum resource.
+ *
+ * Valid quorum resources look like
+ *   quorum:threshold/total:path/to/image_1, ... ,path/to/image_total
+ *
+ * ',' is used as the separator so that network paths can be used
+ * '\' is the escape character: "\," is a literal ',' and "\\" a literal '\'
+ *
+ * On success s->bs[] holds one opened BlockDriverState per filename and
+ * s->filenames[] points into a single g_strdup()ed buffer that is owned
+ * through s->filenames[0].
+ *
+ * Returns 0 on success, -EINVAL when the resource string is malformed
+ * (bad prefix, bad threshold/total, too few or too many filenames), or the
+ * negative value returned by the failing bdrv_open().
+ */
+static int quorum_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVQuorumState *s = bs->opaque;
+    int i, j, k, len, ret = 0;
+    char *a, *b, *names;
+    bool escape;
+
+    /* Parse the quorum: prefix */
+    if (strncmp(filename, "quorum:", strlen("quorum:"))) {
+        return -EINVAL;
+    }
+
+    filename += strlen("quorum:");
+
+    /* Get threshold */
+    errno = 0;
+    s->threshold = strtoul(filename, &a, 10);
+    if (*a != '/' || errno) {
+        return -EINVAL;
+    }
+    a++;
+
+    /* Get total */
+    errno = 0;
+    s->total = strtoul(a, &b, 10);
+    if (*b != ':' || errno) {
+        return -EINVAL;
+    }
+    b++;
+
+    if (s->threshold < 1 || s->total < 2) {
+        return -EINVAL;
+    }
+
+    if (s->threshold > s->total) {
+        return -EINVAL;
+    }
+
+    s->bs = g_malloc0(sizeof(BlockDriverState *) * s->total);
+    /* Two allocations for all filenames: simpler to free */
+    s->filenames = g_malloc0(sizeof(char *) * s->total);
+    names = g_strdup(b);
+
+    /*
+     * Split and unescape in place. The unescaped output is never longer than
+     * the input, so the write position (filenames[j] + k) cannot overtake the
+     * read position (names + i).
+     */
+    escape = false;
+    s->filenames[0] = names;
+    len = strlen(names);
+    for (i = j = k = 0; i < len; i++) {
+        /* separation between two files */
+        if (!escape && names[i] == ',') {
+            char *prev = s->filenames[j];
+
+            /*
+             * More separators than announced members: bail out before
+             * storing a pointer past the end of s->filenames[].
+             */
+            if (j + 1 >= s->total) {
+                ret = -EINVAL;
+                goto free_exit;
+            }
+
+            prev[k] = '\0';
+            s->filenames[++j] = prev + k + 1;
+            k = 0;
+            continue;
+        }
+
+        escape = !escape && names[i] == '\\';
+
+        /* if we are not escaping copy */
+        if (!escape) {
+            s->filenames[j][k++] = names[i];
+        }
+    }
+    /* terminate last string */
+    s->filenames[j][k] = '\0';
+
+    /* fewer filenames than announced */
+    if ((j + 1) != s->total) {
+        ret = -EINVAL;
+        goto free_exit;
+    }
+
+    /* Open files */
+    for (i = 0; i < s->total; i++) {
+        s->bs[i] = bdrv_new("");
+        ret = bdrv_open(s->bs[i], s->filenames[i], flags, NULL);
+        if (ret < 0) {
+            goto error_exit;
+        }
+    }
+
+    goto exit;
+
+error_exit:
+    /* s->bs[i] was allocated before bdrv_open() failed, so release it too */
+    for (; i >= 0; i--) {
+        bdrv_delete(s->bs[i]);
+        s->bs[i] = NULL;
+    }
+free_exit:
+    g_free(s->filenames[0]);
+    g_free(s->filenames);
+    s->filenames = NULL;
+    g_free(s->bs);
+    s->bs = NULL;
+exit:
+    return ret;
+}
+
+/*
+ * Close a quorum device: flush and delete every member, then release the
+ * filename buffer, the filename table and the member table.
+ */
+static void quorum_close(BlockDriverState *bs)
+{
+    BDRVQuorumState *state = bs->opaque;
+    int member = 0;
+
+    while (member < state->total) {
+        BlockDriverState *child = state->bs[member++];
+
+        /* Ensure writes reach stable storage */
+        bdrv_flush(child);
+        bdrv_delete(child);
+    }
+
+    /* filenames[0] is the start of the single buffer holding all names */
+    g_free(state->filenames[0]);
+    g_free(state->filenames);
+    state->filenames = NULL;
+    g_free(state->bs);
+}
+
+
static BlockDriver bdrv_quorum = {
.format_name = "quorum",
.protocol_name = "quorum",
.instance_size = sizeof(BDRVQuorumState),
+
+ .bdrv_file_open = quorum_open,
+ .bdrv_close = quorum_close,
};
static void bdrv_quorum_init(void)
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [Qemu-devel] [RFC V5 03/11] quorum: Add quorum_open() and quorum_close().
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 03/11] quorum: Add quorum_open() and quorum_close() Benoît Canet
@ 2012-08-27 17:59 ` Eric Blake
2012-08-27 19:20 ` Benoît Canet
0 siblings, 1 reply; 16+ messages in thread
From: Eric Blake @ 2012-08-27 17:59 UTC (permalink / raw)
To: Benoît Canet
Cc: kwolf, Benoît Canet, qemu-devel, blauwirbel, pbonzini,
stefanha
[-- Attachment #1: Type: text/plain, Size: 1457 bytes --]
On 08/27/2012 01:30 AM, Benoît Canet wrote:
> Valid quorum resources look like
> quorum:threshold/total:path/to/image_1, ... ,path/to/image_total
>
> ',' is used as a separator to allow to use networked path
Isn't this a step backwards? After all, on the command line, we would
have something like:
-drive file=quorum:...,readonly=on
but if the 'quorum:...' portion contains commas, then the qemu option
parsing requires you to write those commas as doubled. Which means to
escape those commas for _both_ quorum: processing _and_ qemu command
line parsing, I'd have to write:
-drive "file=quorum:2/2:path/to/image\\,,1,,path/to/image\\\\2,readonly=on"
to use the two files:
path/to/image,1
path/to/image\2
with appropriate shell, qemu, and quorum escaping. Using : rather than
, as the separator between quorum: elements was a bit simpler, since I
don't have to type double commas for every single comma to be taken by
the quorum parsing, escaped or not.
> '\' is the escaping character for filename containing ','
> '\' escape itself
s/escape/escapes/
>
> On the command line for quorum files "img,test.raw", "img2.raw"
> and "img3.raw" invocation look like:
>
> -drive file=quorum:2/3:img\\,,test.raw,,img2.raw,,img3.raw
> (note the double ,, and \\)
Yes, that's what I'm worried about.
--
Eric Blake eblake@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 620 bytes --]
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [Qemu-devel] [RFC V5 03/11] quorum: Add quorum_open() and quorum_close().
2012-08-27 17:59 ` Eric Blake
@ 2012-08-27 19:20 ` Benoît Canet
0 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 19:20 UTC (permalink / raw)
To: Eric Blake
Cc: kwolf, Benoît Canet, stefanha, qemu-devel, blauwirbel,
pbonzini
Le Monday 27 Aug 2012 à 11:59:34 (-0600), Eric Blake a écrit :
> On 08/27/2012 01:30 AM, Benoît Canet wrote:
> > Valid quorum resources look like
> > quorum:threshold/total:path/to/image_1, ... ,path/to/image_total
> >
> > ',' is used as a separator to allow to use networked path
>
> Isn't this a step backwards? After all, on the command line, we would
> have something like:
>
> -drive file=quorum:...,readonly=on
>
> but if the 'quorum:...' portion contains commas, then the qemu option
> parsing requires you to write those commas as doubled. Which means to
> escape those commas for _both_ quorum: processing _and_ qemu command
> line parsing, I'd have to write:
>
> -drive "file=quorum:2/2:path/to/image\\,,1,,path/to/image\\\\2,readonly=on"
>
> to use the two files:
> path/to/image,1
> path/to/image\2
>
> with appropriate shell, qemu, and quorum escaping. Using : rather than
> , as the separator between quorum: elements was a bit simpler, since I
> don't have to type double commas for every single comma to be taken by
> the quorum parsing, escaped or not.
>
> > '\' is the escaping character for filename containing ','
> > '\' escape itself
>
> s/escape/escapes/
>
> >
> > On the command line for quorum files "img,test.raw", "img2.raw"
> > and "img3.raw" invocation look like:
> >
> > -drive file=quorum:2/3:img\\,,test.raw,,img2.raw,,img3.raw
> > (note the double ,, and \\)
>
> Yes, that's what I'm worried about.
I was following an idea of Blue Swirl.
You will have to use \\ on the command line.
':' can be used instead of ',' but it won't work
with networked urls.
>
> --
> Eric Blake eblake@redhat.com +1-919-301-3266
> Libvirt virtualization library http://libvirt.org
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 04/11] quorum: Add quorum_aio_writev and its dependencies.
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
` (2 preceding siblings ...)
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 03/11] quorum: Add quorum_open() and quorum_close() Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 05/11] blkverify: Extract qemu_iovec_clone() and qemu_iovec_compare() from blkverify Benoît Canet
` (6 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 112 insertions(+)
diff --git a/block/quorum.c b/block/quorum.c
index b9fb2b9..cd11cfb 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -172,6 +172,116 @@ static void quorum_close(BlockDriverState *bs)
g_free(s->bs);
}
+/*
+ * Cancel callback of the quorum AIO pool.
+ *
+ * No member request is aborted here: a local flag is published through
+ * acb->finished and qemu_aio_wait() is called until quorum_aio_bh() sets it.
+ */
+static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    bool done = false;
+    QuorumAIOCB *req = container_of(blockacb, QuorumAIOCB, common);
+
+    /* Wait for the request to finish */
+    req->finished = &done;
+    for (;;) {
+        if (done) {
+            break;
+        }
+        qemu_aio_wait();
+    }
+}
+
+/* Pool description passed to qemu_aio_get() when a QuorumAIOCB is obtained. */
+static AIOPool quorum_aio_pool = {
+ .aiocb_size = sizeof(QuorumAIOCB),
+ .cancel = quorum_aio_cancel,
+};
+
+/*
+ * Bottom half that completes a quorum request.
+ *
+ * The caller's callback receives 0 when at least 'threshold' members
+ * succeeded and -EIO otherwise. A pending quorum_aio_cancel() is then
+ * signalled and the request is torn down.
+ */
+static void quorum_aio_bh(void *opaque)
+{
+    QuorumAIOCB *req = opaque;
+    int status = -EIO;
+
+    if (req->success_count >= req->bqs->threshold) {
+        status = 0;
+    }
+
+    qemu_bh_delete(req->bh);
+    req->common.cb(req->common.opaque, status);
+
+    if (req->finished != NULL) {
+        *req->finished = true;
+    }
+
+    g_free(req->aios);
+    g_free(req->qiovs);
+    qemu_aio_release(req);
+}
+
+/*
+ * Obtain and initialise a QuorumAIOCB for one request.
+ *
+ * @s:          quorum state; s->total sizes the per-member arrays
+ * @bs:         device the request is issued on
+ * @qiov:       caller's I/O vector
+ * @sector_num: first sector of the request
+ * @nb_sectors: number of sectors of the request
+ * @cb/@opaque: completion callback and its argument
+ *
+ * The aios and qiovs arrays are allocated here and released by
+ * quorum_aio_bh().
+ */
+static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
+                                   BlockDriverState *bs,
+                                   QEMUIOVector *qiov,
+                                   int64_t sector_num,
+                                   int nb_sectors,
+                                   BlockDriverCompletionFunc *cb,
+                                   void *opaque)
+{
+    QuorumAIOCB *acb = qemu_aio_get(&quorum_aio_pool, bs, cb, opaque);
+    int i;
+
+    acb->aios = g_new0(QuorumSingleAIOCB, s->total);
+    acb->qiovs = g_new0(QEMUIOVector, s->total);
+
+    acb->bqs = s;
+    acb->qiov = qiov;
+    acb->bh = NULL;
+    acb->count = 0;
+    acb->success_count = 0;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    /*
+     * quorum_aio_bh() writes through this pointer whenever it is non-NULL.
+     * Nothing here shows that qemu_aio_get() hands back zeroed memory, so a
+     * stale pointer left by an earlier quorum_aio_cancel() must not survive.
+     */
+    acb->finished = NULL;
+    acb->vote = NULL;
+    acb->vote_ret = 0;
+
+    for (i = 0; i < s->total; i++) {
+        acb->aios[i].buf = NULL;
+        acb->aios[i].ret = 0;
+        acb->aios[i].parent = acb;
+    }
+
+    return acb;
+}
+
+/*
+ * Completion callback of one member request.
+ *
+ * Records the member's status and updates the counters. Once every member
+ * has answered, it schedules quorum_aio_bh() to complete the whole request.
+ */
+static void quorum_aio_cb(void *opaque, int ret)
+{
+    QuorumSingleAIOCB *single = opaque;
+    QuorumAIOCB *req = single->parent;
+    BDRVQuorumState *state = req->bqs;
+
+    single->ret = ret;
+    req->count += 1;
+    if (!ret) {
+        req->success_count += 1;
+    }
+    assert(req->count <= state->total);
+    assert(req->success_count <= state->total);
+
+    if (req->count >= state->total) {
+        /* last member answered: finish from a bottom half */
+        req->bh = qemu_bh_new(quorum_aio_bh, req);
+        qemu_bh_schedule(req->bh);
+    }
+}
+
+/*
+ * Write the caller's I/O vector to every member.
+ *
+ * Each member request completes through quorum_aio_cb() with its own
+ * QuorumSingleAIOCB as opaque. Returns the common part of the new request.
+ */
+static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
+                                           int64_t sector_num,
+                                           QEMUIOVector *qiov,
+                                           int nb_sectors,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque)
+{
+    BDRVQuorumState *state = bs->opaque;
+    QuorumAIOCB *req;
+    int member;
+
+    req = quorum_aio_get(state, bs, qiov, sector_num, nb_sectors, cb, opaque);
+
+    for (member = 0; member < state->total; member++) {
+        QuorumSingleAIOCB *single = &req->aios[member];
+
+        single->aiocb = bdrv_aio_writev(state->bs[member], sector_num, qiov,
+                                        nb_sectors, &quorum_aio_cb, single);
+    }
+
+    return &req->common;
+}
+
static BlockDriver bdrv_quorum = {
.format_name = "quorum",
.protocol_name = "quorum",
@@ -180,6 +290,8 @@ static BlockDriver bdrv_quorum = {
.bdrv_file_open = quorum_open,
.bdrv_close = quorum_close,
+
+ .bdrv_aio_writev = quorum_aio_writev,
};
static void bdrv_quorum_init(void)
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 05/11] blkverify: Extract qemu_iovec_clone() and qemu_iovec_compare() from blkverify.
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
` (3 preceding siblings ...)
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 04/11] quorum: Add quorum_aio_writev and its dependencies Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 06/11] quorum: Add quorum_co_flush() Benoît Canet
` (5 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/blkverify.c | 108 +----------------------------------------------------
cutils.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++
qemu-common.h | 2 +
3 files changed, 107 insertions(+), 106 deletions(-)
diff --git a/block/blkverify.c b/block/blkverify.c
index 9d5f1ec..79d36d5 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -123,110 +123,6 @@ static int64_t blkverify_getlength(BlockDriverState *bs)
return bdrv_getlength(s->test_file);
}
-/**
- * Check that I/O vector contents are identical
- *
- * @a: I/O vector
- * @b: I/O vector
- * @ret: Offset to first mismatching byte or -1 if match
- */
-static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
-{
- int i;
- ssize_t offset = 0;
-
- assert(a->niov == b->niov);
- for (i = 0; i < a->niov; i++) {
- size_t len = 0;
- uint8_t *p = (uint8_t *)a->iov[i].iov_base;
- uint8_t *q = (uint8_t *)b->iov[i].iov_base;
-
- assert(a->iov[i].iov_len == b->iov[i].iov_len);
- while (len < a->iov[i].iov_len && *p++ == *q++) {
- len++;
- }
-
- offset += len;
-
- if (len != a->iov[i].iov_len) {
- return offset;
- }
- }
- return -1;
-}
-
-typedef struct {
- int src_index;
- struct iovec *src_iov;
- void *dest_base;
-} IOVectorSortElem;
-
-static int sortelem_cmp_src_base(const void *a, const void *b)
-{
- const IOVectorSortElem *elem_a = a;
- const IOVectorSortElem *elem_b = b;
-
- /* Don't overflow */
- if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) {
- return -1;
- } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) {
- return 1;
- } else {
- return 0;
- }
-}
-
-static int sortelem_cmp_src_index(const void *a, const void *b)
-{
- const IOVectorSortElem *elem_a = a;
- const IOVectorSortElem *elem_b = b;
-
- return elem_a->src_index - elem_b->src_index;
-}
-
-/**
- * Copy contents of I/O vector
- *
- * The relative relationships of overlapping iovecs are preserved. This is
- * necessary to ensure identical semantics in the cloned I/O vector.
- */
-static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src,
- void *buf)
-{
- IOVectorSortElem sortelems[src->niov];
- void *last_end;
- int i;
-
- /* Sort by source iovecs by base address */
- for (i = 0; i < src->niov; i++) {
- sortelems[i].src_index = i;
- sortelems[i].src_iov = &src->iov[i];
- }
- qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base);
-
- /* Allocate buffer space taking into account overlapping iovecs */
- last_end = NULL;
- for (i = 0; i < src->niov; i++) {
- struct iovec *cur = sortelems[i].src_iov;
- ptrdiff_t rewind = 0;
-
- /* Detect overlap */
- if (last_end && last_end > cur->iov_base) {
- rewind = last_end - cur->iov_base;
- }
-
- sortelems[i].dest_base = buf - rewind;
- buf += cur->iov_len - MIN(rewind, cur->iov_len);
- last_end = MAX(cur->iov_base + cur->iov_len, last_end);
- }
-
- /* Sort by source iovec index and build destination iovec */
- qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index);
- for (i = 0; i < src->niov; i++) {
- qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len);
- }
-}
-
static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write,
int64_t sector_num, QEMUIOVector *qiov,
int nb_sectors,
@@ -290,7 +186,7 @@ static void blkverify_aio_cb(void *opaque, int ret)
static void blkverify_verify_readv(BlkverifyAIOCB *acb)
{
- ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov);
+ ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov);
if (offset != -1) {
blkverify_err(acb, "contents mismatch in sector %" PRId64,
acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE));
@@ -308,7 +204,7 @@ static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs,
acb->verify = blkverify_verify_readv;
acb->buf = qemu_blockalign(bs->file, qiov->size);
qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov);
- blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf);
+ qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf);
bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors,
blkverify_aio_cb, acb);
diff --git a/cutils.c b/cutils.c
index ee4614d..dcdd60f 100644
--- a/cutils.c
+++ b/cutils.c
@@ -245,6 +245,109 @@ size_t qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
return iov_memset(qiov->iov, qiov->niov, offset, fillc, bytes);
}
+/**
+ * Compare two I/O vectors byte by byte
+ *
+ * Both vectors must have the same number of elements and pairwise equal
+ * element lengths (asserted).
+ *
+ * @a: I/O vector
+ * @b: I/O vector
+ * @ret: Offset to first mismatching byte or -1 if match
+ */
+ssize_t qemu_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
+{
+    ssize_t consumed = 0;
+    int idx;
+
+    assert(a->niov == b->niov);
+    for (idx = 0; idx < a->niov; idx++) {
+        const uint8_t *lhs = (const uint8_t *)a->iov[idx].iov_base;
+        const uint8_t *rhs = (const uint8_t *)b->iov[idx].iov_base;
+        size_t same;
+
+        assert(a->iov[idx].iov_len == b->iov[idx].iov_len);
+
+        /* length of the common prefix of this element */
+        for (same = 0; same < a->iov[idx].iov_len; same++) {
+            if (lhs[same] != rhs[same]) {
+                break;
+            }
+        }
+
+        consumed += same;
+
+        if (same != a->iov[idx].iov_len) {
+            return consumed;
+        }
+    }
+    return -1;
+}
+
+/* Bookkeeping for one source iovec while qemu_iovec_clone() sorts them. */
+typedef struct {
+ int src_index; /* position of the element in the source vector */
+ struct iovec *src_iov; /* the source element itself */
+ void *dest_base; /* address assigned to the element in the clone buffer */
+} IOVectorSortElem;
+
+/* qsort() comparator: order elements by ascending source base address. */
+static int sortelem_cmp_src_base(const void *a, const void *b)
+{
+    const IOVectorSortElem *lhs = a;
+    const IOVectorSortElem *rhs = b;
+    void *lhs_base = lhs->src_iov->iov_base;
+    void *rhs_base = rhs->src_iov->iov_base;
+
+    /* Compare instead of subtracting: don't overflow */
+    return (lhs_base > rhs_base) - (lhs_base < rhs_base);
+}
+
+/* qsort() comparator: restore the original order of the source vector. */
+static int sortelem_cmp_src_index(const void *a, const void *b)
+{
+    int lhs_index = ((const IOVectorSortElem *)a)->src_index;
+    int rhs_index = ((const IOVectorSortElem *)b)->src_index;
+
+    return lhs_index - rhs_index;
+}
+
+/**
+ * Clone the layout of an I/O vector onto another buffer
+ *
+ * The relative relationships of overlapping iovecs are preserved. This is
+ * necessary to ensure identical semantics in the cloned I/O vector.
+ *
+ * No payload bytes are copied by this function: each source element only
+ * gets an address inside @buf and is appended to @dest with the same length.
+ *
+ * @dest: initialised I/O vector that receives the new elements
+ * @src: I/O vector whose layout is reproduced
+ * @buf: start of the memory backing the elements of @dest
+ */
+void qemu_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, void *buf)
+{
+ IOVectorSortElem sortelems[src->niov];
+ void *last_end;
+ int i;
+
+ /* Sort by source iovecs by base address */
+ for (i = 0; i < src->niov; i++) {
+ sortelems[i].src_index = i;
+ sortelems[i].src_iov = &src->iov[i];
+ }
+ qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base);
+
+ /* Allocate buffer space taking into account overlapping iovecs */
+ last_end = NULL;
+ for (i = 0; i < src->niov; i++) {
+ struct iovec *cur = sortelems[i].src_iov;
+ ptrdiff_t rewind = 0;
+
+ /* Detect overlap */
+ if (last_end && last_end > cur->iov_base) {
+ rewind = last_end - cur->iov_base;
+ }
+
+ /* an overlapping element starts 'rewind' bytes back in the clone too */
+ sortelems[i].dest_base = buf - rewind;
+ /* only advance by the part that lies beyond what is already placed */
+ buf += cur->iov_len - MIN(rewind, cur->iov_len);
+ /* highest source address covered so far */
+ last_end = MAX(cur->iov_base + cur->iov_len, last_end);
+ }
+
+ /* Sort by source iovec index and build destination iovec */
+ qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index);
+ for (i = 0; i < src->niov; i++) {
+ qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len);
+ }
+}
+
/*
* Checks if a buffer is all zeroes
*
diff --git a/qemu-common.h b/qemu-common.h
index 095e28d..724d08a 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -371,6 +371,8 @@ size_t qemu_iovec_from_buf(QEMUIOVector *qiov, size_t offset,
const void *buf, size_t bytes);
size_t qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
int fillc, size_t bytes);
+ssize_t qemu_iovec_compare(QEMUIOVector *a, QEMUIOVector *b);
+void qemu_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, void *buf);
bool buffer_is_zero(const void *buf, size_t len);
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 06/11] quorum: Add quorum_co_flush().
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
` (4 preceding siblings ...)
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 05/11] blkverify: Extract qemu_iovec_clone() and qemu_iovec_compare() from blkverify Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 07/11] quorum: Add quorum_aio_readv Benoît Canet
` (4 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/block/quorum.c b/block/quorum.c
index cd11cfb..f83b4cf 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -282,6 +282,18 @@ static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
return &acb->common;
}
+/*
+ * Flush every member of the quorum.
+ *
+ * All members are always flushed. The result follows the rule used by
+ * quorum_aio_bh(): 0 when at least 'threshold' members flushed
+ * successfully, otherwise the first error reported by bdrv_co_flush()
+ * instead of silently reporting success.
+ */
+static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
+{
+    BDRVQuorumState *s = bs->opaque;
+    int success_count = 0;
+    int first_error = 0;
+    int i;
+
+    for (i = 0; i < s->total; i++) {
+        int ret = bdrv_co_flush(s->bs[i]);
+
+        if (ret == 0) {
+            success_count++;
+        } else if (first_error == 0) {
+            first_error = ret;
+        }
+    }
+
+    return s->threshold <= success_count ? 0 : first_error;
+}
+
static BlockDriver bdrv_quorum = {
.format_name = "quorum",
.protocol_name = "quorum",
@@ -290,6 +302,7 @@ static BlockDriver bdrv_quorum = {
.bdrv_file_open = quorum_open,
.bdrv_close = quorum_close,
+ .bdrv_co_flush_to_disk = quorum_co_flush,
.bdrv_aio_writev = quorum_aio_writev,
};
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 07/11] quorum: Add quorum_aio_readv.
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
` (5 preceding siblings ...)
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 06/11] quorum: Add quorum_co_flush() Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 08/11] quorum: Add quorum mechanism Benoît Canet
` (3 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 38 +++++++++++++++++++++++++++++++++++++-
1 file changed, 37 insertions(+), 1 deletion(-)
diff --git a/block/quorum.c b/block/quorum.c
index f83b4cf..791ef4a 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -193,15 +193,24 @@ static void quorum_aio_bh(void *opaque)
{
QuorumAIOCB *acb = opaque;
BDRVQuorumState *s = acb->bqs;
- int ret;
+ int i, ret;
ret = s->threshold <= acb->success_count ? 0 : -EIO;
+ for (i = 0; i < s->total; i++) {
+ qemu_vfree(acb->aios[i].buf);
+ acb->aios[i].buf = NULL;
+ acb->aios[i].ret = 0;
+ }
+
qemu_bh_delete(acb->bh);
acb->common.cb(acb->common.opaque, ret);
if (acb->finished) {
*acb->finished = true;
}
+ for (i = 0; i < s->total; i++) {
+ qemu_iovec_destroy(&acb->qiovs[i]);
+ }
g_free(acb->aios);
g_free(acb->qiovs);
qemu_aio_release(acb);
@@ -261,6 +270,32 @@ static void quorum_aio_cb(void *opaque, int ret)
qemu_bh_schedule(acb->bh);
}
+/*
+ * Read the same sectors from every member, each into its own buffer.
+ *
+ * For every member a buffer of qiov->size bytes is allocated and a private
+ * I/O vector with the layout of the caller's @qiov is built on top of it.
+ * Buffers and private vectors are released by quorum_aio_bh(). Each member
+ * read is submitted with its private vector, so members cannot overwrite
+ * each other's data and the copies stay available for comparison.
+ *
+ * NOTE(review): the caller's @qiov is not filled here; the completion path
+ * (the vote hook) has to copy the elected version into acb->qiov - confirm
+ * against the vote implementation.
+ * NOTE(review): bs->file is used as the alignment reference as before; this
+ * driver is not shown to set it - confirm.
+ */
+static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
+                                          int64_t sector_num,
+                                          QEMUIOVector *qiov,
+                                          int nb_sectors,
+                                          BlockDriverCompletionFunc *cb,
+                                          void *opaque)
+{
+    BDRVQuorumState *s = bs->opaque;
+    QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
+                                      nb_sectors, cb, opaque);
+    int i;
+
+    /* set up every member before the first submission */
+    for (i = 0; i < s->total; i++) {
+        acb->aios[i].buf = qemu_blockalign(bs->file, qiov->size);
+        qemu_iovec_init(&acb->qiovs[i], qiov->niov);
+        qemu_iovec_clone(&acb->qiovs[i], qiov, acb->aios[i].buf);
+    }
+
+    for (i = 0; i < s->total; i++) {
+        /* read into the member's private vector, not the caller's */
+        acb->aios[i].aiocb = bdrv_aio_readv(s->bs[i], sector_num,
+                                            &acb->qiovs[i], nb_sectors,
+                                            quorum_aio_cb, &acb->aios[i]);
+    }
+
+    return &acb->common;
+}
+
static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
int64_t sector_num,
QEMUIOVector *qiov,
@@ -304,6 +339,7 @@ static BlockDriver bdrv_quorum = {
.bdrv_close = quorum_close,
.bdrv_co_flush_to_disk = quorum_co_flush,
+ .bdrv_aio_readv = quorum_aio_readv,
.bdrv_aio_writev = quorum_aio_writev,
};
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 08/11] quorum: Add quorum mechanism.
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
` (6 preceding siblings ...)
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 07/11] quorum: Add quorum_aio_readv Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 09/11] quorum: Add quorum_getlength() Benoît Canet
` (2 subsequent siblings)
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 222 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 221 insertions(+), 1 deletion(-)
diff --git a/block/quorum.c b/block/quorum.c
index 791ef4a..3fa9d53 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -14,6 +14,20 @@
*/
#include "block_int.h"
+#include "zlib.h"
+
+/* One member that voted for a given version. */
+typedef struct QuorumVoteItem {
+ int index; /* member index, usable with BDRVQuorumState.filenames[] */
+ QLIST_ENTRY(QuorumVoteItem) next;
+} QuorumVoteItem;
+
+/* One distinct version of the data, identified by its checksum. */
+typedef struct QuorumVoteVersion {
+ unsigned long value; /* checksum shared by all members of this version */
+ int index; /* index of the member that introduced this version */
+ int vote_count; /* number of votes counted for this version */
+ QLIST_HEAD(, QuorumVoteItem) items; /* members that voted for this version */
+ QLIST_ENTRY(QuorumVoteVersion) next;
+} QuorumVoteVersion;
typedef struct {
BlockDriverState **bs;
@@ -31,6 +45,10 @@ typedef struct QuorumSingleAIOCB {
QuorumAIOCB *parent;
} QuorumSingleAIOCB;
+/* All versions seen while voting on one request. */
+typedef struct QuorumVotes {
+ QLIST_HEAD(, QuorumVoteVersion) vote_list; /* one entry per distinct checksum */
+} QuorumVotes;
+
struct QuorumAIOCB {
BlockDriverAIOCB common;
BDRVQuorumState *bqs;
@@ -48,6 +66,8 @@ struct QuorumAIOCB {
int success_count; /* number of successfully completed AIOCB */
bool *finished; /* completion signal for cancel */
+ QuorumVotes votes;
+
void (*vote)(QuorumAIOCB *acb);
int vote_ret;
};
@@ -204,6 +224,11 @@ static void quorum_aio_bh(void *opaque)
}
qemu_bh_delete(acb->bh);
+
+ if (acb->vote_ret) {
+ ret = acb->vote_ret;
+ }
+
acb->common.cb(acb->common.opaque, ret);
if (acb->finished) {
*acb->finished = true;
@@ -239,6 +264,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
acb->nb_sectors = nb_sectors;
acb->vote = NULL;
acb->vote_ret = 0;
+ QLIST_INIT(&acb->votes.vote_list);
for (i = 0; i < s->total; i++) {
acb->aios[i].buf = NULL;
@@ -266,10 +292,202 @@ static void quorum_aio_cb(void *opaque, int ret)
return;
}
+ /* Do the vote */
+ if (acb->vote) {
+ acb->vote(acb);
+ }
+
acb->bh = qemu_bh_new(quorum_aio_bh, acb);
qemu_bh_schedule(acb->bh);
}
+static void quorum_print_bad(QuorumAIOCB *acb, const char *filename)
+{
+ fprintf(stderr, "quorum: corrected error in quorum file %s: sector_num=%"
+ PRId64 " nb_sectors=%i\n", filename, acb->sector_num,
+ acb->nb_sectors);
+}
+
+static void quorum_print_failure(QuorumAIOCB *acb)
+{
+ fprintf(stderr, "quorum: failure sector_num=%" PRId64 " nb_sectors=%i\n",
+ acb->sector_num, acb->nb_sectors);
+}
+
+static void quorum_print_bad_versions(QuorumAIOCB *acb,
+ unsigned long checksum)
+{
+ QuorumVoteVersion *version;
+ QuorumVoteItem *item;
+ BDRVQuorumState *s = acb->bqs;
+
+ QLIST_FOREACH(version, &acb->votes.vote_list, next) {
+ if (version->value == checksum) {
+ continue;
+ }
+ QLIST_FOREACH(item, &version->items, next) {
+ quorum_print_bad(acb, s->filenames[item->index]);
+ }
+ }
+}
+
+static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
+{
+ int i;
+ assert(dest->niov == source->niov);
+ assert(dest->size == source->size);
+ for (i = 0; i < source->niov; i++) {
+ assert(dest->iov[i].iov_len == source->iov[i].iov_len);
+ memcpy(dest->iov[i].iov_base,
+ source->iov[i].iov_base,
+ source->iov[i].iov_len);
+ }
+}
+
+static void quorum_count_vote(QuorumVotes *votes,
+ unsigned long checksum,
+ int index)
+{
+ QuorumVoteVersion *v = NULL, *version = NULL;
+ QuorumVoteItem *item;
+
+ /* look if we have something with this checksum */
+ QLIST_FOREACH(v, &votes->vote_list, next) {
+ if (v->value == checksum) {
+ version = v;
+ break;
+ }
+ }
+
+ /* It's a version not yet in the list add it */
+ if (!version) {
+ version = g_new0(QuorumVoteVersion, 1);
+ QLIST_INIT(&version->items);
+ version->value = checksum;
+ version->index = index;
+ version->vote_count = 0;
+ QLIST_INSERT_HEAD(&votes->vote_list, version, next);
+ }
+
+ version->vote_count++;
+
+ item = g_new0(QuorumVoteItem, 1);
+ item->index = index;
+ QLIST_INSERT_HEAD(&version->items, item, next);
+}
+
+static void quorum_free_vote_list(QuorumVotes *votes)
+{
+ QuorumVoteVersion *version, *next_version;
+ QuorumVoteItem *item, *next_item;
+
+ QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
+ QLIST_REMOVE(version, next);
+ QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
+ QLIST_REMOVE(item, next);
+ g_free(item);
+ }
+ g_free(version);
+ }
+}
+
+static unsigned long quorum_compute_checksum(QuorumAIOCB *acb, int i)
+{
+ int j;
+ unsigned long adler = adler32(0L, Z_NULL, 0);
+ QEMUIOVector *qiov = &acb->qiovs[i];
+
+ for (j = 0; j < qiov->niov; j++) {
+ adler = adler32(adler,
+ qiov->iov[j].iov_base,
+ qiov->iov[j].iov_len);
+ }
+
+ return adler;
+}
+
+static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
+{
+ int i = 0;
+ QuorumVoteVersion *candidate, *winner = NULL;
+
+ QLIST_FOREACH(candidate, &votes->vote_list, next) {
+ if (candidate->vote_count > i) {
+ i = candidate->vote_count;
+ winner = candidate;
+ }
+ }
+
+ return winner;
+}
+
+static void quorum_vote(QuorumAIOCB *acb)
+{
+ bool quorum = true;
+ int i, j;
+ unsigned long checksum = 0;
+ BDRVQuorumState *s = acb->bqs;
+ QuorumVoteVersion *winner;
+
+ /* get the index of the first successfull read */
+ for (i = 0; i < s->total; i++) {
+ if (!acb->aios[i].ret) {
+ break;
+ }
+ }
+
+ /* compare this read with all other successfull read looking for quorum */
+ for (j = i + 1; j < s->total; j++) {
+ if (acb->aios[j].ret) {
+ continue;
+ }
+ if (qemu_iovec_compare(&acb->qiovs[i],
+ &acb->qiovs[j]) != -1) {
+ quorum = false;
+ break;
+ }
+ }
+
+ /* Every successfull read agrees -> Quorum */
+ if (quorum) {
+ quorum_copy_qiov(acb->qiov, &acb->qiovs[i]);
+ return;
+ }
+
+ /* compute checksums for each successfull read, also store indexes */
+ for (i = 0; i < s->total; i++) {
+ if (acb->aios[i].ret) {
+ continue;
+ }
+ checksum = quorum_compute_checksum(acb, i);
+ quorum_count_vote(&acb->votes, checksum, i);
+ }
+
+ /* vote to select the most represented version */
+ winner = quorum_get_vote_winner(&acb->votes);
+ assert(winner != NULL);
+
+ /* if the winner count is smaller than threshold read fail */
+ if (winner->vote_count < s->threshold) {
+ quorum_print_failure(acb);
+ acb->vote_ret = -EIO;
+ fprintf(stderr, "quorum: vote result inferior to threshold\n");
+ goto free_exit;
+ }
+
+ /* we have a winner: copy it */
+ quorum_copy_qiov(acb->qiov, &acb->qiovs[winner->index]);
+
+ /* if some versions are bad print them */
+ if (i < s->total) {
+ quorum_print_bad_versions(acb, winner->value);
+ }
+
+free_exit:
+ /* free lists */
+ quorum_free_vote_list(&acb->votes);
+}
+
static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
int64_t sector_num,
QEMUIOVector *qiov,
@@ -282,6 +500,8 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
nb_sectors, cb, opaque);
int i;
+ acb->vote = quorum_vote;
+
for (i = 0; i < s->total; i++) {
acb->aios[i].buf = qemu_blockalign(bs->file, qiov->size);
qemu_iovec_init(&acb->qiovs[i], qiov->niov);
@@ -289,7 +509,7 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
}
for (i = 0; i < s->total; i++) {
- bdrv_aio_readv(s->bs[i], sector_num, qiov, nb_sectors,
+ bdrv_aio_readv(s->bs[i], sector_num, &acb->qiovs[i], nb_sectors,
quorum_aio_cb, &acb->aios[i]);
}
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 09/11] quorum: Add quorum_getlength().
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
` (7 preceding siblings ...)
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 08/11] quorum: Add quorum mechanism Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 18:03 ` Eric Blake
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 10/11] quorum: Add quorum_invalidate_cache() Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 11/11] quorum: Add quorum_co_is_allocated Benoît Canet
10 siblings, 1 reply; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 24 ++++++++++++++++++++++++
1 file changed, 24 insertions(+)
diff --git a/block/quorum.c b/block/quorum.c
index 3fa9d53..09eed84 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -549,12 +549,36 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
return 0;
}
+static int64_t quorum_getlength(BlockDriverState *bs)
+{
+ BDRVQuorumState *s = bs->opaque;
+ QuorumVoteVersion *winner = NULL;
+ QuorumVotes votes;
+ int64_t value;
+ int i;
+
+ QLIST_INIT(&votes.vote_list);
+ for (i = 0; i < s->total; i++) {
+ quorum_count_vote(&votes, (unsigned long) bdrv_getlength(s->bs[i]), i);
+ }
+
+ /* vote to select the most represented version */
+ winner = quorum_get_vote_winner(&votes);
+
+ value = (int64_t) winner->value;
+ quorum_free_vote_list(&votes);
+
+ return value;
+}
+
static BlockDriver bdrv_quorum = {
.format_name = "quorum",
.protocol_name = "quorum",
.instance_size = sizeof(BDRVQuorumState),
+ .bdrv_getlength = quorum_getlength,
+
.bdrv_file_open = quorum_open,
.bdrv_close = quorum_close,
.bdrv_co_flush_to_disk = quorum_co_flush,
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [Qemu-devel] [RFC V5 09/11] quorum: Add quorum_getlength().
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 09/11] quorum: Add quorum_getlength() Benoît Canet
@ 2012-08-27 18:03 ` Eric Blake
2012-08-27 19:23 ` Benoît Canet
0 siblings, 1 reply; 16+ messages in thread
From: Eric Blake @ 2012-08-27 18:03 UTC (permalink / raw)
To: Benoît Canet
Cc: kwolf, Benoît Canet, qemu-devel, blauwirbel, pbonzini,
stefanha
[-- Attachment #1: Type: text/plain, Size: 968 bytes --]
On 08/27/2012 01:30 AM, Benoît Canet wrote:
> Signed-off-by: Benoit Canet <benoit@irqsave.net>
> ---
> block/quorum.c | 24 ++++++++++++++++++++++++
> 1 file changed, 24 insertions(+)
Say I'm using a 2/3 quorum. What happens if:
image A and B agree on initial content, but not length
image B and C agree on length, but not content
Does disagreeing with quorum consensus invalidate that member of the
quorum from influencing further decisions? If there is a length
discrepancy, should we declare the quorum failed rather than relying on
a mere majority vote? Or, if there is a length discrepancy, does
reading the contents beyond the end of the shorter files consider the
contents to be okay if the longer files have only NUL bytes in the extra
length? I'm worried that you haven't fully thought through all the
scenarios here.
--
Eric Blake eblake@redhat.com +1-919-301-3266
Libvirt virtualization library http://libvirt.org
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 620 bytes --]
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [Qemu-devel] [RFC V5 09/11] quorum: Add quorum_getlength().
2012-08-27 18:03 ` Eric Blake
@ 2012-08-27 19:23 ` Benoît Canet
0 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 19:23 UTC (permalink / raw)
To: Eric Blake
Cc: kwolf, Benoît Canet, stefanha, qemu-devel, blauwirbel,
pbonzini
Le Monday 27 Aug 2012 à 12:03:48 (-0600), Eric Blake a écrit :
> On 08/27/2012 01:30 AM, Benoît Canet wrote:
> > Signed-off-by: Benoit Canet <benoit@irqsave.net>
> > ---
> > block/quorum.c | 24 ++++++++++++++++++++++++
> > 1 file changed, 24 insertions(+)
>
> Say I'm using a 2/3 quorum. What happens if:
>
> image A and B agree on initial content, but not length
> image B and C agree on length, but not content
>
> Does disagreeing with quorum consensus invalidate that member of the
> quorum from influencing further decisions? If there is a length
> discrepancy, should we declare the quorum failed rather than relying on
> a mere majority vote? Or, if there is a length discrepancy, does
> reading the contents beyond the end of the shorter files consider the
> contents to be okay if the longer files have only NUL bytes in the extra
> length? I'm worried that you haven't fully thought through all the
> scenarios here.
True,
Maybe I should just check that every quorum file has the same length
and return 0 on error. This way the quorum is disabled in case of a length
discrepancy.
>
> --
> Eric Blake eblake@redhat.com +1-919-301-3266
> Libvirt virtualization library http://libvirt.org
>
^ permalink raw reply [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 10/11] quorum: Add quorum_invalidate_cache().
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
` (8 preceding siblings ...)
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 09/11] quorum: Add quorum_getlength() Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 11/11] quorum: Add quorum_co_is_allocated Benoît Canet
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/block/quorum.c b/block/quorum.c
index 09eed84..c9dcd9c 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -571,6 +571,16 @@ static int64_t quorum_getlength(BlockDriverState *bs)
return value;
}
+static void quorum_invalidate_cache(BlockDriverState *bs)
+{
+ BDRVQuorumState *s = bs->opaque;
+ int i;
+
+ for (i = 0; i < s->total; i++) {
+ bdrv_invalidate_cache(s->bs[i]);
+ }
+}
+
static BlockDriver bdrv_quorum = {
.format_name = "quorum",
.protocol_name = "quorum",
@@ -585,6 +595,7 @@ static BlockDriver bdrv_quorum = {
.bdrv_aio_readv = quorum_aio_readv,
.bdrv_aio_writev = quorum_aio_writev,
+ .bdrv_invalidate_cache = quorum_invalidate_cache,
};
static void bdrv_quorum_init(void)
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [Qemu-devel] [RFC V5 11/11] quorum: Add quorum_co_is_allocated.
2012-08-27 7:30 [Qemu-devel] [RFC V5 00/11] Quorum disk image corruption resiliency Benoît Canet
` (9 preceding siblings ...)
2012-08-27 7:30 ` [Qemu-devel] [RFC V5 10/11] quorum: Add quorum_invalidate_cache() Benoît Canet
@ 2012-08-27 7:30 ` Benoît Canet
10 siblings, 0 replies; 16+ messages in thread
From: Benoît Canet @ 2012-08-27 7:30 UTC (permalink / raw)
To: qemu-devel
Cc: kwolf, stefanha, blauwirbel, pbonzini, eblake, Benoît Canet
Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
block/quorum.c | 32 ++++++++++++++++++++++++++++++++
1 file changed, 32 insertions(+)
diff --git a/block/quorum.c b/block/quorum.c
index c9dcd9c..5a9f598 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -581,6 +581,37 @@ static void quorum_invalidate_cache(BlockDriverState *bs)
}
}
+static int coroutine_fn quorum_co_is_allocated(BlockDriverState *bs,
+ int64_t sector_num,
+ int nb_sectors,
+ int *pnum)
+{
+ BDRVQuorumState *s = bs->opaque;
+ QuorumVoteVersion *winner = NULL;
+ QuorumVotes result_votes, num_votes;
+ int i, result, num;
+
+ QLIST_INIT(&result_votes.vote_list);
+ QLIST_INIT(&num_votes.vote_list);
+
+ for (i = 0; i < s->total; i++) {
+ result = bdrv_co_is_allocated(s->bs[i], sector_num, nb_sectors, &num);
+ quorum_count_vote(&result_votes, result, i);
+ quorum_count_vote(&num_votes, num, i);
+ }
+
+ winner = quorum_get_vote_winner(&result_votes);
+ result = winner->value;
+
+ winner = quorum_get_vote_winner(&num_votes);
+ *pnum = winner->value;
+
+ quorum_free_vote_list(&result_votes);
+ quorum_free_vote_list(&num_votes);
+
+ return result;
+}
+
static BlockDriver bdrv_quorum = {
.format_name = "quorum",
.protocol_name = "quorum",
@@ -596,6 +627,7 @@ static BlockDriver bdrv_quorum = {
.bdrv_aio_readv = quorum_aio_readv,
.bdrv_aio_writev = quorum_aio_writev,
.bdrv_invalidate_cache = quorum_invalidate_cache,
+ .bdrv_co_is_allocated = quorum_co_is_allocated,
};
static void bdrv_quorum_init(void)
--
1.7.9.5
^ permalink raw reply related [flat|nested] 16+ messages in thread