From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:40040) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1W9Nio-0006HS-Vh for qemu-devel@nongnu.org; Fri, 31 Jan 2014 18:42:12 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1W9Nii-0000Ah-Uj for qemu-devel@nongnu.org; Fri, 31 Jan 2014 18:42:06 -0500 Received: from mx1.redhat.com ([209.132.183.28]:55955) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1W9Nii-0000AO-JW for qemu-devel@nongnu.org; Fri, 31 Jan 2014 18:42:00 -0500 Message-ID: <52EC2BE1.7090800@redhat.com> Date: Sat, 01 Feb 2014 00:04:01 +0100 From: Max Reitz MIME-Version: 1.0 References: <1390927974-31325-1-git-send-email-benoit.canet@irqsave.net> <1390927974-31325-7-git-send-email-benoit.canet@irqsave.net> In-Reply-To: <1390927974-31325-7-git-send-email-benoit.canet@irqsave.net> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH V10 06/13] quorum: Add quorum mechanism. List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: =?UTF-8?B?QmVub8OudCBDYW5ldA==?= , qemu-devel@nongnu.org Cc: kwolf@redhat.com, =?UTF-8?B?QmVub8OudCBDYW5ldA==?= , stefanha@redhat.com On 28.01.2014 17:52, Beno=C3=AEt Canet wrote: > From: Beno=C3=AEt Canet > > Use gnutls's SHA-256 to compare versions. > > Signed-off-by: Benoit Canet > --- > block/Makefile.objs | 2 +- > block/quorum.c | 333 +++++++++++++++++++++++++++++++++++++= ++++++++- > configure | 36 +++++ > docs/qmp/qmp-events.txt | 33 +++++ > include/monitor/monitor.h | 2 + > monitor.c | 2 + > 6 files changed, 406 insertions(+), 2 deletions(-) > > diff --git a/block/Makefile.objs b/block/Makefile.objs > index a2650b9..4ca9d43 100644 > --- a/block/Makefile.objs > +++ b/block/Makefile.objs > @@ -3,7 +3,7 @@ block-obj-y +=3D qcow2.o qcow2-refcount.o qcow2-cluster= .o qcow2-snapshot.o qcow2-c > block-obj-y +=3D qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-clu= ster.o > block-obj-y +=3D qed-check.o > block-obj-$(CONFIG_VHDX) +=3D vhdx.o vhdx-endian.o vhdx-log.o > -block-obj-y +=3D quorum.o > +block-obj-$(CONFIG_QUORUM) +=3D quorum.o > block-obj-y +=3D parallels.o blkdebug.o blkverify.o > block-obj-y +=3D snapshot.o qapi.o > block-obj-$(CONFIG_WIN32) +=3D raw-win32.o win32-aio.o > diff --git a/block/quorum.c b/block/quorum.c > index 5bf37b3..c319719 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -13,7 +13,43 @@ > * See the COPYING file in the top-level directory. > */ > =20 > +#include > +#include > #include "block/block_int.h" > +#include "qapi/qmp/qjson.h" > + > +#define HASH_LENGTH 32 > + > +/* This union holds a vote hash value */ > +typedef union QuorumVoteValue { > + char h[HASH_LENGTH]; /* SHA-256 hash */ > + int64_t l; /* simpler 64 bits hash */ > +} QuorumVoteValue; > + > +/* A vote item */ > +typedef struct QuorumVoteItem { > + int index; > + QLIST_ENTRY(QuorumVoteItem) next; > +} QuorumVoteItem; > + > +/* this structure is a vote version. A version is the set of votes sha= ring the > + * same vote value. > + * The set of votes will be tracked with the items field and its cardi= nality is > + * vote_count. > + */ > +typedef struct QuorumVoteVersion { > + QuorumVoteValue value; > + int index; > + int vote_count; > + QLIST_HEAD(, QuorumVoteItem) items; > + QLIST_ENTRY(QuorumVoteVersion) next; > +} QuorumVoteVersion; > + > +/* this structure holds a group of vote versions together */ > +typedef struct QuorumVotes { > + QLIST_HEAD(, QuorumVoteVersion) vote_list; > + int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); > +} QuorumVotes; > =20 > /* the following structure holds the state of one quorum instance */ > typedef struct { > @@ -60,10 +96,14 @@ struct QuorumAIOCB { > int success_count; /* number of successfully completed A= IOCB */ > bool *finished; /* completion signal for cancel */ > =20 > + QuorumVotes votes; > + > bool is_read; > int vote_ret; > }; > =20 > +static void quorum_vote(QuorumAIOCB *acb); > + > static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) > { > QuorumAIOCB *acb =3D container_of(blockacb, QuorumAIOCB, common); > @@ -111,6 +151,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > acb->aios[i].ret =3D 0; > } > =20 > + if (acb->vote_ret) { > + ret =3D acb->vote_ret; > + } Hm, this makes the vote_ret take precedence over other errors returned=20 by the children. If they differ (and both are not 0), we can choose=20 between returning either (vote_ret or =E2=80=9Creal=E2=80=9D errors repor= ted by the=20 child devices). Both are errors, so I don't see any natural precedence.=20 However, generally, vote_ret will contain a more generic error code=20 (i.e., -EIO), thus I could imagine the other error code reported by the=20 child device to be more appropriate, as it might contain more useful=20 information. But, well, on the other had, Quorum is designed for hiding errors=20 reported by a minority of child devices; therefore, hiding such errors=20 in this case as well is probably just consequent. I'll leave this up to you; I'm fine with it as it is and I'd be fine if=20 (for whatever reason) you were to change it. :-) > + > acb->common.cb(acb->common.opaque, ret); > if (acb->finished) { > *acb->finished =3D true; > @@ -122,6 +166,11 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > qemu_aio_release(acb); > } > =20 > +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *= b) > +{ > + return memcmp(a->h, b->h, HASH_LENGTH); > +} > + > static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, > BlockDriverState *bs, > QEMUIOVector *qiov, > @@ -141,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState = *s, > acb->count =3D 0; > acb->success_count =3D 0; > acb->finished =3D NULL; > + acb->votes.compare =3D quorum_sha256_compare; > acb->is_read =3D false; > acb->vote_ret =3D 0; > =20 > @@ -170,9 +220,290 @@ static void quorum_aio_cb(void *opaque, int ret) > return; > } > =20 > + /* Do the vote on read */ > + if (acb->is_read) { > + quorum_vote(acb); > + } > + > quorum_aio_finalize(acb); > } > =20 > +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name) > +{ > + QObject *data; > + data =3D qobject_from_jsonf("{ 'node-name': \"%s\"" > + ", 'sector-num': %" PRId64 > + ", 'sectors-count': %i }", > + node_name, > + acb->sector_num, > + acb->nb_sectors); > + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); > + qobject_decref(data); > +} > + > +static void quorum_report_failure(QuorumAIOCB *acb) > +{ > + QObject *data; > + data =3D qobject_from_jsonf("{ 'sector-num': %" PRId64 > + ", 'sectors-count': %i }", > + acb->sector_num, > + acb->nb_sectors); > + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); > + qobject_decref(data); > +} > + > +static void quorum_report_bad_versions(BDRVQuorumState *s, > + QuorumAIOCB *acb, > + QuorumVoteValue *value) > +{ > + QuorumVoteVersion *version; > + QuorumVoteItem *item; > + > + QLIST_FOREACH(version, &acb->votes.vote_list, next) { > + if (!acb->votes.compare(&version->value, value)) { > + continue; > + } > + QLIST_FOREACH(item, &version->items, next) { > + quorum_report_bad(acb, s->bs[item->index]->node_name); > + } > + } > +} > + > +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > +{ > + int i; > + assert(dest->niov =3D=3D source->niov); > + assert(dest->size =3D=3D source->size); > + for (i =3D 0; i < source->niov; i++) { > + assert(dest->iov[i].iov_len =3D=3D source->iov[i].iov_len); > + memcpy(dest->iov[i].iov_base, > + source->iov[i].iov_base, > + source->iov[i].iov_len); > + } > +} > + > +static void quorum_count_vote(QuorumVotes *votes, > + QuorumVoteValue *value, > + int index) > +{ > + QuorumVoteVersion *v =3D NULL, *version =3D NULL; > + QuorumVoteItem *item; > + > + /* look if we have something with this hash */ > + QLIST_FOREACH(v, &votes->vote_list, next) { > + if (!votes->compare(&v->value, value)) { > + version =3D v; > + break; > + } > + } > + > + /* It's a version not yet in the list add it */ > + if (!version) { > + version =3D g_new0(QuorumVoteVersion, 1); > + QLIST_INIT(&version->items); > + memcpy(&version->value, value, sizeof(version->value)); > + version->index =3D index; > + version->vote_count =3D 0; > + QLIST_INSERT_HEAD(&votes->vote_list, version, next); > + } > + > + version->vote_count++; > + > + item =3D g_new0(QuorumVoteItem, 1); > + item->index =3D index; > + QLIST_INSERT_HEAD(&version->items, item, next); > +} > + > +static void quorum_free_vote_list(QuorumVotes *votes) > +{ > + QuorumVoteVersion *version, *next_version; > + QuorumVoteItem *item, *next_item; > + > + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version)= { > + QLIST_REMOVE(version, next); > + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { > + QLIST_REMOVE(item, next); > + g_free(item); > + } > + g_free(version); > + } > +} > + > +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValu= e *hash) > +{ > + int j, ret; > + gnutls_hash_hd_t dig; > + QEMUIOVector *qiov =3D &acb->aios[i].qiov; > + > + ret =3D gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); > + > + if (ret < 0) { > + return ret; > + } > + > + for (j =3D 0; j < qiov->niov; j++) { > + ret =3D gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].i= ov_len); > + if (ret < 0) { > + break; > + } > + } > + > + gnutls_hash_deinit(dig, (void *) hash); > + return ret; > +} > + > +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) > +{ > + int i =3D 0; > + QuorumVoteVersion *candidate, *winner =3D NULL; > + > + QLIST_FOREACH(candidate, &votes->vote_list, next) { > + if (candidate->vote_count > i) { > + i =3D candidate->vote_count; > + winner =3D candidate; > + } > + } > + > + return winner; > +} > + > +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) > +{ > + int i; > + int result; > + > + assert(a->niov =3D=3D b->niov); > + for (i =3D 0; i < a->niov; i++) { > + assert(a->iov[i].iov_len =3D=3D b->iov[i].iov_len); > + result =3D memcmp(a->iov[i].iov_base, > + b->iov[i].iov_base, > + a->iov[i].iov_len); > + if (result) { > + return false; > + } > + } > + > + return true; > +} > + > +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, > + const char *fmt, ...) > +{ > + va_list ap; > + > + va_start(ap, fmt); > + fprintf(stderr, "quorum: sector_num=3D%" PRId64 " nb_sectors=3D%d = ", > + acb->sector_num, acb->nb_sectors); > + vfprintf(stderr, fmt, ap); > + fprintf(stderr, "\n"); > + va_end(ap); > + exit(1); > +} > + > +static bool quorum_compare(QuorumAIOCB *acb, > + QEMUIOVector *a, > + QEMUIOVector *b) > +{ > + BDRVQuorumState *s =3D acb->bqs; > + ssize_t offset; > + > + /* This driver will replace blkverify in this particular case */ > + if (s->is_blkverify) { > + offset =3D qemu_iovec_compare(a, b); > + if (offset !=3D -1) { > + quorum_err(acb, "contents mismatch in sector %" PRId64, > + acb->sector_num + > + (uint64_t)(offset / BDRV_SECTOR_SIZE)); > + } > + return true; > + } > + > + return quorum_iovec_compare(a, b); > +} > + > +static void quorum_vote(QuorumAIOCB *acb) > +{ > + bool quorum =3D false; > + int i, j, ret; > + QuorumVoteValue hash; > + BDRVQuorumState *s =3D acb->bqs; > + QuorumVoteVersion *winner; > + > + QLIST_INIT(&acb->votes.vote_list); > + > + /* if we don't get any successful read use the first error code */ > + if (!acb->success_count) { > + acb->vote_ret =3D acb->aios[0].ret; > + return; > + } > + > + /* get the index of the first successful read (we are sure to find= one) */ > + for (i =3D 0; i < s->total; i++) { > + if (!acb->aios[i].ret) { > + break; > + } > + } > + > + /* compare this read with all other successful read looking for qu= orum */ > + for (j =3D i + 1; j < s->total; j++) { > + if (acb->aios[j].ret) { > + continue; > + } > + quorum =3D quorum_compare(acb, &acb->aios[i].qiov, &acb->aios[= j].qiov); > + if (!quorum) { > + break; > + } > + } > + > + /* Every successful read agrees and their count is higher or equal= threshold > + * -> Quorum > + */ > + if (quorum && acb->success_count >=3D s->threshold) { > + quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov); > + return; > + } Well, for quorum && acb->success_count < s->threshold, calculating and=20 comparing hashes doesn't help at all (since the vote winner will only be=20 able to get up to acb->success_count votes anyway). We should probably=20 catch this case in the first if condition in this function (change it to=20 =E2=80=9Cif (acb->success_count < s->threshold)=E2=80=9D). > + > + /* compute hashs for each successful read, also store indexes */ > + for (i =3D 0; i < s->total; i++) { > + if (acb->aios[i].ret) { > + continue; > + } > + ret =3D quorum_compute_hash(acb, i, &hash); > + /* if ever the hash computation failed */ > + if (ret < 0) { > + acb->vote_ret =3D ret; > + goto free_exit; > + } > + quorum_count_vote(&acb->votes, &hash, i); > + } > + > + /* vote to select the most represented version */ > + winner =3D quorum_get_vote_winner(&acb->votes); > + /* every vote version are differents -> error */ > + if (!winner) { > + quorum_report_failure(acb); > + acb->vote_ret =3D -EIO; > + goto free_exit; > + } > + > + /* if the winner count is smaller than threshold the read fails */ > + if (winner->vote_count < s->threshold) { > + quorum_report_failure(acb); > + acb->vote_ret =3D -EIO; > + goto free_exit; > + } > + > + /* we have a winner: copy it */ > + quorum_copy_qiov(acb->qiov, &acb->aios[winner->index].qiov); > + > + /* some versions are bad print them */ > + quorum_report_bad_versions(s, acb, &winner->value); > + > +free_exit: > + /* free lists */ > + quorum_free_vote_list(&acb->votes); > +} > + > static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > int64_t sector_num, > QEMUIOVector *qiov, > @@ -194,7 +525,7 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriv= erState *bs, > } > =20 > for (i =3D 0; i < s->total; i++) { > - bdrv_aio_readv(s->bs[i], sector_num, qiov, nb_sectors, > + bdrv_aio_readv(s->bs[i], sector_num, &acb->aios[i].qiov, nb_se= ctors, > quorum_aio_cb, &acb->aios[i]); > } > =20 > diff --git a/configure b/configure > index b472694..5a68738 100755 > --- a/configure > +++ b/configure > @@ -263,6 +263,7 @@ gtkabi=3D"2.0" > tpm=3D"no" > libssh2=3D"" > vhdx=3D"" > +quorum=3D"no" > =20 > # parse CC options first > for opt do > @@ -1000,6 +1001,10 @@ for opt do > ;; > --disable-vhdx) vhdx=3D"no" > ;; > + --disable-quorum) quorum=3D"no" > + ;; > + --enable-quorum) quorum=3D"yes" > + ;; > *) echo "ERROR: unknown option $opt"; show_help=3D"yes" > ;; > esac > @@ -1254,6 +1259,8 @@ Advanced options (experts only): > --enable-libssh2 enable ssh block device support > --disable-vhdx disables support for the Microsoft VHDX im= age format > --enable-vhdx enable support for the Microsoft VHDX imag= e format > + --disable-quorum disable quorum block filter support > + --enable-quorum enable quorum block filter support > =20 > NOTE: The object files are built at the place where configure is laun= ched > EOF > @@ -1923,6 +1930,30 @@ EOF > fi > =20 > ########################################## > +# Quorum probe (check for gnutls) > +if test "$quorum" !=3D "no" ; then > +cat > $TMPC < +#include > +#include > +int main(void) {char data[4096], digest[32]; > +gnutls_hash_fast(GNUTLS_DIG_SHA256, data, 4096, digest); > +return 0; > +} > +EOF > +quorum_tls_cflags=3D`$pkg_config --cflags gnutls 2> /dev/null` > +quorum_tls_libs=3D`$pkg_config --libs gnutls 2> /dev/null` > +if compile_prog "$quorum_tls_cflags" "$quorum_tls_libs" ; then > + qcow_tls=3Dyes > + libs_softmmu=3D"$quorum_tls_libs $libs_softmmu" > + libs_tools=3D"$quorum_tls_libs $libs_softmmu" > + QEMU_CFLAGS=3D"$QEMU_CFLAGS $quorum_tls_cflags" > +else > + echo "gnutls > 2.10.0 required to compile Quorum" > + exit 1 > +fi > +fi > + > +########################################## > # VNC SASL detection > if test "$vnc" =3D "yes" -a "$vnc_sasl" !=3D "no" ; then > cat > $TMPC < @@ -3843,6 +3874,7 @@ echo "libssh2 support $libssh2" > echo "TPM passthrough $tpm_passthrough" > echo "QOM debugging $qom_cast_debug" > echo "vhdx $vhdx" > +echo "Quorum $quorum" > =20 > if test "$sdl_too_old" =3D "yes"; then > echo "-> Your SDL version is too old - please upgrade to have SDL sup= port" > @@ -4241,6 +4273,10 @@ if test "$libssh2" =3D "yes" ; then > echo "CONFIG_LIBSSH2=3Dy" >> $config_host_mak > fi > =20 > +if test "$quorum" =3D "yes" ; then > + echo "CONFIG_QUORUM=3Dy" >> $config_host_mak > +fi > + > if test "$virtio_blk_data_plane" =3D "yes" ; then > echo 'CONFIG_VIRTIO_BLK_DATA_PLANE=3D$(CONFIG_VIRTIO)' >> $config_h= ost_mak > fi > diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt > index 6b87e97..1cbdde7 100644 > --- a/docs/qmp/qmp-events.txt > +++ b/docs/qmp/qmp-events.txt > @@ -500,3 +500,36 @@ Example: > =20 > Note: If action is "reset", "shutdown", or "pause" the WATCHDOG event= is > followed respectively by the RESET, SHUTDOWN, or STOP events. > + > +QUORUM_FAILURE > +-------------- > + > +Emitted by the Quorum block driver when it fail to establish a quorum. Probably rather =E2=80=9Cif=E2=80=9D than =E2=80=9Cwhen=E2=80=9D; also, =E2= =80=9Cfails=E2=80=9D. > + > +Data: > + > +- "sector-num": Number of the first sector of the failed read operat= ion. > +- "sector-count": Failed read operation sector count. > + > +Example: > + > +{ "event": "QUORUM_FAILURE", > + "data": { "sector-num": 345435, "sector-count": 5 }, > + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } > + > +QUORUM_REPORT_BAD > +----------------- > + > +Emitted to report a corruption of a Quorum file. > + > +Data: > + > +- "node-name": The graph node name of the block driver state. The description is not aligned with the others, this is probably due to=20 the change from child-index to node-name=E2=80=A6? Max > +- "sector-num": Number of the first sector of the failed read operat= ion. > +- "sector-count": Failed read operation sector count. > + > +Example: > + > +{ "event": "QUORUM_REPORT_BAD", > + "data": { "node-name": "1.raw", "sector-num": 345435, "sector-cou= nt": 5 }, > + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } > diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h > index 7e5f752..a49ea11 100644 > --- a/include/monitor/monitor.h > +++ b/include/monitor/monitor.h > @@ -49,6 +49,8 @@ typedef enum MonitorEvent { > QEVENT_SPICE_MIGRATE_COMPLETED, > QEVENT_GUEST_PANICKED, > QEVENT_BLOCK_IMAGE_CORRUPTED, > + QEVENT_QUORUM_FAILURE, > + QEVENT_QUORUM_REPORT_BAD, > =20 > /* Add to 'monitor_event_names' array in monitor.c when > * defining new events here */ > diff --git a/monitor.c b/monitor.c > index 80456fb..f8f6aea 100644 > --- a/monitor.c > +++ b/monitor.c > @@ -507,6 +507,8 @@ static const char *monitor_event_names[] =3D { > [QEVENT_SPICE_MIGRATE_COMPLETED] =3D "SPICE_MIGRATE_COMPLETED", > [QEVENT_GUEST_PANICKED] =3D "GUEST_PANICKED", > [QEVENT_BLOCK_IMAGE_CORRUPTED] =3D "BLOCK_IMAGE_CORRUPTED", > + [QEVENT_QUORUM_FAILURE] =3D "QUORUM_FAILURE", > + [QEVENT_QUORUM_REPORT_BAD] =3D "QUORUM_REPORT_BAD", > }; > QEMU_BUILD_BUG_ON(ARRAY_SIZE(monitor_event_names) !=3D QEVENT_MAX) > =20