From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:50795) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1VamUb-0002zV-7Z for qemu-devel@nongnu.org; Mon, 28 Oct 2013 09:04:30 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1VamUV-0000lk-Hi for qemu-devel@nongnu.org; Mon, 28 Oct 2013 09:04:25 -0400 Received: from nodalink.pck.nerim.net ([62.212.105.220]:36945 helo=paradis.irqsave.net) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1VamUU-0000kT-OP for qemu-devel@nongnu.org; Mon, 28 Oct 2013 09:04:19 -0400 Date: Mon, 28 Oct 2013 14:04:02 +0100 From: =?iso-8859-1?Q?Beno=EEt?= Canet Message-ID: <20131028130401.GC2890@irqsave.net> References: <1380717564-11098-1-git-send-email-benoit@irqsave.net> <1380717564-11098-7-git-send-email-benoit@irqsave.net> <524ED52C.7010207@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=iso-8859-1 Content-Disposition: inline In-Reply-To: <524ED52C.7010207@redhat.com> Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism. List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Max Reitz Cc: kwolf@redhat.com, qemu-devel@nongnu.org, stefanha@redhat.com Le Friday 04 Oct 2013 =E0 16:48:12 (+0200), Max Reitz a =E9crit : > On 2013-10-02 14:39, Beno=EEt Canet wrote: > >Use gnutls's SHA-256 to compare versions. > Wouldn't CRC32 suffice? (I don't really oppose using SHA, but taking > in gnutls as a dependency just for comparing several memory areas > seems a bit much to me) >=20 > >Signed-off-by: Benoit Canet > >--- > > block/Makefile.objs | 2 +- > > block/quorum.c | 321 ++++++++++++++++++++++++++++++++++++= +++++++++- > > configure | 36 ++++++ > > include/monitor/monitor.h | 2 + > > monitor.c | 2 + > > 5 files changed, 361 insertions(+), 2 deletions(-) > > > >diff --git a/block/Makefile.objs b/block/Makefile.objs > >index 05a65c2..adcdc21 100644 > >--- a/block/Makefile.objs > >+++ b/block/Makefile.objs > >@@ -3,7 +3,7 @@ block-obj-y +=3D qcow2.o qcow2-refcount.o qcow2-cluste= r.o qcow2-snapshot.o qcow2-c > > block-obj-y +=3D qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cl= uster.o > > block-obj-y +=3D qed-check.o > > block-obj-y +=3D vhdx.o > >-block-obj-y +=3D quorum.o > >+block-obj-$(CONFIG_QUORUM) +=3D quorum.o > > block-obj-y +=3D parallels.o blkdebug.o blkverify.o > > block-obj-y +=3D snapshot.o qapi.o > > block-obj-$(CONFIG_WIN32) +=3D raw-win32.o win32-aio.o > >diff --git a/block/quorum.c b/block/quorum.c > >index f0fc0e9..e235ac1 100644 > >--- a/block/quorum.c > >+++ b/block/quorum.c > >@@ -13,7 +13,43 @@ > > * See the COPYING file in the top-level directory. > > */ > >+#include > >+#include > > #include "block/block_int.h" > >+#include "qapi/qmp/qjson.h" > >+ > >+#define HASH_LENGTH 32 > >+ > >+/* This union hold a vote hash value */ > *holds >=20 > >+typedef union QuorumVoteValue { > >+ char h[HASH_LENGTH]; /* SHA-256 hash */ > >+ int64_t l; /* simpler 64 bits hash */ > >+} QuorumVoteValue; > >+ > >+/* A vote item */ > >+typedef struct QuorumVoteItem { > >+ int index; > >+ QLIST_ENTRY(QuorumVoteItem) next; > >+} QuorumVoteItem; > >+ > >+/* this structure is a vote version. A version is the set of vote sha= ring the > *set of votes >=20 > >+ * same vote value. > >+ * The set of vote will be tracked with the items field and it's coun= t is > *set of votes or *vote set; also s/it's count/its cardinality/ or > something like that >=20 > >+ * vote_count. > >+ */ > >+typedef struct QuorumVoteVersion { > >+ QuorumVoteValue value; > >+ int index; > >+ int vote_count; > >+ QLIST_HEAD(, QuorumVoteItem) items; > >+ QLIST_ENTRY(QuorumVoteVersion) next; > >+} QuorumVoteVersion; > >+ > >+/* this structure hold a group of vote versions together */ > *holds >=20 > >+typedef struct QuorumVotes { > >+ QLIST_HEAD(, QuorumVoteVersion) vote_list; > >+ int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); > >+} QuorumVotes; > > /* the following structure hold the state of one quorum instance */ > > typedef struct { > >@@ -60,10 +96,14 @@ struct QuorumAIOCB { > > int success_count; /* number of successfully completed = AIOCB */ > > bool *finished; /* completion signal for cancel */ > >+ QuorumVotes votes; > >+ > > bool is_read; > > int vote_ret; > > }; > >+static void quorum_vote(QuorumAIOCB *acb); > >+ > > static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) > > { > > QuorumAIOCB *acb =3D container_of(blockacb, QuorumAIOCB, common)= ; > >@@ -111,6 +151,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > > acb->aios[i].ret =3D 0; > > } > >+ if (acb->vote_ret) { > >+ ret =3D acb->vote_ret; > >+ } > >+ > > acb->common.cb(acb->common.opaque, ret); > > if (acb->finished) { > > *acb->finished =3D true; > >@@ -122,6 +166,11 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > > qemu_aio_release(acb); > > } > >+static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue = *b) > >+{ > >+ return memcmp(a->h, b->h, HASH_LENGTH); > >+} > >+ > > static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, > > BlockDriverState *bs, > > QEMUIOVector *qiov, > >@@ -141,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState= *s, > > acb->count =3D 0; > > acb->success_count =3D 0; > > acb->finished =3D NULL; > >+ acb->votes.compare =3D quorum_sha256_compare; > > acb->is_read =3D false; > > acb->vote_ret =3D 0; > >@@ -170,9 +220,278 @@ static void quorum_aio_cb(void *opaque, int ret) > > return; > > } > >+ /* Do the vote on read */ > >+ if (acb->is_read) { > >+ quorum_vote(acb); > >+ } > >+ > > quorum_aio_finalize(acb); > > } > >+static void quorum_report_bad(QuorumAIOCB *acb, int index) > >+{ > >+ QObject *data; > >+ data =3D qobject_from_jsonf("{ 'children-index': %i" > I'd prefer child-index. Generally, remember that the singular of > "children" is "child". >=20 > >+ ", 'sector-num': %" PRId64 > >+ ", 'sectors-count': %i }", > >+ index, > >+ acb->sector_num, > >+ acb->nb_sectors); > >+ monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); > How about decrementing the refcount of data? >=20 > >+} > >+ > >+static void quorum_report_failure(QuorumAIOCB *acb) > >+{ > >+ QObject *data; > >+ data =3D qobject_from_jsonf("{ 'sector-num': %" PRId64 > >+ ", 'sectors-count': %i }", > >+ acb->sector_num, > >+ acb->nb_sectors); > >+ monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); > Same here. >=20 > >+} > >+ > >+static void quorum_report_bad_versions(QuorumAIOCB *acb, > >+ QuorumVoteValue *value) > >+{ > >+ QuorumVoteVersion *version; > >+ QuorumVoteItem *item; > >+ > >+ QLIST_FOREACH(version, &acb->votes.vote_list, next) { > >+ if (!acb->votes.compare(&version->value, value)) { > >+ continue; > >+ } > >+ QLIST_FOREACH(item, &version->items, next) { > >+ quorum_report_bad(acb, item->index); > >+ } > >+ } > >+} > >+ > >+static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source= ) > >+{ > >+ int i; > >+ assert(dest->niov =3D=3D source->niov); > >+ assert(dest->size =3D=3D source->size); > >+ for (i =3D 0; i < source->niov; i++) { > >+ assert(dest->iov[i].iov_len =3D=3D source->iov[i].iov_len); > >+ memcpy(dest->iov[i].iov_base, > >+ source->iov[i].iov_base, > >+ source->iov[i].iov_len); > >+ } > >+} > >+ > >+static void quorum_count_vote(QuorumVotes *votes, > >+ QuorumVoteValue *value, > >+ int index) > >+{ > >+ QuorumVoteVersion *v =3D NULL, *version =3D NULL; > >+ QuorumVoteItem *item; > >+ > >+ /* look if we have something with this hash */ > >+ QLIST_FOREACH(v, &votes->vote_list, next) { > >+ if (!votes->compare(&v->value, value)) { > >+ version =3D v; > >+ break; > >+ } > >+ } > >+ > >+ /* It's a version not yet in the list add it */ > >+ if (!version) { > >+ version =3D g_new0(QuorumVoteVersion, 1); > >+ QLIST_INIT(&version->items); > >+ memcpy(&version->value, value, sizeof(version->value)); > >+ version->index =3D index; > >+ version->vote_count =3D 0; > >+ QLIST_INSERT_HEAD(&votes->vote_list, version, next); > >+ } > >+ > >+ version->vote_count++; > >+ > >+ item =3D g_new0(QuorumVoteItem, 1); > >+ item->index =3D index; > >+ QLIST_INSERT_HEAD(&version->items, item, next); > >+} > >+ > >+static void quorum_free_vote_list(QuorumVotes *votes) > >+{ > >+ QuorumVoteVersion *version, *next_version; > >+ QuorumVoteItem *item, *next_item; > >+ > >+ QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version= ) { > >+ QLIST_REMOVE(version, next); > >+ QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { > >+ QLIST_REMOVE(item, next); > >+ g_free(item); > >+ } > >+ g_free(version); > >+ } > >+} > >+ > >+static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteVal= ue *hash) > >+{ > >+ int j, ret; > >+ gnutls_hash_hd_t dig; > >+ QEMUIOVector *qiov =3D &acb->aios[i].qiov; > >+ > >+ ret =3D gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); > >+ > >+ if (ret < 0) { > >+ return ret; > >+ } > >+ > >+ for (j =3D 0; j < qiov->niov; j++) { > >+ ret =3D gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].= iov_len); > >+ if (ret < 0) { > >+ return ret; > gnutls_hash_deinit? >=20 > >+ } > >+ } > >+ > >+ gnutls_hash_deinit(dig, (void *) hash); > >+ > >+ return 0; > >+} > >+ > >+static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) > >+{ > >+ int i =3D 0; > >+ QuorumVoteVersion *candidate, *winner =3D NULL; > >+ > >+ QLIST_FOREACH(candidate, &votes->vote_list, next) { > >+ if (candidate->vote_count > i) { > >+ i =3D candidate->vote_count; > >+ winner =3D candidate; > >+ } > >+ } > >+ > >+ return winner; > >+} > >+ > >+static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) > >+{ > >+ int i; > >+ int result; > >+ > >+ assert(a->niov =3D=3D b->niov); > >+ for (i =3D 0; i < a->niov; i++) { > >+ assert(a->iov[i].iov_len =3D=3D b->iov[i].iov_len); > >+ result =3D memcmp(a->iov[i].iov_base, > >+ b->iov[i].iov_base, > >+ a->iov[i].iov_len); > >+ if (result) { > >+ return false; > >+ } > >+ } > >+ > >+ return true; > >+} > >+ > >+static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, > >+ const char *fmt, ...) > >+{ > >+ va_list ap; > >+ > >+ va_start(ap, fmt); > >+ fprintf(stderr, "quorum: sector_num=3D%" PRId64 " nb_sectors=3D%d= ", > >+ acb->sector_num, acb->nb_sectors); > >+ vfprintf(stderr, fmt, ap); > >+ fprintf(stderr, "\n"); > >+ va_end(ap); > >+ exit(1); > >+} > >+ > >+static bool quorum_compare(QuorumAIOCB *acb, > >+ QEMUIOVector *a, > >+ QEMUIOVector *b) > >+{ > >+ BDRVQuorumState *s =3D acb->bqs; > >+ ssize_t offset; > >+ > >+ /* This driver will replace blkverify in this particular case */ > >+ if (s->is_blkverify) { > >+ offset =3D qemu_iovec_compare(a, b); > >+ if (offset !=3D -1) { > >+ quorum_err(acb, "contents mismatch in sector %" PRId64, > >+ acb->sector_num + > >+ (uint64_t)(offset / BDRV_SECTOR_SIZE)); > >+ } > >+ return true; > >+ } > >+ > >+ return quorum_iovec_compare(a, b); > >+} > >+ > >+static void quorum_vote(QuorumAIOCB *acb) > >+{ > >+ bool quorum =3D true; > >+ int i, j, ret; > >+ QuorumVoteValue hash; > >+ BDRVQuorumState *s =3D acb->bqs; > >+ QuorumVoteVersion *winner; > >+ > >+ /* get the index of the first successful read */ > >+ for (i =3D 0; i < s->total; i++) { > >+ if (!acb->aios[i].ret) { > >+ break; > >+ } > >+ } > >+ > >+ /* compare this read with all other successful read looking for q= uorum */ > >+ for (j =3D i + 1; j < s->total; j++) { > >+ if (acb->aios[j].ret) { > >+ continue; > >+ } > >+ quorum =3D quorum_compare(acb, &acb->aios[i].qiov, &acb->aios= [j].qiov); > >+ if (!quorum) { > >+ break; > >+ } > >+ } > >+ > >+ /* Every successful read agrees -> Quorum */ > >+ if (quorum) { > >+ quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov); > If no reads were successful at all, quorum will be true and i will > be s->total. Therefore, this will result in an array access with an > index out of bounds here. Thanks a lot for spotting this. >=20 > Generally, why is this function executed at all if any read failed? > If a read fails, the quorum read function will return an error, thus > the caller will ignore the result anyway. I should if there is a quorum between the successfull reads; if all the s= uccess ful reads agrees and their count is greater than threshold. >=20 > >+ return; > >+ } > >+ > >+ /* compute hashs for each successful read, also store indexes */ > >+ for (i =3D 0; i < s->total; i++) { > >+ if (acb->aios[i].ret) { > >+ continue; > >+ } > >+ ret =3D quorum_compute_hash(acb, i, &hash); > >+ /* if ever the hash computation failed */ > >+ if (ret < 0) { > >+ acb->vote_ret =3D ret; > >+ goto free_exit; > >+ } > >+ quorum_count_vote(&acb->votes, &hash, i); > >+ } > >+ > >+ /* vote to select the most represented version */ > >+ winner =3D quorum_get_vote_winner(&acb->votes); > >+ /* every vote version are differents -> error */ > >+ if (!winner) { > >+ quorum_report_failure(acb); > >+ acb->vote_ret =3D -EIO; > >+ goto free_exit; > >+ } > >+ > >+ /* if the winner count is smaller then threshold read fail */ > s/then/than/, s/ read fail/, the read fails/ >=20 > >+ if (winner->vote_count < s->threshold) { > >+ quorum_report_failure(acb); > >+ acb->vote_ret =3D -EIO; > >+ goto free_exit; > >+ } > >+ > >+ /* we have a winner: copy it */ > >+ quorum_copy_qiov(acb->qiov, &acb->aios[winner->index].qiov); > >+ > >+ /* some versions are bad print them */ > >+ quorum_report_bad_versions(acb, &winner->value); > >+ > >+free_exit: > >+ /* free lists */ > >+ quorum_free_vote_list(&acb->votes); > >+} > >+ > > static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > > int64_t sector_num, > > QEMUIOVector *qiov, > >@@ -194,7 +513,7 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDri= verState *bs, > > } > > for (i =3D 0; i < s->total; i++) { > >- bdrv_aio_readv(&s->bs[i], sector_num, qiov, nb_sectors, > >+ bdrv_aio_readv(&s->bs[i], sector_num, &acb->aios[i].qiov, nb_= sectors, > > quorum_aio_cb, &acb->aios[i]); > > } > >diff --git a/configure b/configure > >index 23dbaaf..efbe7d9 100755 > >--- a/configure > >+++ b/configure > >@@ -246,6 +246,7 @@ gtk=3D"" > > gtkabi=3D"2.0" > > tpm=3D"no" > > libssh2=3D"" > >+quorum=3D"no" > > # parse CC options first > > for opt do > >@@ -972,6 +973,10 @@ for opt do > > ;; > > --enable-libssh2) libssh2=3D"yes" > > ;; > >+ --disable-quorum) quorum=3D"no" > >+ ;; > >+ --enable-quorum) quorum=3D"yes" > >+ ;; > > *) echo "ERROR: unknown option $opt"; show_help=3D"yes" > > ;; > > esac > >@@ -1203,6 +1208,8 @@ echo " --gcov=3DGCOV use specified= gcov [$gcov_tool]" > > echo " --enable-tpm enable TPM support" > > echo " --disable-libssh2 disable ssh block device support" > > echo " --enable-libssh2 enable ssh block device support" > >+echo " --disable-quorum disable quorum block filter support" > >+echo " --enable-quorum enable quorum block filter support" > > echo "" > > echo "NOTE: The object files are built at the place where configure = is launched" > > exit 1 > >@@ -1895,6 +1902,30 @@ EOF > > fi > > ########################################## > >+# Quorum probe (check for gnutls) > >+if test "$quorum" !=3D "no" ; then > >+cat > $TMPC < >+#include > >+#include > >+int main(void) {char data[4096], digest[32]; > >+gnutls_hash_fast(GNUTLS_DIG_SHA256, data, 4096, digest); > >+return 0; > >+} > >+EOF > >+quorum_tls_cflags=3D`$pkg_config --cflags gnutls 2> /dev/null` > >+quorum_tls_libs=3D`$pkg_config --libs gnutls 2> /dev/null` > >+if compile_prog "$quorum_tls_cflags" "$quorum_tls_libs" ; then > >+ qcow_tls=3Dyes > >+ libs_softmmu=3D"$quorum_tls_libs $libs_softmmu" > >+ libs_tools=3D"$quorum_tls_libs $libs_softmmu" > >+ QEMU_CFLAGS=3D"$QEMU_CFLAGS $quorum_tls_cflags" > >+else > >+ echo "gnutls > 2.10.0 required to compile Quorum" > >+ exit 1 > >+fi > >+fi > >+ > >+########################################## > > # VNC SASL detection > > if test "$vnc" =3D "yes" -a "$vnc_sasl" !=3D "no" ; then > > cat > $TMPC < >@@ -3758,6 +3789,7 @@ echo "TPM support $tpm" > > echo "libssh2 support $libssh2" > > echo "TPM passthrough $tpm_passthrough" > > echo "QOM debugging $qom_cast_debug" > >+echo "Quorum $quorum" > > if test "$sdl_too_old" =3D "yes"; then > > echo "-> Your SDL version is too old - please upgrade to have SDL su= pport" > >@@ -4156,6 +4188,10 @@ if test "$libssh2" =3D "yes" ; then > > echo "CONFIG_LIBSSH2=3Dy" >> $config_host_mak > > fi > >+if test "$quorum" =3D "yes" ; then > >+ echo "CONFIG_QUORUM=3Dy" >> $config_host_mak > >+fi > >+ > > if test "$virtio_blk_data_plane" =3D "yes" ; then > > echo 'CONFIG_VIRTIO_BLK_DATA_PLANE=3D$(CONFIG_VIRTIO)' >> $config_= host_mak > > fi > >diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h > >index 10fa0e3..51902ed 100644 > >--- a/include/monitor/monitor.h > >+++ b/include/monitor/monitor.h > >@@ -49,6 +49,8 @@ typedef enum MonitorEvent { > > QEVENT_SPICE_MIGRATE_COMPLETED, > > QEVENT_GUEST_PANICKED, > > QEVENT_BLOCK_IMAGE_CORRUPTED, > >+ QEVENT_QUORUM_FAILURE, > >+ QEVENT_QUORUM_REPORT_BAD, > > /* Add to 'monitor_event_names' array in monitor.c when > > * defining new events here */ > >diff --git a/monitor.c b/monitor.c > >index 74f3f1b..89d139f 100644 > >--- a/monitor.c > >+++ b/monitor.c > >@@ -507,6 +507,8 @@ static const char *monitor_event_names[] =3D { > > [QEVENT_SPICE_MIGRATE_COMPLETED] =3D "SPICE_MIGRATE_COMPLETED", > > [QEVENT_GUEST_PANICKED] =3D "GUEST_PANICKED", > > [QEVENT_BLOCK_IMAGE_CORRUPTED] =3D "BLOCK_IMAGE_CORRUPTED", > >+ [QEVENT_QUORUM_FAILURE] =3D "QUORUM_FAILURE", > >+ [QEVENT_QUORUM_REPORT_BAD] =3D "QUORUM_REPORT_BAD", > Could you document these events in docs/qmp/qmp-events.txt? >=20 > Max >=20 > > }; > > QEMU_BUILD_BUG_ON(ARRAY_SIZE(monitor_event_names) !=3D QEVENT_MAX) >=20 >=20