From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:54126) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1WAPNr-0000Tf-V5 for qemu-devel@nongnu.org; Mon, 03 Feb 2014 14:40:49 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1WAPNm-0002TB-Se for qemu-devel@nongnu.org; Mon, 03 Feb 2014 14:40:43 -0500 Received: from mx1.redhat.com ([209.132.183.28]:45911) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1WAPNm-0002T0-E4 for qemu-devel@nongnu.org; Mon, 03 Feb 2014 14:40:38 -0500 Message-ID: <52EFF133.9060604@redhat.com> Date: Mon, 03 Feb 2014 20:42:43 +0100 From: Max Reitz MIME-Version: 1.0 References: <1391454712-14353-1-git-send-email-benoit.canet@irqsave.net> <1391454712-14353-7-git-send-email-benoit.canet@irqsave.net> In-Reply-To: <1391454712-14353-7-git-send-email-benoit.canet@irqsave.net> Content-Type: text/plain; charset=UTF-8; format=flowed Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH V14 06/13] quorum: Add quorum mechanism. List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: =?UTF-8?B?QmVub8OudCBDYW5ldA==?= , qemu-devel@nongnu.org Cc: kwolf@redhat.com, =?UTF-8?B?QmVub8OudCBDYW5ldA==?= , stefanha@redhat.com On 03.02.2014 20:11, Beno=C3=AEt Canet wrote: > From: Beno=C3=AEt Canet > > Use gnutls's SHA-256 to compare versions. > > Signed-off-by: Benoit Canet > --- > block/Makefile.objs | 2 +- > block/quorum.c | 401 +++++++++++++++++++++++++++++++++++++= ++++++--- > configure | 36 +++++ > docs/qmp/qmp-events.txt | 33 ++++ > include/monitor/monitor.h | 2 + > monitor.c | 2 + > 6 files changed, 457 insertions(+), 19 deletions(-) > > diff --git a/block/Makefile.objs b/block/Makefile.objs > index a2650b9..4ca9d43 100644 > --- a/block/Makefile.objs > +++ b/block/Makefile.objs > @@ -3,7 +3,7 @@ block-obj-y +=3D qcow2.o qcow2-refcount.o qcow2-cluster= .o qcow2-snapshot.o qcow2-c > block-obj-y +=3D qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-clu= ster.o > block-obj-y +=3D qed-check.o > block-obj-$(CONFIG_VHDX) +=3D vhdx.o vhdx-endian.o vhdx-log.o > -block-obj-y +=3D quorum.o > +block-obj-$(CONFIG_QUORUM) +=3D quorum.o > block-obj-y +=3D parallels.o blkdebug.o blkverify.o > block-obj-y +=3D snapshot.o qapi.o > block-obj-$(CONFIG_WIN32) +=3D raw-win32.o win32-aio.o > diff --git a/block/quorum.c b/block/quorum.c > index 5b11ac1..56ad6a0 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -13,7 +13,43 @@ > * See the COPYING file in the top-level directory. > */ > =20 > +#include > +#include > #include "block/block_int.h" > +#include "qapi/qmp/qjson.h" > + > +#define HASH_LENGTH 32 > + > +/* This union holds a vote hash value */ > +typedef union QuorumVoteValue { > + char h[HASH_LENGTH]; /* SHA-256 hash */ > + int64_t l; /* simpler 64 bits hash */ > +} QuorumVoteValue; > + > +/* A vote item */ > +typedef struct QuorumVoteItem { > + int index; > + QLIST_ENTRY(QuorumVoteItem) next; > +} QuorumVoteItem; > + > +/* this structure is a vote version. A version is the set of votes sha= ring the > + * same vote value. > + * The set of votes will be tracked with the items field and its cardi= nality is > + * vote_count. > + */ > +typedef struct QuorumVoteVersion { > + QuorumVoteValue value; > + int index; > + int vote_count; > + QLIST_HEAD(, QuorumVoteItem) items; > + QLIST_ENTRY(QuorumVoteVersion) next; > +} QuorumVoteVersion; > + > +/* this structure holds a group of vote versions together */ > +typedef struct QuorumVotes { > + QLIST_HEAD(, QuorumVoteVersion) vote_list; > + int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); > +} QuorumVotes; > =20 > /* the following structure holds the state of one quorum instance */ > typedef struct { > @@ -60,10 +96,14 @@ struct QuorumAIOCB { > int success_count; /* number of successfully completed A= IOCB */ > bool *finished; /* completion signal for cancel */ > =20 > + QuorumVotes votes; > + > bool is_read; > int vote_ret; > }; > =20 > +static void quorum_vote(QuorumAIOCB *acb); > + > static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) > { > QuorumAIOCB *acb =3D container_of(blockacb, QuorumAIOCB, common); > @@ -81,29 +121,14 @@ static AIOCBInfo quorum_aiocb_info =3D { > .cancel =3D quorum_aio_cancel, > }; > =20 > -/* return the first error code get by each individual callbacks */ > -static int quorum_get_first_error(QuorumAIOCB *acb) > -{ > - BDRVQuorumState *s =3D acb->bqs; > - int i, ret =3D 0; > - > - for (i =3D 0; i < s->total; i++) { > - ret =3D acb->aios[i].ret; > - if (ret) { > - return ret; > - } > - } > - > - /* should not pass here */ > - assert(false); > -} > +static int quorum_vote_error(QuorumAIOCB *acb); > =20 > static void quorum_aio_finalize(QuorumAIOCB *acb) > { > BDRVQuorumState *s =3D acb->bqs; > int i, ret; > =20 > - ret =3D s->threshold <=3D acb->success_count ? 0 : quorum_get_firs= t_error(acb); > + ret =3D s->threshold <=3D acb->success_count ? 0 : quorum_vote_err= or(acb); > =20 > for (i =3D 0; i < s->total; i++) { > qemu_vfree(acb->aios[i].buf); > @@ -111,6 +136,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > acb->aios[i].ret =3D 0; > } > =20 > + if (acb->vote_ret) { > + ret =3D acb->vote_ret; > + } > + > acb->common.cb(acb->common.opaque, ret); > if (acb->finished) { > *acb->finished =3D true; > @@ -122,6 +151,27 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > qemu_aio_release(acb); > } > =20 > +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *= b) > +{ > + return memcmp(a->h, b->h, HASH_LENGTH); > +} > + > +static int quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *= b) > +{ > + int64_t i =3D a->l; > + int64_t j =3D b->l; > + > + if (i < j) { > + return -1; > + } > + > + if (i > j) { > + return 1; > + } > + > + return 0; > +} > + > static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, > BlockDriverState *bs, > QEMUIOVector *qiov, > @@ -141,6 +191,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState = *s, > acb->count =3D 0; > acb->success_count =3D 0; > acb->finished =3D NULL; > + acb->votes.compare =3D quorum_sha256_compare; > acb->is_read =3D false; > acb->vote_ret =3D 0; > =20 > @@ -170,9 +221,323 @@ static void quorum_aio_cb(void *opaque, int ret) > return; > } > =20 > + /* Do the vote on read */ > + if (acb->is_read) { > + quorum_vote(acb); > + } > + > quorum_aio_finalize(acb); > } > =20 > +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name) > +{ > + QObject *data; > + data =3D qobject_from_jsonf("{ 'node-name': \"%s\"" > + ", 'sector-num': %" PRId64 > + ", 'sectors-count': %i }", > + node_name, > + acb->sector_num, > + acb->nb_sectors); > + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); > + qobject_decref(data); > +} > + > +static void quorum_report_failure(QuorumAIOCB *acb) > +{ > + QObject *data; > + data =3D qobject_from_jsonf("{ 'sector-num': %" PRId64 > + ", 'sectors-count': %i }", > + acb->sector_num, > + acb->nb_sectors); > + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); > + qobject_decref(data); > +} > + > +static void quorum_report_bad_versions(BDRVQuorumState *s, > + QuorumAIOCB *acb, > + QuorumVoteValue *value) > +{ > + QuorumVoteVersion *version; > + QuorumVoteItem *item; > + > + QLIST_FOREACH(version, &acb->votes.vote_list, next) { > + if (!acb->votes.compare(&version->value, value)) { > + continue; > + } > + QLIST_FOREACH(item, &version->items, next) { > + quorum_report_bad(acb, s->bs[item->index]->node_name); > + } > + } > +} > + > +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > +{ > + int i; > + assert(dest->niov =3D=3D source->niov); > + assert(dest->size =3D=3D source->size); > + for (i =3D 0; i < source->niov; i++) { > + assert(dest->iov[i].iov_len =3D=3D source->iov[i].iov_len); > + memcpy(dest->iov[i].iov_base, > + source->iov[i].iov_base, > + source->iov[i].iov_len); > + } > +} > + > +static void quorum_count_vote(QuorumVotes *votes, > + QuorumVoteValue *value, > + int index) > +{ > + QuorumVoteVersion *v =3D NULL, *version =3D NULL; > + QuorumVoteItem *item; > + > + /* look if we have something with this hash */ > + QLIST_FOREACH(v, &votes->vote_list, next) { > + if (!votes->compare(&v->value, value)) { > + version =3D v; > + break; > + } > + } > + > + /* It's a version not yet in the list add it */ > + if (!version) { > + version =3D g_new0(QuorumVoteVersion, 1); > + QLIST_INIT(&version->items); > + memcpy(&version->value, value, sizeof(version->value)); > + version->index =3D index; > + version->vote_count =3D 0; > + QLIST_INSERT_HEAD(&votes->vote_list, version, next); > + } > + > + version->vote_count++; > + > + item =3D g_new0(QuorumVoteItem, 1); > + item->index =3D index; > + QLIST_INSERT_HEAD(&version->items, item, next); > +} > + > +static void quorum_free_vote_list(QuorumVotes *votes) > +{ > + QuorumVoteVersion *version, *next_version; > + QuorumVoteItem *item, *next_item; > + > + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version)= { > + QLIST_REMOVE(version, next); > + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { > + QLIST_REMOVE(item, next); > + g_free(item); > + } > + g_free(version); > + } > +} > + > +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValu= e *hash) > +{ > + int j, ret; > + gnutls_hash_hd_t dig; > + QEMUIOVector *qiov =3D &acb->aios[i].qiov; > + > + ret =3D gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); > + > + if (ret < 0) { > + return ret; > + } > + > + for (j =3D 0; j < qiov->niov; j++) { > + ret =3D gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].i= ov_len); > + if (ret < 0) { > + break; > + } > + } > + > + gnutls_hash_deinit(dig, (void *) hash); > + return ret; > +} > + > +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) > +{ > + int i =3D 0; > + QuorumVoteVersion *candidate, *winner =3D NULL; > + > + QLIST_FOREACH(candidate, &votes->vote_list, next) { > + if (candidate->vote_count > i) { > + i =3D candidate->vote_count; > + winner =3D candidate; > + } > + } > + > + return winner; > +} > + > +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) > +{ > + int i; > + int result; > + > + assert(a->niov =3D=3D b->niov); > + for (i =3D 0; i < a->niov; i++) { > + assert(a->iov[i].iov_len =3D=3D b->iov[i].iov_len); > + result =3D memcmp(a->iov[i].iov_base, > + b->iov[i].iov_base, > + a->iov[i].iov_len); > + if (result) { > + return false; > + } > + } > + > + return true; > +} > + > +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, > + const char *fmt, ...) > +{ > + va_list ap; > + > + va_start(ap, fmt); > + fprintf(stderr, "quorum: sector_num=3D%" PRId64 " nb_sectors=3D%d = ", > + acb->sector_num, acb->nb_sectors); > + vfprintf(stderr, fmt, ap); > + fprintf(stderr, "\n"); > + va_end(ap); > + exit(1); > +} > + > +static bool quorum_compare(QuorumAIOCB *acb, > + QEMUIOVector *a, > + QEMUIOVector *b) > +{ > + BDRVQuorumState *s =3D acb->bqs; > + ssize_t offset; > + > + /* This driver will replace blkverify in this particular case */ > + if (s->is_blkverify) { > + offset =3D qemu_iovec_compare(a, b); > + if (offset !=3D -1) { > + quorum_err(acb, "contents mismatch in sector %" PRId64, > + acb->sector_num + > + (uint64_t)(offset / BDRV_SECTOR_SIZE)); > + } > + return true; > + } > + > + return quorum_iovec_compare(a, b); > +} > + > +/* Do a vote to get the error code */ > +static int quorum_vote_error(QuorumAIOCB *acb) > +{ > + BDRVQuorumState *s =3D acb->bqs; > + QuorumVoteVersion *winner =3D NULL; > + QuorumVotes error_votes; > + QuorumVoteValue result_value; > + int i, ret =3D 0; > + bool error =3D false; > + > + QLIST_INIT(&error_votes.vote_list); > + error_votes.compare =3D quorum_64bits_compare; > + > + for (i =3D 0; i < s->total; i++) { > + ret =3D acb->aios[i].ret; > + if (ret) { > + error =3D true; > + result_value.l =3D ret; > + quorum_count_vote(&error_votes, &result_value, i); > + } > + } > + > + if (error) { > + winner =3D quorum_get_vote_winner(&error_votes); > + ret =3D winner->value.l; > + } > + > + quorum_free_vote_list(&error_votes); > + > + return ret; > +} > + > +static void quorum_vote(QuorumAIOCB *acb) > +{ > + bool quorum =3D false; > + int i, j, ret; > + QuorumVoteValue hash; > + BDRVQuorumState *s =3D acb->bqs; > + QuorumVoteVersion *winner; > + > + QLIST_INIT(&acb->votes.vote_list); > + > + /* if we don't get enough successful read use the first error code= */ This comment should be updated, since there is now a vote. > + if (acb->success_count < s->threshold) { > + acb->vote_ret =3D quorum_vote_error(acb); > + quorum_report_failure(acb); > + return; > + } With this change (the if condition), I'm fine with leaving giving the=20 Quorum error code precedence in quorum_aio_finalize(). (If this hadn't=20 been changed, a non-zero success count below the threshold would have=20 led this code to set acb->vote_ret to -EIO and subsequently using this=20 as the return code in quorum_aio_finalize() due to the precedence. With=20 this change, we'll get a more meaningful return code in acb->vote_ret=20 and subsequently in quorum_aio_finalize().) Aside from the comment: Reviewed-by: Max Reitz