From: Max Reitz <mreitz@redhat.com>
To: "Benoît Canet" <benoit@irqsave.net>
Cc: kwolf@redhat.com, qemu-devel@nongnu.org, stefanha@redhat.com
Subject: Re: [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism.
Date: Fri, 04 Oct 2013 16:48:12 +0200 [thread overview]
Message-ID: <524ED52C.7010207@redhat.com> (raw)
In-Reply-To: <1380717564-11098-7-git-send-email-benoit@irqsave.net>
On 2013-10-02 14:39, Benoît Canet wrote:
> Use gnutls's SHA-256 to compare versions.
Wouldn't CRC32 suffice? (I don't really oppose using SHA, but taking in
gnutls as a dependency just for comparing several memory areas seems a
bit much to me)
> Signed-off-by: Benoit Canet <benoit@irqsave.net>
> ---
> block/Makefile.objs | 2 +-
> block/quorum.c | 321 +++++++++++++++++++++++++++++++++++++++++++++-
> configure | 36 ++++++
> include/monitor/monitor.h | 2 +
> monitor.c | 2 +
> 5 files changed, 361 insertions(+), 2 deletions(-)
>
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 05a65c2..adcdc21 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-c
> block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
> block-obj-y += qed-check.o
> block-obj-y += vhdx.o
> -block-obj-y += quorum.o
> +block-obj-$(CONFIG_QUORUM) += quorum.o
> block-obj-y += parallels.o blkdebug.o blkverify.o
> block-obj-y += snapshot.o qapi.o
> block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
> diff --git a/block/quorum.c b/block/quorum.c
> index f0fc0e9..e235ac1 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -13,7 +13,43 @@
> * See the COPYING file in the top-level directory.
> */
>
> +#include <gnutls/gnutls.h>
> +#include <gnutls/crypto.h>
> #include "block/block_int.h"
> +#include "qapi/qmp/qjson.h"
> +
> +#define HASH_LENGTH 32
> +
> +/* This union hold a vote hash value */
*holds
> +typedef union QuorumVoteValue {
> + char h[HASH_LENGTH]; /* SHA-256 hash */
> + int64_t l; /* simpler 64 bits hash */
> +} QuorumVoteValue;
> +
> +/* A vote item */
> +typedef struct QuorumVoteItem {
> + int index;
> + QLIST_ENTRY(QuorumVoteItem) next;
> +} QuorumVoteItem;
> +
> +/* this structure is a vote version. A version is the set of vote sharing the
*set of votes
> + * same vote value.
> + * The set of vote will be tracked with the items field and it's count is
*set of votes or *vote set; also s/it's count/its cardinality/ or
something like that
> + * vote_count.
> + */
> +typedef struct QuorumVoteVersion {
> + QuorumVoteValue value;
> + int index;
> + int vote_count;
> + QLIST_HEAD(, QuorumVoteItem) items;
> + QLIST_ENTRY(QuorumVoteVersion) next;
> +} QuorumVoteVersion;
> +
> +/* this structure hold a group of vote versions together */
*holds
> +typedef struct QuorumVotes {
> + QLIST_HEAD(, QuorumVoteVersion) vote_list;
> + int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
> +} QuorumVotes;
>
> /* the following structure hold the state of one quorum instance */
> typedef struct {
> @@ -60,10 +96,14 @@ struct QuorumAIOCB {
> int success_count; /* number of successfully completed AIOCB */
> bool *finished; /* completion signal for cancel */
>
> + QuorumVotes votes;
> +
> bool is_read;
> int vote_ret;
> };
>
> +static void quorum_vote(QuorumAIOCB *acb);
> +
> static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> {
> QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> @@ -111,6 +151,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> acb->aios[i].ret = 0;
> }
>
> + if (acb->vote_ret) {
> + ret = acb->vote_ret;
> + }
> +
> acb->common.cb(acb->common.opaque, ret);
> if (acb->finished) {
> *acb->finished = true;
> @@ -122,6 +166,11 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> qemu_aio_release(acb);
> }
>
> +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
> +{
> + return memcmp(a->h, b->h, HASH_LENGTH);
> +}
> +
> static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> BlockDriverState *bs,
> QEMUIOVector *qiov,
> @@ -141,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> acb->count = 0;
> acb->success_count = 0;
> acb->finished = NULL;
> + acb->votes.compare = quorum_sha256_compare;
> acb->is_read = false;
> acb->vote_ret = 0;
>
> @@ -170,9 +220,278 @@ static void quorum_aio_cb(void *opaque, int ret)
> return;
> }
>
> + /* Do the vote on read */
> + if (acb->is_read) {
> + quorum_vote(acb);
> + }
> +
> quorum_aio_finalize(acb);
> }
>
> +static void quorum_report_bad(QuorumAIOCB *acb, int index)
> +{
> + QObject *data;
> + data = qobject_from_jsonf("{ 'children-index': %i"
I'd prefer child-index. Generally, remember that the singular of
"children" is "child".
> + ", 'sector-num': %" PRId64
> + ", 'sectors-count': %i }",
> + index,
> + acb->sector_num,
> + acb->nb_sectors);
> + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data);
How about decrementing the refcount of data?
> +}
> +
> +static void quorum_report_failure(QuorumAIOCB *acb)
> +{
> + QObject *data;
> + data = qobject_from_jsonf("{ 'sector-num': %" PRId64
> + ", 'sectors-count': %i }",
> + acb->sector_num,
> + acb->nb_sectors);
> + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data);
Same here.
> +}
> +
> +static void quorum_report_bad_versions(QuorumAIOCB *acb,
> + QuorumVoteValue *value)
> +{
> + QuorumVoteVersion *version;
> + QuorumVoteItem *item;
> +
> + QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> + if (!acb->votes.compare(&version->value, value)) {
> + continue;
> + }
> + QLIST_FOREACH(item, &version->items, next) {
> + quorum_report_bad(acb, item->index);
> + }
> + }
> +}
> +
> +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
> +{
> + int i;
> + assert(dest->niov == source->niov);
> + assert(dest->size == source->size);
> + for (i = 0; i < source->niov; i++) {
> + assert(dest->iov[i].iov_len == source->iov[i].iov_len);
> + memcpy(dest->iov[i].iov_base,
> + source->iov[i].iov_base,
> + source->iov[i].iov_len);
> + }
> +}
> +
> +static void quorum_count_vote(QuorumVotes *votes,
> + QuorumVoteValue *value,
> + int index)
> +{
> + QuorumVoteVersion *v = NULL, *version = NULL;
> + QuorumVoteItem *item;
> +
> + /* look if we have something with this hash */
> + QLIST_FOREACH(v, &votes->vote_list, next) {
> + if (!votes->compare(&v->value, value)) {
> + version = v;
> + break;
> + }
> + }
> +
> + /* It's a version not yet in the list add it */
> + if (!version) {
> + version = g_new0(QuorumVoteVersion, 1);
> + QLIST_INIT(&version->items);
> + memcpy(&version->value, value, sizeof(version->value));
> + version->index = index;
> + version->vote_count = 0;
> + QLIST_INSERT_HEAD(&votes->vote_list, version, next);
> + }
> +
> + version->vote_count++;
> +
> + item = g_new0(QuorumVoteItem, 1);
> + item->index = index;
> + QLIST_INSERT_HEAD(&version->items, item, next);
> +}
> +
> +static void quorum_free_vote_list(QuorumVotes *votes)
> +{
> + QuorumVoteVersion *version, *next_version;
> + QuorumVoteItem *item, *next_item;
> +
> + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
> + QLIST_REMOVE(version, next);
> + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
> + QLIST_REMOVE(item, next);
> + g_free(item);
> + }
> + g_free(version);
> + }
> +}
> +
> +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash)
> +{
> + int j, ret;
> + gnutls_hash_hd_t dig;
> + QEMUIOVector *qiov = &acb->aios[i].qiov;
> +
> + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
> +
> + if (ret < 0) {
> + return ret;
> + }
> +
> + for (j = 0; j < qiov->niov; j++) {
> + ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len);
> + if (ret < 0) {
> + return ret;
gnutls_hash_deinit?
> + }
> + }
> +
> + gnutls_hash_deinit(dig, (void *) hash);
> +
> + return 0;
> +}
> +
> +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
> +{
> + int i = 0;
> + QuorumVoteVersion *candidate, *winner = NULL;
> +
> + QLIST_FOREACH(candidate, &votes->vote_list, next) {
> + if (candidate->vote_count > i) {
> + i = candidate->vote_count;
> + winner = candidate;
> + }
> + }
> +
> + return winner;
> +}
> +
> +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
> +{
> + int i;
> + int result;
> +
> + assert(a->niov == b->niov);
> + for (i = 0; i < a->niov; i++) {
> + assert(a->iov[i].iov_len == b->iov[i].iov_len);
> + result = memcmp(a->iov[i].iov_base,
> + b->iov[i].iov_base,
> + a->iov[i].iov_len);
> + if (result) {
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
> +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb,
> + const char *fmt, ...)
> +{
> + va_list ap;
> +
> + va_start(ap, fmt);
> + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ",
> + acb->sector_num, acb->nb_sectors);
> + vfprintf(stderr, fmt, ap);
> + fprintf(stderr, "\n");
> + va_end(ap);
> + exit(1);
> +}
> +
> +static bool quorum_compare(QuorumAIOCB *acb,
> + QEMUIOVector *a,
> + QEMUIOVector *b)
> +{
> + BDRVQuorumState *s = acb->bqs;
> + ssize_t offset;
> +
> + /* This driver will replace blkverify in this particular case */
> + if (s->is_blkverify) {
> + offset = qemu_iovec_compare(a, b);
> + if (offset != -1) {
> + quorum_err(acb, "contents mismatch in sector %" PRId64,
> + acb->sector_num +
> + (uint64_t)(offset / BDRV_SECTOR_SIZE));
> + }
> + return true;
> + }
> +
> + return quorum_iovec_compare(a, b);
> +}
> +
> +static void quorum_vote(QuorumAIOCB *acb)
> +{
> + bool quorum = true;
> + int i, j, ret;
> + QuorumVoteValue hash;
> + BDRVQuorumState *s = acb->bqs;
> + QuorumVoteVersion *winner;
> +
> + /* get the index of the first successful read */
> + for (i = 0; i < s->total; i++) {
> + if (!acb->aios[i].ret) {
> + break;
> + }
> + }
> +
> + /* compare this read with all other successful read looking for quorum */
> + for (j = i + 1; j < s->total; j++) {
> + if (acb->aios[j].ret) {
> + continue;
> + }
> + quorum = quorum_compare(acb, &acb->aios[i].qiov, &acb->aios[j].qiov);
> + if (!quorum) {
> + break;
> + }
> + }
> +
> + /* Every successful read agrees -> Quorum */
> + if (quorum) {
> + quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov);
If no reads were successful at all, quorum will be true and i will be
s->total. Therefore, this will result in an array access with an index
out of bounds here.
Generally, why is this function executed at all if any read failed? If a
read fails, the quorum read function will return an error, thus the
caller will ignore the result anyway.
> + return;
> + }
> +
> + /* compute hashs for each successful read, also store indexes */
> + for (i = 0; i < s->total; i++) {
> + if (acb->aios[i].ret) {
> + continue;
> + }
> + ret = quorum_compute_hash(acb, i, &hash);
> + /* if ever the hash computation failed */
> + if (ret < 0) {
> + acb->vote_ret = ret;
> + goto free_exit;
> + }
> + quorum_count_vote(&acb->votes, &hash, i);
> + }
> +
> + /* vote to select the most represented version */
> + winner = quorum_get_vote_winner(&acb->votes);
> + /* every vote version are differents -> error */
> + if (!winner) {
> + quorum_report_failure(acb);
> + acb->vote_ret = -EIO;
> + goto free_exit;
> + }
> +
> + /* if the winner count is smaller then threshold read fail */
s/then/than/, s/ read fail/, the read fails/
> + if (winner->vote_count < s->threshold) {
> + quorum_report_failure(acb);
> + acb->vote_ret = -EIO;
> + goto free_exit;
> + }
> +
> + /* we have a winner: copy it */
> + quorum_copy_qiov(acb->qiov, &acb->aios[winner->index].qiov);
> +
> + /* some versions are bad print them */
> + quorum_report_bad_versions(acb, &winner->value);
> +
> +free_exit:
> + /* free lists */
> + quorum_free_vote_list(&acb->votes);
> +}
> +
> static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
> int64_t sector_num,
> QEMUIOVector *qiov,
> @@ -194,7 +513,7 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
> }
>
> for (i = 0; i < s->total; i++) {
> - bdrv_aio_readv(&s->bs[i], sector_num, qiov, nb_sectors,
> + bdrv_aio_readv(&s->bs[i], sector_num, &acb->aios[i].qiov, nb_sectors,
> quorum_aio_cb, &acb->aios[i]);
> }
>
> diff --git a/configure b/configure
> index 23dbaaf..efbe7d9 100755
> --- a/configure
> +++ b/configure
> @@ -246,6 +246,7 @@ gtk=""
> gtkabi="2.0"
> tpm="no"
> libssh2=""
> +quorum="no"
>
> # parse CC options first
> for opt do
> @@ -972,6 +973,10 @@ for opt do
> ;;
> --enable-libssh2) libssh2="yes"
> ;;
> + --disable-quorum) quorum="no"
> + ;;
> + --enable-quorum) quorum="yes"
> + ;;
> *) echo "ERROR: unknown option $opt"; show_help="yes"
> ;;
> esac
> @@ -1203,6 +1208,8 @@ echo " --gcov=GCOV use specified gcov [$gcov_tool]"
> echo " --enable-tpm enable TPM support"
> echo " --disable-libssh2 disable ssh block device support"
> echo " --enable-libssh2 enable ssh block device support"
> +echo " --disable-quorum disable quorum block filter support"
> +echo " --enable-quorum enable quorum block filter support"
> echo ""
> echo "NOTE: The object files are built at the place where configure is launched"
> exit 1
> @@ -1895,6 +1902,30 @@ EOF
> fi
>
> ##########################################
> +# Quorum probe (check for gnutls)
> +if test "$quorum" != "no" ; then
> +cat > $TMPC <<EOF
> +#include <gnutls/gnutls.h>
> +#include <gnutls/crypto.h>
> +int main(void) {char data[4096], digest[32];
> +gnutls_hash_fast(GNUTLS_DIG_SHA256, data, 4096, digest);
> +return 0;
> +}
> +EOF
> +quorum_tls_cflags=`$pkg_config --cflags gnutls 2> /dev/null`
> +quorum_tls_libs=`$pkg_config --libs gnutls 2> /dev/null`
> +if compile_prog "$quorum_tls_cflags" "$quorum_tls_libs" ; then
> + qcow_tls=yes
> + libs_softmmu="$quorum_tls_libs $libs_softmmu"
> + libs_tools="$quorum_tls_libs $libs_softmmu"
> + QEMU_CFLAGS="$QEMU_CFLAGS $quorum_tls_cflags"
> +else
> + echo "gnutls > 2.10.0 required to compile Quorum"
> + exit 1
> +fi
> +fi
> +
> +##########################################
> # VNC SASL detection
> if test "$vnc" = "yes" -a "$vnc_sasl" != "no" ; then
> cat > $TMPC <<EOF
> @@ -3758,6 +3789,7 @@ echo "TPM support $tpm"
> echo "libssh2 support $libssh2"
> echo "TPM passthrough $tpm_passthrough"
> echo "QOM debugging $qom_cast_debug"
> +echo "Quorum $quorum"
>
> if test "$sdl_too_old" = "yes"; then
> echo "-> Your SDL version is too old - please upgrade to have SDL support"
> @@ -4156,6 +4188,10 @@ if test "$libssh2" = "yes" ; then
> echo "CONFIG_LIBSSH2=y" >> $config_host_mak
> fi
>
> +if test "$quorum" = "yes" ; then
> + echo "CONFIG_QUORUM=y" >> $config_host_mak
> +fi
> +
> if test "$virtio_blk_data_plane" = "yes" ; then
> echo 'CONFIG_VIRTIO_BLK_DATA_PLANE=$(CONFIG_VIRTIO)' >> $config_host_mak
> fi
> diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
> index 10fa0e3..51902ed 100644
> --- a/include/monitor/monitor.h
> +++ b/include/monitor/monitor.h
> @@ -49,6 +49,8 @@ typedef enum MonitorEvent {
> QEVENT_SPICE_MIGRATE_COMPLETED,
> QEVENT_GUEST_PANICKED,
> QEVENT_BLOCK_IMAGE_CORRUPTED,
> + QEVENT_QUORUM_FAILURE,
> + QEVENT_QUORUM_REPORT_BAD,
>
> /* Add to 'monitor_event_names' array in monitor.c when
> * defining new events here */
> diff --git a/monitor.c b/monitor.c
> index 74f3f1b..89d139f 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -507,6 +507,8 @@ static const char *monitor_event_names[] = {
> [QEVENT_SPICE_MIGRATE_COMPLETED] = "SPICE_MIGRATE_COMPLETED",
> [QEVENT_GUEST_PANICKED] = "GUEST_PANICKED",
> [QEVENT_BLOCK_IMAGE_CORRUPTED] = "BLOCK_IMAGE_CORRUPTED",
> + [QEVENT_QUORUM_FAILURE] = "QUORUM_FAILURE",
> + [QEVENT_QUORUM_REPORT_BAD] = "QUORUM_REPORT_BAD",
Could you document these events in docs/qmp/qmp-events.txt?
Max
> };
> QEMU_BUILD_BUG_ON(ARRAY_SIZE(monitor_event_names) != QEVENT_MAX)
>
next prev parent reply other threads:[~2013-10-04 14:48 UTC|newest]
Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-10-02 12:39 [Qemu-devel] [PATCH V9 00/11] Quorum block driver Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 01/11] quorum: Create quorum.c, add QuorumSingleAIOCB and QuorumAIOCB Benoît Canet
2013-10-04 14:33 ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 02/11] quorum: Create BDRVQuorumState and BlkDriver and do init Benoît Canet
2013-10-04 14:34 ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 03/11] quorum: Add quorum_aio_writev and its dependencies Benoît Canet
2013-10-04 14:35 ` Max Reitz
2013-10-28 12:21 ` Benoît Canet
2013-10-29 17:21 ` Max Reitz
2013-10-29 20:06 ` Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 04/11] blkverify: Extract qemu_iovec_clone() and qemu_iovec_compare() from blkverify Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 05/11] quorum: Add quorum_aio_readv Benoît Canet
2013-10-04 14:38 ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism Benoît Canet
2013-10-04 14:48 ` Max Reitz [this message]
2013-10-28 12:31 ` Benoît Canet
2013-10-29 16:42 ` Max Reitz
2013-10-28 13:04 ` Benoît Canet
2013-10-29 17:34 ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 07/11] quorum: Add quorum_getlength() Benoît Canet
2013-10-04 14:49 ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 08/11] quorum: Add quorum_invalidate_cache() Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 09/11] quorum: Add quorum_co_get_block_status Benoît Canet
2013-10-04 14:51 ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 10/11] quorum: Add quorum_co_flush() Benoît Canet
2013-10-04 14:53 ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 11/11] quorum: Add quorum_open() and quorum_close() Benoît Canet
2013-10-04 15:14 ` Max Reitz
2013-10-04 15:31 ` Eric Blake
2013-10-07 7:23 ` Max Reitz
2013-10-04 14:31 ` [Qemu-devel] [PATCH V9 00/11] Quorum block driver Max Reitz
2013-10-04 22:25 ` Benoît Canet
2013-10-29 7:59 ` Benoît Canet
2013-10-29 17:55 ` Max Reitz
2013-10-29 17:56 ` Max Reitz
2013-10-29 20:06 ` Benoît Canet
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=524ED52C.7010207@redhat.com \
--to=mreitz@redhat.com \
--cc=benoit@irqsave.net \
--cc=kwolf@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).