From: Max Reitz <mreitz@redhat.com>
To: "Benoît Canet" <benoit.canet@irqsave.net>, qemu-devel@nongnu.org
Cc: kwolf@redhat.com, "Benoît Canet" <benoit@irqsave.net>,
stefanha@redhat.com
Subject: Re: [Qemu-devel] [PATCH V14 06/13] quorum: Add quorum mechanism.
Date: Mon, 03 Feb 2014 20:42:43 +0100 [thread overview]
Message-ID: <52EFF133.9060604@redhat.com> (raw)
In-Reply-To: <1391454712-14353-7-git-send-email-benoit.canet@irqsave.net>
On 03.02.2014 20:11, Benoît Canet wrote:
> From: Benoît Canet <benoit@irqsave.net>
>
> Use gnutls's SHA-256 to compare versions.
>
> Signed-off-by: Benoit Canet <benoit@irqsave.net>
> ---
> block/Makefile.objs | 2 +-
> block/quorum.c | 401 +++++++++++++++++++++++++++++++++++++++++++---
> configure | 36 +++++
> docs/qmp/qmp-events.txt | 33 ++++
> include/monitor/monitor.h | 2 +
> monitor.c | 2 +
> 6 files changed, 457 insertions(+), 19 deletions(-)
>
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index a2650b9..4ca9d43 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-c
> block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
> block-obj-y += qed-check.o
> block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o
> -block-obj-y += quorum.o
> +block-obj-$(CONFIG_QUORUM) += quorum.o
> block-obj-y += parallels.o blkdebug.o blkverify.o
> block-obj-y += snapshot.o qapi.o
> block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
> diff --git a/block/quorum.c b/block/quorum.c
> index 5b11ac1..56ad6a0 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -13,7 +13,43 @@
> * See the COPYING file in the top-level directory.
> */
>
> +#include <gnutls/gnutls.h>
> +#include <gnutls/crypto.h>
> #include "block/block_int.h"
> +#include "qapi/qmp/qjson.h"
> +
> +#define HASH_LENGTH 32
> +
> +/* This union holds a vote hash value */
> +typedef union QuorumVoteValue {
> + char h[HASH_LENGTH]; /* SHA-256 hash */
> + int64_t l; /* simpler 64 bits hash */
> +} QuorumVoteValue;
> +
> +/* A vote item */
> +typedef struct QuorumVoteItem {
> + int index;
> + QLIST_ENTRY(QuorumVoteItem) next;
> +} QuorumVoteItem;
> +
> +/* this structure is a vote version. A version is the set of votes sharing the
> + * same vote value.
> + * The set of votes will be tracked with the items field and its cardinality is
> + * vote_count.
> + */
> +typedef struct QuorumVoteVersion {
> + QuorumVoteValue value;
> + int index;
> + int vote_count;
> + QLIST_HEAD(, QuorumVoteItem) items;
> + QLIST_ENTRY(QuorumVoteVersion) next;
> +} QuorumVoteVersion;
> +
> +/* this structure holds a group of vote versions together */
> +typedef struct QuorumVotes {
> + QLIST_HEAD(, QuorumVoteVersion) vote_list;
> + int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
> +} QuorumVotes;
>
> /* the following structure holds the state of one quorum instance */
> typedef struct {
> @@ -60,10 +96,14 @@ struct QuorumAIOCB {
> int success_count; /* number of successfully completed AIOCB */
> bool *finished; /* completion signal for cancel */
>
> + QuorumVotes votes;
> +
> bool is_read;
> int vote_ret;
> };
>
> +static void quorum_vote(QuorumAIOCB *acb);
> +
> static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
> {
> QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
> @@ -81,29 +121,14 @@ static AIOCBInfo quorum_aiocb_info = {
> .cancel = quorum_aio_cancel,
> };
>
> -/* return the first error code get by each individual callbacks */
> -static int quorum_get_first_error(QuorumAIOCB *acb)
> -{
> - BDRVQuorumState *s = acb->bqs;
> - int i, ret = 0;
> -
> - for (i = 0; i < s->total; i++) {
> - ret = acb->aios[i].ret;
> - if (ret) {
> - return ret;
> - }
> - }
> -
> - /* should not pass here */
> - assert(false);
> -}
> +static int quorum_vote_error(QuorumAIOCB *acb);
>
> static void quorum_aio_finalize(QuorumAIOCB *acb)
> {
> BDRVQuorumState *s = acb->bqs;
> int i, ret;
>
> - ret = s->threshold <= acb->success_count ? 0 : quorum_get_first_error(acb);
> + ret = s->threshold <= acb->success_count ? 0 : quorum_vote_error(acb);
>
> for (i = 0; i < s->total; i++) {
> qemu_vfree(acb->aios[i].buf);
> @@ -111,6 +136,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> acb->aios[i].ret = 0;
> }
>
> + if (acb->vote_ret) {
> + ret = acb->vote_ret;
> + }
> +
> acb->common.cb(acb->common.opaque, ret);
> if (acb->finished) {
> *acb->finished = true;
> @@ -122,6 +151,27 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
> qemu_aio_release(acb);
> }
>
> +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
> +{
> + return memcmp(a->h, b->h, HASH_LENGTH);
> +}
> +
> +static int quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b)
> +{
> + int64_t i = a->l;
> + int64_t j = b->l;
> +
> + if (i < j) {
> + return -1;
> + }
> +
> + if (i > j) {
> + return 1;
> + }
> +
> + return 0;
> +}
> +
> static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> BlockDriverState *bs,
> QEMUIOVector *qiov,
> @@ -141,6 +191,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
> acb->count = 0;
> acb->success_count = 0;
> acb->finished = NULL;
> + acb->votes.compare = quorum_sha256_compare;
> acb->is_read = false;
> acb->vote_ret = 0;
>
> @@ -170,9 +221,323 @@ static void quorum_aio_cb(void *opaque, int ret)
> return;
> }
>
> + /* Do the vote on read */
> + if (acb->is_read) {
> + quorum_vote(acb);
> + }
> +
> quorum_aio_finalize(acb);
> }
>
> +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name)
> +{
> + QObject *data;
> + data = qobject_from_jsonf("{ 'node-name': \"%s\""
> + ", 'sector-num': %" PRId64
> + ", 'sectors-count': %i }",
> + node_name,
> + acb->sector_num,
> + acb->nb_sectors);
> + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data);
> + qobject_decref(data);
> +}
> +
> +static void quorum_report_failure(QuorumAIOCB *acb)
> +{
> + QObject *data;
> + data = qobject_from_jsonf("{ 'sector-num': %" PRId64
> + ", 'sectors-count': %i }",
> + acb->sector_num,
> + acb->nb_sectors);
> + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data);
> + qobject_decref(data);
> +}
> +
> +static void quorum_report_bad_versions(BDRVQuorumState *s,
> + QuorumAIOCB *acb,
> + QuorumVoteValue *value)
> +{
> + QuorumVoteVersion *version;
> + QuorumVoteItem *item;
> +
> + QLIST_FOREACH(version, &acb->votes.vote_list, next) {
> + if (!acb->votes.compare(&version->value, value)) {
> + continue;
> + }
> + QLIST_FOREACH(item, &version->items, next) {
> + quorum_report_bad(acb, s->bs[item->index]->node_name);
> + }
> + }
> +}
> +
> +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
> +{
> + int i;
> + assert(dest->niov == source->niov);
> + assert(dest->size == source->size);
> + for (i = 0; i < source->niov; i++) {
> + assert(dest->iov[i].iov_len == source->iov[i].iov_len);
> + memcpy(dest->iov[i].iov_base,
> + source->iov[i].iov_base,
> + source->iov[i].iov_len);
> + }
> +}
> +
> +static void quorum_count_vote(QuorumVotes *votes,
> + QuorumVoteValue *value,
> + int index)
> +{
> + QuorumVoteVersion *v = NULL, *version = NULL;
> + QuorumVoteItem *item;
> +
> + /* look if we have something with this hash */
> + QLIST_FOREACH(v, &votes->vote_list, next) {
> + if (!votes->compare(&v->value, value)) {
> + version = v;
> + break;
> + }
> + }
> +
> + /* It's a version not yet in the list add it */
> + if (!version) {
> + version = g_new0(QuorumVoteVersion, 1);
> + QLIST_INIT(&version->items);
> + memcpy(&version->value, value, sizeof(version->value));
> + version->index = index;
> + version->vote_count = 0;
> + QLIST_INSERT_HEAD(&votes->vote_list, version, next);
> + }
> +
> + version->vote_count++;
> +
> + item = g_new0(QuorumVoteItem, 1);
> + item->index = index;
> + QLIST_INSERT_HEAD(&version->items, item, next);
> +}
> +
> +static void quorum_free_vote_list(QuorumVotes *votes)
> +{
> + QuorumVoteVersion *version, *next_version;
> + QuorumVoteItem *item, *next_item;
> +
> + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
> + QLIST_REMOVE(version, next);
> + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
> + QLIST_REMOVE(item, next);
> + g_free(item);
> + }
> + g_free(version);
> + }
> +}
> +
> +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash)
> +{
> + int j, ret;
> + gnutls_hash_hd_t dig;
> + QEMUIOVector *qiov = &acb->aios[i].qiov;
> +
> + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
> +
> + if (ret < 0) {
> + return ret;
> + }
> +
> + for (j = 0; j < qiov->niov; j++) {
> + ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len);
> + if (ret < 0) {
> + break;
> + }
> + }
> +
> + gnutls_hash_deinit(dig, (void *) hash);
> + return ret;
> +}
> +
> +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
> +{
> + int i = 0;
> + QuorumVoteVersion *candidate, *winner = NULL;
> +
> + QLIST_FOREACH(candidate, &votes->vote_list, next) {
> + if (candidate->vote_count > i) {
> + i = candidate->vote_count;
> + winner = candidate;
> + }
> + }
> +
> + return winner;
> +}
> +
> +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
> +{
> + int i;
> + int result;
> +
> + assert(a->niov == b->niov);
> + for (i = 0; i < a->niov; i++) {
> + assert(a->iov[i].iov_len == b->iov[i].iov_len);
> + result = memcmp(a->iov[i].iov_base,
> + b->iov[i].iov_base,
> + a->iov[i].iov_len);
> + if (result) {
> + return false;
> + }
> + }
> +
> + return true;
> +}
> +
> +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb,
> + const char *fmt, ...)
> +{
> + va_list ap;
> +
> + va_start(ap, fmt);
> + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ",
> + acb->sector_num, acb->nb_sectors);
> + vfprintf(stderr, fmt, ap);
> + fprintf(stderr, "\n");
> + va_end(ap);
> + exit(1);
> +}
> +
> +static bool quorum_compare(QuorumAIOCB *acb,
> + QEMUIOVector *a,
> + QEMUIOVector *b)
> +{
> + BDRVQuorumState *s = acb->bqs;
> + ssize_t offset;
> +
> + /* This driver will replace blkverify in this particular case */
> + if (s->is_blkverify) {
> + offset = qemu_iovec_compare(a, b);
> + if (offset != -1) {
> + quorum_err(acb, "contents mismatch in sector %" PRId64,
> + acb->sector_num +
> + (uint64_t)(offset / BDRV_SECTOR_SIZE));
> + }
> + return true;
> + }
> +
> + return quorum_iovec_compare(a, b);
> +}
> +
> +/* Do a vote to get the error code */
> +static int quorum_vote_error(QuorumAIOCB *acb)
> +{
> + BDRVQuorumState *s = acb->bqs;
> + QuorumVoteVersion *winner = NULL;
> + QuorumVotes error_votes;
> + QuorumVoteValue result_value;
> + int i, ret = 0;
> + bool error = false;
> +
> + QLIST_INIT(&error_votes.vote_list);
> + error_votes.compare = quorum_64bits_compare;
> +
> + for (i = 0; i < s->total; i++) {
> + ret = acb->aios[i].ret;
> + if (ret) {
> + error = true;
> + result_value.l = ret;
> + quorum_count_vote(&error_votes, &result_value, i);
> + }
> + }
> +
> + if (error) {
> + winner = quorum_get_vote_winner(&error_votes);
> + ret = winner->value.l;
> + }
> +
> + quorum_free_vote_list(&error_votes);
> +
> + return ret;
> +}
> +
> +static void quorum_vote(QuorumAIOCB *acb)
> +{
> + bool quorum = false;
> + int i, j, ret;
> + QuorumVoteValue hash;
> + BDRVQuorumState *s = acb->bqs;
> + QuorumVoteVersion *winner;
> +
> + QLIST_INIT(&acb->votes.vote_list);
> +
> + /* if we don't get enough successful read use the first error code */
This comment should be updated, since there is now a vote.
> + if (acb->success_count < s->threshold) {
> + acb->vote_ret = quorum_vote_error(acb);
> + quorum_report_failure(acb);
> + return;
> + }
With this change (the if condition), I'm fine with leaving giving the
Quorum error code precedence in quorum_aio_finalize(). (If this hadn't
been changed, a non-zero success count below the threshold would have
led this code to set acb->vote_ret to -EIO and subsequently using this
as the return code in quorum_aio_finalize() due to the precedence. With
this change, we'll get a more meaningful return code in acb->vote_ret
and subsequently in quorum_aio_finalize().)
Aside from the comment:
Reviewed-by: Max Reitz <mreitz@redhat.com>
next prev parent reply other threads:[~2014-02-03 19:40 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-02-03 19:11 [Qemu-devel] [PATCH V14 00/13] Quorum block filter Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 01/13] quorum: Create quorum.c, add QuorumSingleAIOCB and QuorumAIOCB Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 02/13] quorum: Create BDRVQuorumState and BlkDriver and do init Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 03/13] quorum: Add quorum_aio_writev and its dependencies Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 04/13] blkverify: Extract qemu_iovec_clone() and qemu_iovec_compare() from blkverify Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 05/13] quorum: Add quorum_aio_readv Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 06/13] quorum: Add quorum mechanism Benoît Canet
2014-02-03 19:42 ` Max Reitz [this message]
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 07/13] quorum: Add quorum_getlength() Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 08/13] quorum: Add quorum_invalidate_cache() Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 09/13] quorum: Add quorum_co_get_block_status Benoît Canet
2014-02-03 19:46 ` Max Reitz
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 10/13] quorum: Add quorum_co_flush() Benoît Canet
2014-02-03 19:51 ` Max Reitz
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 11/13] quorum: Implement recursive .bdrv_recurse_is_first_non_filter in quorum Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 12/13] quorum: Add quorum_open() and quorum_close() Benoît Canet
2014-02-03 20:22 ` Max Reitz
2014-02-03 21:19 ` Benoît Canet
2014-02-03 19:11 ` [Qemu-devel] [PATCH V14 13/13] quorum: Add unit test Benoît Canet
2014-02-03 20:23 ` Max Reitz
2014-02-03 21:10 ` [Qemu-devel] [PATCH V14 00/13] Quorum block filter Jeff Cody
2014-02-03 22:14 ` Benoît Canet
2014-02-04 10:35 ` Kevin Wolf
2014-02-04 21:54 ` Benoît Canet
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=52EFF133.9060604@redhat.com \
--to=mreitz@redhat.com \
--cc=benoit.canet@irqsave.net \
--cc=benoit@irqsave.net \
--cc=kwolf@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.