qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Max Reitz <mreitz@redhat.com>
To: "Benoît Canet" <benoit.canet@irqsave.net>
Cc: kwolf@redhat.com, qemu-devel@nongnu.org, stefanha@redhat.com
Subject: Re: [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism.
Date: Tue, 29 Oct 2013 18:34:33 +0100	[thread overview]
Message-ID: <526FF1A9.50208@redhat.com> (raw)
In-Reply-To: <20131028130401.GC2890@irqsave.net>

Am 28.10.2013 14:04, schrieb Benoît Canet:
> Le Friday 04 Oct 2013 à 16:48:12 (+0200), Max Reitz a écrit :
>> On 2013-10-02 14:39, Benoît Canet wrote:
>>> Use gnutls's SHA-256 to compare versions.
>> Wouldn't CRC32 suffice? (I don't really oppose using SHA, but taking
>> in gnutls as a dependency just for comparing several memory areas
>> seems a bit much to me)
>>
>>> Signed-off-by: Benoit Canet <benoit@irqsave.net>
>>> ---
>>>  block/Makefile.objs       |   2 +-
>>>  block/quorum.c            | 321 +++++++++++++++++++++++++++++++++++++++++++++-
>>>  configure                 |  36 ++++++
>>>  include/monitor/monitor.h |   2 +
>>>  monitor.c                 |   2 +
>>>  5 files changed, 361 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>>> index 05a65c2..adcdc21 100644
>>> --- a/block/Makefile.objs
>>> +++ b/block/Makefile.objs
>>> @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-c
>>>  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>>  block-obj-y += qed-check.o
>>>  block-obj-y += vhdx.o
>>> -block-obj-y += quorum.o
>>> +block-obj-$(CONFIG_QUORUM) += quorum.o
>>>  block-obj-y += parallels.o blkdebug.o blkverify.o
>>>  block-obj-y += snapshot.o qapi.o
>>>  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
>>> diff --git a/block/quorum.c b/block/quorum.c
>>> index f0fc0e9..e235ac1 100644
>>> --- a/block/quorum.c
>>> +++ b/block/quorum.c
>>> @@ -13,7 +13,43 @@
>>>   * See the COPYING file in the top-level directory.
>>>   */
>>> +#include <gnutls/gnutls.h>
>>> +#include <gnutls/crypto.h>
>>>  #include "block/block_int.h"
>>> +#include "qapi/qmp/qjson.h"
>>> +
>>> +#define HASH_LENGTH 32
>>> +
>>> +/* This union hold a vote hash value */
>> *holds
>>
>>> +typedef union QuorumVoteValue {
>>> +    char h[HASH_LENGTH];       /* SHA-256 hash */
>>> +    int64_t l;                 /* simpler 64 bits hash */
>>> +} QuorumVoteValue;
>>> +
>>> +/* A vote item */
>>> +typedef struct QuorumVoteItem {
>>> +    int index;
>>> +    QLIST_ENTRY(QuorumVoteItem) next;
>>> +} QuorumVoteItem;
>>> +
>>> +/* this structure is a vote version. A version is the set of vote sharing the
>> *set of votes
>>
>>> + * same vote value.
>>> + * The set of vote will be tracked with the items field and it's count is
>> *set of votes or *vote set; also s/it's count/its cardinality/ or
>> something like that
>>
>>> + * vote_count.
>>> + */
>>> +typedef struct QuorumVoteVersion {
>>> +    QuorumVoteValue value;
>>> +    int index;
>>> +    int vote_count;
>>> +    QLIST_HEAD(, QuorumVoteItem) items;
>>> +    QLIST_ENTRY(QuorumVoteVersion) next;
>>> +} QuorumVoteVersion;
>>> +
>>> +/* this structure hold a group of vote versions together */
>> *holds
>>
>>> +typedef struct QuorumVotes {
>>> +    QLIST_HEAD(, QuorumVoteVersion) vote_list;
>>> +    int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
>>> +} QuorumVotes;
>>>  /* the following structure hold the state of one quorum instance */
>>>  typedef struct {
>>> @@ -60,10 +96,14 @@ struct QuorumAIOCB {
>>>      int success_count;          /* number of successfully completed AIOCB */
>>>      bool *finished;             /* completion signal for cancel */
>>> +    QuorumVotes votes;
>>> +
>>>      bool is_read;
>>>      int vote_ret;
>>>  };
>>> +static void quorum_vote(QuorumAIOCB *acb);
>>> +
>>>  static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
>>>  {
>>>      QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
>>> @@ -111,6 +151,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
>>>          acb->aios[i].ret = 0;
>>>      }
>>> +    if (acb->vote_ret) {
>>> +        ret = acb->vote_ret;
>>> +    }
>>> +
>>>      acb->common.cb(acb->common.opaque, ret);
>>>      if (acb->finished) {
>>>          *acb->finished = true;
>>> @@ -122,6 +166,11 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
>>>      qemu_aio_release(acb);
>>>  }
>>> +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
>>> +{
>>> +    return memcmp(a->h, b->h, HASH_LENGTH);
>>> +}
>>> +
>>>  static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
>>>                                     BlockDriverState *bs,
>>>                                     QEMUIOVector *qiov,
>>> @@ -141,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
>>>      acb->count = 0;
>>>      acb->success_count = 0;
>>>      acb->finished = NULL;
>>> +    acb->votes.compare = quorum_sha256_compare;
>>>      acb->is_read = false;
>>>      acb->vote_ret = 0;
>>> @@ -170,9 +220,278 @@ static void quorum_aio_cb(void *opaque, int ret)
>>>          return;
>>>      }
>>> +    /* Do the vote on read */
>>> +    if (acb->is_read) {
>>> +        quorum_vote(acb);
>>> +    }
>>> +
>>>      quorum_aio_finalize(acb);
>>>  }
>>> +static void quorum_report_bad(QuorumAIOCB *acb, int index)
>>> +{
>>> +    QObject *data;
>>> +    data = qobject_from_jsonf("{ 'children-index': %i"
>> I'd prefer child-index. Generally, remember that the singular of
>> "children" is "child".
>>
>>> +                              ", 'sector-num': %" PRId64
>>> +                              ", 'sectors-count': %i }",
>>> +                              index,
>>> +                              acb->sector_num,
>>> +                              acb->nb_sectors);
>>> +    monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data);
>> How about decrementing the refcount of data?
>>
>>> +}
>>> +
>>> +static void quorum_report_failure(QuorumAIOCB *acb)
>>> +{
>>> +    QObject *data;
>>> +    data = qobject_from_jsonf("{ 'sector-num': %" PRId64
>>> +                              ", 'sectors-count': %i }",
>>> +                              acb->sector_num,
>>> +                              acb->nb_sectors);
>>> +    monitor_protocol_event(QEVENT_QUORUM_FAILURE, data);
>> Same here.
>>
>>> +}
>>> +
>>> +static void quorum_report_bad_versions(QuorumAIOCB *acb,
>>> +                                      QuorumVoteValue *value)
>>> +{
>>> +    QuorumVoteVersion *version;
>>> +    QuorumVoteItem *item;
>>> +
>>> +    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
>>> +        if (!acb->votes.compare(&version->value, value)) {
>>> +            continue;
>>> +        }
>>> +        QLIST_FOREACH(item, &version->items, next) {
>>> +            quorum_report_bad(acb, item->index);
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
>>> +{
>>> +    int i;
>>> +    assert(dest->niov == source->niov);
>>> +    assert(dest->size == source->size);
>>> +    for (i = 0; i < source->niov; i++) {
>>> +        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
>>> +        memcpy(dest->iov[i].iov_base,
>>> +               source->iov[i].iov_base,
>>> +               source->iov[i].iov_len);
>>> +    }
>>> +}
>>> +
>>> +static void quorum_count_vote(QuorumVotes *votes,
>>> +                              QuorumVoteValue *value,
>>> +                              int index)
>>> +{
>>> +    QuorumVoteVersion *v = NULL, *version = NULL;
>>> +    QuorumVoteItem *item;
>>> +
>>> +    /* look if we have something with this hash */
>>> +    QLIST_FOREACH(v, &votes->vote_list, next) {
>>> +        if (!votes->compare(&v->value, value)) {
>>> +            version = v;
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    /* It's a version not yet in the list add it */
>>> +    if (!version) {
>>> +        version = g_new0(QuorumVoteVersion, 1);
>>> +        QLIST_INIT(&version->items);
>>> +        memcpy(&version->value, value, sizeof(version->value));
>>> +        version->index = index;
>>> +        version->vote_count = 0;
>>> +        QLIST_INSERT_HEAD(&votes->vote_list, version, next);
>>> +    }
>>> +
>>> +    version->vote_count++;
>>> +
>>> +    item = g_new0(QuorumVoteItem, 1);
>>> +    item->index = index;
>>> +    QLIST_INSERT_HEAD(&version->items, item, next);
>>> +}
>>> +
>>> +static void quorum_free_vote_list(QuorumVotes *votes)
>>> +{
>>> +    QuorumVoteVersion *version, *next_version;
>>> +    QuorumVoteItem *item, *next_item;
>>> +
>>> +    QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
>>> +        QLIST_REMOVE(version, next);
>>> +        QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
>>> +            QLIST_REMOVE(item, next);
>>> +            g_free(item);
>>> +        }
>>> +        g_free(version);
>>> +    }
>>> +}
>>> +
>>> +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash)
>>> +{
>>> +    int j, ret;
>>> +    gnutls_hash_hd_t dig;
>>> +    QEMUIOVector *qiov = &acb->aios[i].qiov;
>>> +
>>> +    ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
>>> +
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    for (j = 0; j < qiov->niov; j++) {
>>> +        ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len);
>>> +        if (ret < 0) {
>>> +            return ret;
>> gnutls_hash_deinit?
>>
>>> +        }
>>> +    }
>>> +
>>> +    gnutls_hash_deinit(dig, (void *) hash);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
>>> +{
>>> +    int i = 0;
>>> +    QuorumVoteVersion *candidate, *winner = NULL;
>>> +
>>> +    QLIST_FOREACH(candidate, &votes->vote_list, next) {
>>> +        if (candidate->vote_count > i) {
>>> +            i = candidate->vote_count;
>>> +            winner = candidate;
>>> +        }
>>> +    }
>>> +
>>> +    return winner;
>>> +}
>>> +
>>> +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
>>> +{
>>> +    int i;
>>> +    int result;
>>> +
>>> +    assert(a->niov == b->niov);
>>> +    for (i = 0; i < a->niov; i++) {
>>> +        assert(a->iov[i].iov_len == b->iov[i].iov_len);
>>> +        result = memcmp(a->iov[i].iov_base,
>>> +                        b->iov[i].iov_base,
>>> +                        a->iov[i].iov_len);
>>> +        if (result) {
>>> +            return false;
>>> +        }
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb,
>>> +                                          const char *fmt, ...)
>>> +{
>>> +    va_list ap;
>>> +
>>> +    va_start(ap, fmt);
>>> +    fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ",
>>> +            acb->sector_num, acb->nb_sectors);
>>> +    vfprintf(stderr, fmt, ap);
>>> +    fprintf(stderr, "\n");
>>> +    va_end(ap);
>>> +    exit(1);
>>> +}
>>> +
>>> +static bool quorum_compare(QuorumAIOCB *acb,
>>> +                           QEMUIOVector *a,
>>> +                           QEMUIOVector *b)
>>> +{
>>> +    BDRVQuorumState *s = acb->bqs;
>>> +    ssize_t offset;
>>> +
>>> +    /* This driver will replace blkverify in this particular case */
>>> +    if (s->is_blkverify) {
>>> +        offset = qemu_iovec_compare(a, b);
>>> +        if (offset != -1) {
>>> +            quorum_err(acb, "contents mismatch in sector %" PRId64,
>>> +                       acb->sector_num +
>>> +                       (uint64_t)(offset / BDRV_SECTOR_SIZE));
>>> +        }
>>> +        return true;
>>> +    }
>>> +
>>> +    return quorum_iovec_compare(a, b);
>>> +}
>>> +
>>> +static void quorum_vote(QuorumAIOCB *acb)
>>> +{
>>> +    bool quorum = true;
>>> +    int i, j, ret;
>>> +    QuorumVoteValue hash;
>>> +    BDRVQuorumState *s = acb->bqs;
>>> +    QuorumVoteVersion *winner;
>>> +
>>> +    /* get the index of the first successful read */
>>> +    for (i = 0; i < s->total; i++) {
>>> +        if (!acb->aios[i].ret) {
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    /* compare this read with all other successful read looking for quorum */
>>> +    for (j = i + 1; j < s->total; j++) {
>>> +        if (acb->aios[j].ret) {
>>> +            continue;
>>> +        }
>>> +        quorum = quorum_compare(acb, &acb->aios[i].qiov, &acb->aios[j].qiov);
>>> +        if (!quorum) {
>>> +            break;
>>> +       }
>>> +    }
>>> +
>>> +    /* Every successful read agrees -> Quorum */
>>> +    if (quorum) {
>>> +        quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov);
>> If no reads were successful at all, quorum will be true and i will
>> be s->total. Therefore, this will result in an array access with an
>> index out of bounds here.
> Thanks a lot for spotting this.
>
>> Generally, why is this function executed at all if any read failed?
>> If a read fails, the quorum read function will return an error, thus
>> the caller will ignore the result anyway.
> I should if there is a quorum between the successfull reads; if all the success
> ful reads agrees and their count is greater than threshold.

Ah, I see it now; quorum_aio_readv() returns "success" if more reads
than the threshould succeeded (and return the same result). I thought it
would fail as soon as just a single read operation failed – sorry, my fault.

Max

  reply	other threads:[~2013-10-29 17:34 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-10-02 12:39 [Qemu-devel] [PATCH V9 00/11] Quorum block driver Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 01/11] quorum: Create quorum.c, add QuorumSingleAIOCB and QuorumAIOCB Benoît Canet
2013-10-04 14:33   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 02/11] quorum: Create BDRVQuorumState and BlkDriver and do init Benoît Canet
2013-10-04 14:34   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 03/11] quorum: Add quorum_aio_writev and its dependencies Benoît Canet
2013-10-04 14:35   ` Max Reitz
2013-10-28 12:21     ` Benoît Canet
2013-10-29 17:21       ` Max Reitz
2013-10-29 20:06         ` Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 04/11] blkverify: Extract qemu_iovec_clone() and qemu_iovec_compare() from blkverify Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 05/11] quorum: Add quorum_aio_readv Benoît Canet
2013-10-04 14:38   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism Benoît Canet
2013-10-04 14:48   ` Max Reitz
2013-10-28 12:31     ` Benoît Canet
2013-10-29 16:42       ` Max Reitz
2013-10-28 13:04     ` Benoît Canet
2013-10-29 17:34       ` Max Reitz [this message]
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 07/11] quorum: Add quorum_getlength() Benoît Canet
2013-10-04 14:49   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 08/11] quorum: Add quorum_invalidate_cache() Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 09/11] quorum: Add quorum_co_get_block_status Benoît Canet
2013-10-04 14:51   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 10/11] quorum: Add quorum_co_flush() Benoît Canet
2013-10-04 14:53   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 11/11] quorum: Add quorum_open() and quorum_close() Benoît Canet
2013-10-04 15:14   ` Max Reitz
2013-10-04 15:31     ` Eric Blake
2013-10-07  7:23       ` Max Reitz
2013-10-04 14:31 ` [Qemu-devel] [PATCH V9 00/11] Quorum block driver Max Reitz
2013-10-04 22:25   ` Benoît Canet
2013-10-29  7:59   ` Benoît Canet
2013-10-29 17:55     ` Max Reitz
2013-10-29 17:56       ` Max Reitz
2013-10-29 20:06         ` Benoît Canet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=526FF1A9.50208@redhat.com \
    --to=mreitz@redhat.com \
    --cc=benoit.canet@irqsave.net \
    --cc=kwolf@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).