All of lore.kernel.org
 help / color / mirror / Atom feed
From: Max Reitz <mreitz@redhat.com>
To: "Benoît Canet" <benoit.canet@irqsave.net>
Cc: kwolf@redhat.com, qemu-devel@nongnu.org, stefanha@redhat.com
Subject: Re: [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism.
Date: Tue, 29 Oct 2013 18:34:33 +0100	[thread overview]
Message-ID: <526FF1A9.50208@redhat.com> (raw)
In-Reply-To: <20131028130401.GC2890@irqsave.net>

Am 28.10.2013 14:04, schrieb Benoît Canet:
> Le Friday 04 Oct 2013 à 16:48:12 (+0200), Max Reitz a écrit :
>> On 2013-10-02 14:39, Benoît Canet wrote:
>>> Use gnutls's SHA-256 to compare versions.
>> Wouldn't CRC32 suffice? (I don't really oppose using SHA, but taking
>> in gnutls as a dependency just for comparing several memory areas
>> seems a bit much to me)
>>
>>> Signed-off-by: Benoit Canet <benoit@irqsave.net>
>>> ---
>>>  block/Makefile.objs       |   2 +-
>>>  block/quorum.c            | 321 +++++++++++++++++++++++++++++++++++++++++++++-
>>>  configure                 |  36 ++++++
>>>  include/monitor/monitor.h |   2 +
>>>  monitor.c                 |   2 +
>>>  5 files changed, 361 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>>> index 05a65c2..adcdc21 100644
>>> --- a/block/Makefile.objs
>>> +++ b/block/Makefile.objs
>>> @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-c
>>>  block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>>  block-obj-y += qed-check.o
>>>  block-obj-y += vhdx.o
>>> -block-obj-y += quorum.o
>>> +block-obj-$(CONFIG_QUORUM) += quorum.o
>>>  block-obj-y += parallels.o blkdebug.o blkverify.o
>>>  block-obj-y += snapshot.o qapi.o
>>>  block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
>>> diff --git a/block/quorum.c b/block/quorum.c
>>> index f0fc0e9..e235ac1 100644
>>> --- a/block/quorum.c
>>> +++ b/block/quorum.c
>>> @@ -13,7 +13,43 @@
>>>   * See the COPYING file in the top-level directory.
>>>   */
>>> +#include <gnutls/gnutls.h>
>>> +#include <gnutls/crypto.h>
>>>  #include "block/block_int.h"
>>> +#include "qapi/qmp/qjson.h"
>>> +
>>> +#define HASH_LENGTH 32
>>> +
>>> +/* This union hold a vote hash value */
>> *holds
>>
>>> +typedef union QuorumVoteValue {
>>> +    char h[HASH_LENGTH];       /* SHA-256 hash */
>>> +    int64_t l;                 /* simpler 64 bits hash */
>>> +} QuorumVoteValue;
>>> +
>>> +/* A vote item */
>>> +typedef struct QuorumVoteItem {
>>> +    int index;
>>> +    QLIST_ENTRY(QuorumVoteItem) next;
>>> +} QuorumVoteItem;
>>> +
>>> +/* this structure is a vote version. A version is the set of vote sharing the
>> *set of votes
>>
>>> + * same vote value.
>>> + * The set of vote will be tracked with the items field and it's count is
>> *set of votes or *vote set; also s/it's count/its cardinality/ or
>> something like that
>>
>>> + * vote_count.
>>> + */
>>> +typedef struct QuorumVoteVersion {
>>> +    QuorumVoteValue value;
>>> +    int index;
>>> +    int vote_count;
>>> +    QLIST_HEAD(, QuorumVoteItem) items;
>>> +    QLIST_ENTRY(QuorumVoteVersion) next;
>>> +} QuorumVoteVersion;
>>> +
>>> +/* this structure hold a group of vote versions together */
>> *holds
>>
>>> +typedef struct QuorumVotes {
>>> +    QLIST_HEAD(, QuorumVoteVersion) vote_list;
>>> +    int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b);
>>> +} QuorumVotes;
>>>  /* the following structure hold the state of one quorum instance */
>>>  typedef struct {
>>> @@ -60,10 +96,14 @@ struct QuorumAIOCB {
>>>      int success_count;          /* number of successfully completed AIOCB */
>>>      bool *finished;             /* completion signal for cancel */
>>> +    QuorumVotes votes;
>>> +
>>>      bool is_read;
>>>      int vote_ret;
>>>  };
>>> +static void quorum_vote(QuorumAIOCB *acb);
>>> +
>>>  static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
>>>  {
>>>      QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common);
>>> @@ -111,6 +151,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
>>>          acb->aios[i].ret = 0;
>>>      }
>>> +    if (acb->vote_ret) {
>>> +        ret = acb->vote_ret;
>>> +    }
>>> +
>>>      acb->common.cb(acb->common.opaque, ret);
>>>      if (acb->finished) {
>>>          *acb->finished = true;
>>> @@ -122,6 +166,11 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
>>>      qemu_aio_release(acb);
>>>  }
>>> +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b)
>>> +{
>>> +    return memcmp(a->h, b->h, HASH_LENGTH);
>>> +}
>>> +
>>>  static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
>>>                                     BlockDriverState *bs,
>>>                                     QEMUIOVector *qiov,
>>> @@ -141,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
>>>      acb->count = 0;
>>>      acb->success_count = 0;
>>>      acb->finished = NULL;
>>> +    acb->votes.compare = quorum_sha256_compare;
>>>      acb->is_read = false;
>>>      acb->vote_ret = 0;
>>> @@ -170,9 +220,278 @@ static void quorum_aio_cb(void *opaque, int ret)
>>>          return;
>>>      }
>>> +    /* Do the vote on read */
>>> +    if (acb->is_read) {
>>> +        quorum_vote(acb);
>>> +    }
>>> +
>>>      quorum_aio_finalize(acb);
>>>  }
>>> +static void quorum_report_bad(QuorumAIOCB *acb, int index)
>>> +{
>>> +    QObject *data;
>>> +    data = qobject_from_jsonf("{ 'children-index': %i"
>> I'd prefer child-index. Generally, remember that the singular of
>> "children" is "child".
>>
>>> +                              ", 'sector-num': %" PRId64
>>> +                              ", 'sectors-count': %i }",
>>> +                              index,
>>> +                              acb->sector_num,
>>> +                              acb->nb_sectors);
>>> +    monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data);
>> How about decrementing the refcount of data?
>>
>>> +}
>>> +
>>> +static void quorum_report_failure(QuorumAIOCB *acb)
>>> +{
>>> +    QObject *data;
>>> +    data = qobject_from_jsonf("{ 'sector-num': %" PRId64
>>> +                              ", 'sectors-count': %i }",
>>> +                              acb->sector_num,
>>> +                              acb->nb_sectors);
>>> +    monitor_protocol_event(QEVENT_QUORUM_FAILURE, data);
>> Same here.
>>
>>> +}
>>> +
>>> +static void quorum_report_bad_versions(QuorumAIOCB *acb,
>>> +                                      QuorumVoteValue *value)
>>> +{
>>> +    QuorumVoteVersion *version;
>>> +    QuorumVoteItem *item;
>>> +
>>> +    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
>>> +        if (!acb->votes.compare(&version->value, value)) {
>>> +            continue;
>>> +        }
>>> +        QLIST_FOREACH(item, &version->items, next) {
>>> +            quorum_report_bad(acb, item->index);
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
>>> +{
>>> +    int i;
>>> +    assert(dest->niov == source->niov);
>>> +    assert(dest->size == source->size);
>>> +    for (i = 0; i < source->niov; i++) {
>>> +        assert(dest->iov[i].iov_len == source->iov[i].iov_len);
>>> +        memcpy(dest->iov[i].iov_base,
>>> +               source->iov[i].iov_base,
>>> +               source->iov[i].iov_len);
>>> +    }
>>> +}
>>> +
>>> +static void quorum_count_vote(QuorumVotes *votes,
>>> +                              QuorumVoteValue *value,
>>> +                              int index)
>>> +{
>>> +    QuorumVoteVersion *v = NULL, *version = NULL;
>>> +    QuorumVoteItem *item;
>>> +
>>> +    /* look if we have something with this hash */
>>> +    QLIST_FOREACH(v, &votes->vote_list, next) {
>>> +        if (!votes->compare(&v->value, value)) {
>>> +            version = v;
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    /* It's a version not yet in the list add it */
>>> +    if (!version) {
>>> +        version = g_new0(QuorumVoteVersion, 1);
>>> +        QLIST_INIT(&version->items);
>>> +        memcpy(&version->value, value, sizeof(version->value));
>>> +        version->index = index;
>>> +        version->vote_count = 0;
>>> +        QLIST_INSERT_HEAD(&votes->vote_list, version, next);
>>> +    }
>>> +
>>> +    version->vote_count++;
>>> +
>>> +    item = g_new0(QuorumVoteItem, 1);
>>> +    item->index = index;
>>> +    QLIST_INSERT_HEAD(&version->items, item, next);
>>> +}
>>> +
>>> +static void quorum_free_vote_list(QuorumVotes *votes)
>>> +{
>>> +    QuorumVoteVersion *version, *next_version;
>>> +    QuorumVoteItem *item, *next_item;
>>> +
>>> +    QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) {
>>> +        QLIST_REMOVE(version, next);
>>> +        QLIST_FOREACH_SAFE(item, &version->items, next, next_item) {
>>> +            QLIST_REMOVE(item, next);
>>> +            g_free(item);
>>> +        }
>>> +        g_free(version);
>>> +    }
>>> +}
>>> +
>>> +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash)
>>> +{
>>> +    int j, ret;
>>> +    gnutls_hash_hd_t dig;
>>> +    QEMUIOVector *qiov = &acb->aios[i].qiov;
>>> +
>>> +    ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256);
>>> +
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    for (j = 0; j < qiov->niov; j++) {
>>> +        ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len);
>>> +        if (ret < 0) {
>>> +            return ret;
>> gnutls_hash_deinit?
>>
>>> +        }
>>> +    }
>>> +
>>> +    gnutls_hash_deinit(dig, (void *) hash);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes)
>>> +{
>>> +    int i = 0;
>>> +    QuorumVoteVersion *candidate, *winner = NULL;
>>> +
>>> +    QLIST_FOREACH(candidate, &votes->vote_list, next) {
>>> +        if (candidate->vote_count > i) {
>>> +            i = candidate->vote_count;
>>> +            winner = candidate;
>>> +        }
>>> +    }
>>> +
>>> +    return winner;
>>> +}
>>> +
>>> +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
>>> +{
>>> +    int i;
>>> +    int result;
>>> +
>>> +    assert(a->niov == b->niov);
>>> +    for (i = 0; i < a->niov; i++) {
>>> +        assert(a->iov[i].iov_len == b->iov[i].iov_len);
>>> +        result = memcmp(a->iov[i].iov_base,
>>> +                        b->iov[i].iov_base,
>>> +                        a->iov[i].iov_len);
>>> +        if (result) {
>>> +            return false;
>>> +        }
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb,
>>> +                                          const char *fmt, ...)
>>> +{
>>> +    va_list ap;
>>> +
>>> +    va_start(ap, fmt);
>>> +    fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ",
>>> +            acb->sector_num, acb->nb_sectors);
>>> +    vfprintf(stderr, fmt, ap);
>>> +    fprintf(stderr, "\n");
>>> +    va_end(ap);
>>> +    exit(1);
>>> +}
>>> +
>>> +static bool quorum_compare(QuorumAIOCB *acb,
>>> +                           QEMUIOVector *a,
>>> +                           QEMUIOVector *b)
>>> +{
>>> +    BDRVQuorumState *s = acb->bqs;
>>> +    ssize_t offset;
>>> +
>>> +    /* This driver will replace blkverify in this particular case */
>>> +    if (s->is_blkverify) {
>>> +        offset = qemu_iovec_compare(a, b);
>>> +        if (offset != -1) {
>>> +            quorum_err(acb, "contents mismatch in sector %" PRId64,
>>> +                       acb->sector_num +
>>> +                       (uint64_t)(offset / BDRV_SECTOR_SIZE));
>>> +        }
>>> +        return true;
>>> +    }
>>> +
>>> +    return quorum_iovec_compare(a, b);
>>> +}
>>> +
>>> +static void quorum_vote(QuorumAIOCB *acb)
>>> +{
>>> +    bool quorum = true;
>>> +    int i, j, ret;
>>> +    QuorumVoteValue hash;
>>> +    BDRVQuorumState *s = acb->bqs;
>>> +    QuorumVoteVersion *winner;
>>> +
>>> +    /* get the index of the first successful read */
>>> +    for (i = 0; i < s->total; i++) {
>>> +        if (!acb->aios[i].ret) {
>>> +            break;
>>> +        }
>>> +    }
>>> +
>>> +    /* compare this read with all other successful read looking for quorum */
>>> +    for (j = i + 1; j < s->total; j++) {
>>> +        if (acb->aios[j].ret) {
>>> +            continue;
>>> +        }
>>> +        quorum = quorum_compare(acb, &acb->aios[i].qiov, &acb->aios[j].qiov);
>>> +        if (!quorum) {
>>> +            break;
>>> +       }
>>> +    }
>>> +
>>> +    /* Every successful read agrees -> Quorum */
>>> +    if (quorum) {
>>> +        quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov);
>> If no reads were successful at all, quorum will be true and i will
>> be s->total. Therefore, this will result in an array access with an
>> index out of bounds here.
> Thanks a lot for spotting this.
>
>> Generally, why is this function executed at all if any read failed?
>> If a read fails, the quorum read function will return an error, thus
>> the caller will ignore the result anyway.
> I should if there is a quorum between the successful reads; if all the
> successful reads agree and their count is greater than the threshold.

Ah, I see it now; quorum_aio_readv() returns "success" if more reads
than the threshold succeeded (and returned the same result). I thought it
would fail as soon as just a single read operation failed – sorry, my fault.

Max

  reply	other threads:[~2013-10-29 17:34 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-10-02 12:39 [Qemu-devel] [PATCH V9 00/11] Quorum block driver Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 01/11] quorum: Create quorum.c, add QuorumSingleAIOCB and QuorumAIOCB Benoît Canet
2013-10-04 14:33   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 02/11] quorum: Create BDRVQuorumState and BlkDriver and do init Benoît Canet
2013-10-04 14:34   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 03/11] quorum: Add quorum_aio_writev and its dependencies Benoît Canet
2013-10-04 14:35   ` Max Reitz
2013-10-28 12:21     ` Benoît Canet
2013-10-29 17:21       ` Max Reitz
2013-10-29 20:06         ` Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 04/11] blkverify: Extract qemu_iovec_clone() and qemu_iovec_compare() from blkverify Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 05/11] quorum: Add quorum_aio_readv Benoît Canet
2013-10-04 14:38   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 06/11] quorum: Add quorum mechanism Benoît Canet
2013-10-04 14:48   ` Max Reitz
2013-10-28 12:31     ` Benoît Canet
2013-10-29 16:42       ` Max Reitz
2013-10-28 13:04     ` Benoît Canet
2013-10-29 17:34       ` Max Reitz [this message]
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 07/11] quorum: Add quorum_getlength() Benoît Canet
2013-10-04 14:49   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 08/11] quorum: Add quorum_invalidate_cache() Benoît Canet
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 09/11] quorum: Add quorum_co_get_block_status Benoît Canet
2013-10-04 14:51   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 10/11] quorum: Add quorum_co_flush() Benoît Canet
2013-10-04 14:53   ` Max Reitz
2013-10-02 12:39 ` [Qemu-devel] [PATCH V9 11/11] quorum: Add quorum_open() and quorum_close() Benoît Canet
2013-10-04 15:14   ` Max Reitz
2013-10-04 15:31     ` Eric Blake
2013-10-07  7:23       ` Max Reitz
2013-10-04 14:31 ` [Qemu-devel] [PATCH V9 00/11] Quorum block driver Max Reitz
2013-10-04 22:25   ` Benoît Canet
2013-10-29  7:59   ` Benoît Canet
2013-10-29 17:55     ` Max Reitz
2013-10-29 17:56       ` Max Reitz
2013-10-29 20:06         ` Benoît Canet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=526FF1A9.50208@redhat.com \
    --to=mreitz@redhat.com \
    --cc=benoit.canet@irqsave.net \
    --cc=kwolf@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.