All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Benoît Canet" <benoit.canet@irqsave.net>
To: Liu Yuan <namei.unix@gmail.com>
Cc: Kevin Wolf <kwolf@redhat.com>,
	qemu-devel@nongnu.org, Stefan Hajnoczi <stefanha@redhat.com>
Subject: Re: [Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic
Date: Mon, 1 Sep 2014 11:37:20 +0200	[thread overview]
Message-ID: <20140901093720.GK15537@irqsave.net> (raw)
In-Reply-To: <1409557394-11853-9-git-send-email-namei.unix@gmail.com>

The Monday 01 Sep 2014 à 15:43:14 (+0800), Liu Yuan wrote :
> For some configurations, quorum allows VMs to continue while some child devices
> are broken. When the child devices are repaired and come back, we need to
> sync the bits dirtied during the downtime to keep the data consistent.
> 
> The recovery logic is based on the driver state bitmap and will sync the dirty
> bits with a timeslice window in a coroutine in this primitive implementation.
> 
> Simple graph about 2 children with threshold=1 and read-pattern=fifo:
> 
> + denote device sync iteration
> - IO on a single device
> = IO on two devices
> 
>                                       sync complete, release dirty bitmap
>                                          ^
>                                          |
>   ====-----------------++++----++++----++==========
>      |                 |
>      |                 v
>      |               device repaired and begin to sync
>      v
>    device broken, create a dirty bitmap
> 
>   This sync logic can take care of the nested breakage problem, i.e. devices
>   that break again while a sync is in progress. We just start a new sync process
>   after the devices are repaired again, and switch the devices from broken to
>   sound only when the sync completes.
> 
> For read-pattern=quorum mode, it enjoys the recovery logic without any problem.
> 
> Cc: Eric Blake <eblake@redhat.com>
> Cc: Benoit Canet <benoit@irqsave.net>
> Cc: Kevin Wolf <kwolf@redhat.com>
> Cc: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Liu Yuan <namei.unix@gmail.com>
> ---
>  block/quorum.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  trace-events   |   5 ++
>  2 files changed, 191 insertions(+), 3 deletions(-)
> 
> diff --git a/block/quorum.c b/block/quorum.c
> index 7b07e35..ffd7c2d 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -23,6 +23,7 @@
>  #include "qapi/qmp/qlist.h"
>  #include "qapi/qmp/qstring.h"
>  #include "qapi-event.h"
> +#include "trace.h"
>  
>  #define HASH_LENGTH 32
>  
> @@ -31,6 +32,10 @@
>  #define QUORUM_OPT_REWRITE        "rewrite-corrupted"
>  #define QUORUM_OPT_READ_PATTERN   "read-pattern"
>  
> +#define SLICE_TIME          100000000ULL /* 100 ms */
> +#define CHUNK_SIZE          (1 << 20) /* 1M */
> +#define SECTORS_PER_CHUNK   (CHUNK_SIZE >> BDRV_SECTOR_BITS)
> +
>  /* This union holds a vote hash value */
>  typedef union QuorumVoteValue {
>      char h[HASH_LENGTH];       /* SHA-256 hash */
> @@ -64,6 +69,7 @@ typedef struct QuorumVotes {
>  
>  /* the following structure holds the state of one quorum instance */
>  typedef struct BDRVQuorumState {
> +    BlockDriverState *mybs;/* Quorum block driver base state */
>      BlockDriverState **bs; /* children BlockDriverStates */
>      int num_children;      /* children count */
>      int threshold;         /* if less than threshold children reads gave the
> @@ -82,6 +88,10 @@ typedef struct BDRVQuorumState {
>                              */
>  
>      QuorumReadPattern read_pattern;
> +    BdrvDirtyBitmap *dirty_bitmap;
> +    uint8_t *sync_buf;
> +    HBitmapIter hbi;
> +    int64_t sector_num;
>  } BDRVQuorumState;
>  
>  typedef struct QuorumAIOCB QuorumAIOCB;
> @@ -290,12 +300,11 @@ static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
>      }
>  }
>  
> -static int next_fifo_child(QuorumAIOCB *acb)
> +static int get_good_child(BDRVQuorumState *s, int iter)
>  {
> -    BDRVQuorumState *s = acb->common.bs->opaque;
>      int i;
>  
> -    for (i = acb->child_iter; i < s->num_children; i++) {
> +    for (i = iter; i < s->num_children; i++) {
>          if (!s->bs[i]->broken) {
>              break;
>          }
> @@ -306,6 +315,13 @@ static int next_fifo_child(QuorumAIOCB *acb)
>      return i;
>  }
>  
> +static int next_fifo_child(QuorumAIOCB *acb)
> +{
> +    BDRVQuorumState *s = acb->common.bs->opaque;
> +
> +    return get_good_child(s, acb->child_iter);
> +}
> +
>  static void quorum_aio_cb(void *opaque, int ret)
>  {
>      QuorumChildRequest *sacb = opaque;
> @@ -951,6 +967,171 @@ static int parse_read_pattern(const char *opt)
>      return -EINVAL;
>  }
>  
> +static void sync_prepare(BDRVQuorumState *qs, int64_t *num)
> +{
> +    int64_t nb, total = bdrv_nb_sectors(qs->mybs);
> +
> +    qs->sector_num = hbitmap_iter_next(&qs->hbi);
> +    /* Wrap around if previous bits get dirty while syncing */
> +    if (qs->sector_num < 0) {
> +        bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> +        qs->sector_num = hbitmap_iter_next(&qs->hbi);
> +        assert(qs->sector_num >= 0);
> +    }
> +
> +    for (nb = 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < total;
> +         nb++) {
> +        if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num + nb)) {
> +            break;
> +        }
> +    }
> +    *num = nb;
> +}
> +
> +static void sync_finish(BDRVQuorumState *qs, int64_t num)
> +{
> +    int64_t i;
> +
> +    for (i = 0; i < num; i++) {
> +        /* We need to advance the iterator manually */
> +        hbitmap_iter_next(&qs->hbi);
> +    }
> +    bdrv_reset_dirty(qs->mybs, qs->sector_num, num);
> +}
> +
> +static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState *target)
> +{
> +    BlockDriverState *source;
> +    QEMUIOVector qiov;
> +    int ret, good;
> +    int64_t nb_sectors;
> +    struct iovec iov;
> +    const char *sname, *tname = bdrv_get_filename(target);
> +
> +    good = get_good_child(qs, 0);
> +    if (good < 0) {
> +        error_report("No good device available.");
> +        return -1;
> +    }
> +    source = qs->bs[good];
> +    sname = bdrv_get_filename(source);
> +    sync_prepare(qs, &nb_sectors);
> +    iov.iov_base = qs->sync_buf;
> +    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&qiov, &iov, 1);
> +
> +    trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_sectors);
> +    ret = bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov);
> +    if (ret < 0) {
> +        error_report("Read source %s failed.", sname);

I didn't read this patch thoroughly, but in quorum, if you need to name a child BDS,
you must use bs->node_name.

bs->node_name was introduced to be able to merge quorum and uniquely identify a given
node of the BDS graph.

Best regards

Benoît

> +        return ret;
> +    }
> +    ret = bdrv_co_writev(target, qs->sector_num, nb_sectors, &qiov);
> +    if (ret < 0) {
> +        error_report("Write target %s failed.", tname);
> +        return ret;
> +    }
> +    sync_finish(qs, nb_sectors);
> +
> +    return 0;
> +}
> +
> +static int quorum_sync_device(BDRVQuorumState *qs, BlockDriverState *target)
> +{
> +    uint64_t last_pause_ns;
> +
> +    bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi);
> +    last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +    for (;;) {
> +        int64_t cnt;
> +
> +        cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
> +        if (cnt == 0) {
> +            break;
> +        }
> +        error_report("count %ld", cnt);
> +        if (quorum_sync_iteration(qs, target) < 0) {
> +            return -1;
> +        }
> +        cnt = bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap);
> +        if (cnt == 0) {
> +            break;
> +        }
> +
> +        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns >=
> +            SLICE_TIME) {
> +            co_aio_sleep_ns(bdrv_get_aio_context(target), QEMU_CLOCK_REALTIME,
> +                            SLICE_TIME);
> +            last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static BlockDriverState *file_to_bs(BDRVQuorumState *qs, BlockDriverState *file)
> +{
> +    int i;
> +
> +    for (i = 0; i < qs->num_children; i++) {
> +        BlockDriverState *f = bdrv_get_file(qs->bs[i]);
> +
> +        if (f == file) {
> +            return qs->bs[i];
> +        }
> +    }
> +
> +    error_report("Can't find driver state for %s", bdrv_get_filename(file));
> +    abort();
> +}
> +
> +static void quorum_driver_reconnect(BlockDriverState *file)
> +{
> +    BDRVQuorumState *qs = file->drv_opaque;
> +    BlockDriverState *bs = file_to_bs(qs, file);
> +    const char *name = bdrv_get_filename(bs);
> +
> +    trace_quorum_driver_reconnect(name);
> +    assert(bs->broken == true);
> +    if (quorum_sync_device(qs, bs) < 0) {
> +        error_report("Failed to sync device %s", name);
> +        return;
> +    }
> +
> +    bdrv_release_dirty_bitmap(qs->mybs, qs->dirty_bitmap);
> +    qemu_vfree(qs->sync_buf);
> +    bs->broken = false;
> +}
> +
> +static void quorum_driver_disconnect(BlockDriverState *file)
> +{
> +    BDRVQuorumState *qs = file->drv_opaque;
> +    BlockDriverState *bs = file_to_bs(qs, file);
> +    const char *name = bdrv_get_filename(bs);
> +
> +    trace_quorum_driver_disconnect(name);
> +    /*
> +     * If we are disconnected while being syncing, we expect to reconnect to the
> +     * target again and resume the data sync from the last synced point.
> +     */
> +    if (bs->broken) {
> +        return;
> +    }
> +
> +    bs->broken = true;
> +    qs->dirty_bitmap = bdrv_create_dirty_bitmap(qs->mybs, BDRV_SECTOR_SIZE,
> +                                                NULL);
> +    if (!qs->dirty_bitmap) {
> +        abort();
> +    }
> +    qs->sync_buf = qemu_blockalign(bs, CHUNK_SIZE);
> +}
> +
> +static const BlockDrvOps quorum_block_drv_ops = {
> +    .driver_reconnect = quorum_driver_reconnect,
> +    .driver_disconnect = quorum_driver_disconnect,
> +};
> +
>  static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
>                         Error **errp)
>  {
> @@ -975,6 +1156,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
>          goto exit;
>      }
>  
> +    s->mybs = bs;
>      /* count how many different children are present */
>      s->num_children = qlist_size(list);
>      if (s->num_children < 2) {
> @@ -1061,6 +1243,7 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
>              goto close_exit;
>          }
>          opened[i] = true;
> +        bdrv_set_drv_ops(bdrv_get_file(s->bs[i]), &quorum_block_drv_ops, s);
>      }
>  
>      g_free(opened);
> diff --git a/trace-events b/trace-events
> index 81bc915..8da0a13 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -572,6 +572,11 @@ qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t o
>  qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64
>  qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
>  
> +# block/quorum.c
> +quorum_sync_iteration(const char *source, const char *target, int64_t sector, int num) "%s -> %s, sector %"PRId64" nb_sectors %d"
> +quorum_driver_reconnect(const char *target) "%s"
> +quorum_driver_disconnect(const char *target) "%s"
> +
>  # hw/display/g364fb.c
>  g364fb_read(uint64_t addr, uint32_t val) "read addr=0x%"PRIx64": 0x%x"
>  g364fb_write(uint64_t addr, uint32_t new) "write addr=0x%"PRIx64": 0x%x"
> -- 
> 1.9.1
> 

  reply	other threads:[~2014-09-01  9:38 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-09-01  7:43 [Qemu-devel] [PATCH 0/8] add basic recovery logic to quorum driver Liu Yuan
2014-09-01  7:43 ` [Qemu-devel] [PATCH 1/8] block/quorum: initialize qcrs.aiocb for read Liu Yuan
2014-09-01  7:43 ` [Qemu-devel] [PATCH 2/8] block: add driver operation callbacks Liu Yuan
2014-09-01  8:28   ` Benoît Canet
2014-09-01  9:19     ` Liu Yuan
2014-09-01  9:28       ` Benoît Canet
2014-09-01  9:40         ` Liu Yuan
2014-09-01  7:43 ` [Qemu-devel] [PATCH 3/8] block/sheepdog: propagate disconnect/reconnect events to upper driver Liu Yuan
2014-09-01  8:31   ` Benoît Canet
2014-09-01  9:22     ` Liu Yuan
2014-09-01  7:43 ` [Qemu-devel] [PATCH 4/8] block/quorum: add quorum_aio_release() helper Liu Yuan
2014-09-01  8:33   ` Benoît Canet
2014-09-01  7:43 ` [Qemu-devel] [PATCH 5/8] quorum: fix quorum_aio_cancel() Liu Yuan
2014-09-01  8:35   ` Benoît Canet
2014-09-01  9:26     ` Liu Yuan
2014-09-01  9:32       ` Benoît Canet
2014-09-01  9:46         ` Liu Yuan
2014-09-01  7:43 ` [Qemu-devel] [PATCH 6/8] block/quorum: add broken state to BlockDriverState Liu Yuan
2014-09-01  8:57   ` Benoît Canet
2014-09-01  9:30     ` Liu Yuan
2014-09-01  7:43 ` [Qemu-devel] [PATCH 7/8] block: add two helpers Liu Yuan
2014-09-01  8:59   ` Benoît Canet
2014-09-01  7:43 ` [Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic Liu Yuan
2014-09-01  9:37   ` Benoît Canet [this message]
2014-09-01  9:45     ` Liu Yuan
2014-09-01  8:19 ` [Qemu-devel] [PATCH 0/8] add basic recovery logic to quorum driver Benoît Canet
2014-09-02 22:19 ` Benoît Canet
2014-09-10  7:31   ` Liu Yuan
2014-09-07 15:12 ` Benoît Canet
2014-09-10  7:18   ` Liu Yuan
2014-09-10 13:12     ` Benoît Canet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140901093720.GK15537@irqsave.net \
    --to=benoit.canet@irqsave.net \
    --cc=kwolf@redhat.com \
    --cc=namei.unix@gmail.com \
    --cc=qemu-devel@nongnu.org \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.