From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:47620) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1XOO3w-0008NN-Rw for qemu-devel@nongnu.org; Mon, 01 Sep 2014 05:38:17 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1XOO3r-0000b0-Mz for qemu-devel@nongnu.org; Mon, 01 Sep 2014 05:38:12 -0400 Received: from lputeaux-656-01-25-125.w80-12.abo.wanadoo.fr ([80.12.84.125]:38486 helo=paradis.irqsave.net) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1XOO3r-0000ap-Ce for qemu-devel@nongnu.org; Mon, 01 Sep 2014 05:38:07 -0400 Date: Mon, 1 Sep 2014 11:37:20 +0200 From: =?iso-8859-1?Q?Beno=EEt?= Canet Message-ID: <20140901093720.GK15537@irqsave.net> References: <1409557394-11853-1-git-send-email-namei.unix@gmail.com> <1409557394-11853-9-git-send-email-namei.unix@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=iso-8859-1 Content-Disposition: inline In-Reply-To: <1409557394-11853-9-git-send-email-namei.unix@gmail.com> Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH 8/8] quorum: add basic device recovery logic List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Liu Yuan Cc: Kevin Wolf , qemu-devel@nongnu.org, Stefan Hajnoczi The Monday 01 Sep 2014 =E0 15:43:14 (+0800), Liu Yuan wrote : > For some configuration, quorum allow VMs to continue while some child d= evices > are broken and when the child devices are repaired and return back, we = need to > sync dirty bits during downtime to keep data consistency. >=20 > The recovery logic is based on the driver state bitmap and will sync th= e dirty > bits with a timeslice window in a coroutine in this prtimive implementa= tion. >=20 > Simple graph about 2 children with threshold=3D1 and read-pattern=3Dfif= o: >=20 > + denote device sync iteration > - IO on a single device > =3D IO on two devices >=20 > sync complete, release dirty bitm= ap > ^ > | > =3D=3D=3D=3D-----------------++++----++++----++=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D > | | > | v > | device repaired and begin to sync > v > device broken, create a dirty bitmap >=20 > This sync logic can take care of nested broken problem, that devices = are > broken while in sync. We just start a sync process after the devices = are > repaired again and switch the devices from broken to sound only when = the sync > completes. >=20 > For read-pattern=3Dquorum mode, it enjoys the recovery logic without an= y problem. >=20 > Cc: Eric Blake > Cc: Benoit Canet > Cc: Kevin Wolf > Cc: Stefan Hajnoczi > Signed-off-by: Liu Yuan > --- > block/quorum.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++++= +++++++- > trace-events | 5 ++ > 2 files changed, 191 insertions(+), 3 deletions(-) >=20 > diff --git a/block/quorum.c b/block/quorum.c > index 7b07e35..ffd7c2d 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -23,6 +23,7 @@ > #include "qapi/qmp/qlist.h" > #include "qapi/qmp/qstring.h" > #include "qapi-event.h" > +#include "trace.h" > =20 > #define HASH_LENGTH 32 > =20 > @@ -31,6 +32,10 @@ > #define QUORUM_OPT_REWRITE "rewrite-corrupted" > #define QUORUM_OPT_READ_PATTERN "read-pattern" > =20 > +#define SLICE_TIME 100000000ULL /* 100 ms */ > +#define CHUNK_SIZE (1 << 20) /* 1M */ > +#define SECTORS_PER_CHUNK (CHUNK_SIZE >> BDRV_SECTOR_BITS) > + > /* This union holds a vote hash value */ > typedef union QuorumVoteValue { > char h[HASH_LENGTH]; /* SHA-256 hash */ > @@ -64,6 +69,7 @@ typedef struct QuorumVotes { > =20 > /* the following structure holds the state of one quorum instance */ > typedef struct BDRVQuorumState { > + BlockDriverState *mybs;/* Quorum block driver base state */ > BlockDriverState **bs; /* children BlockDriverStates */ > int num_children; /* children count */ > int threshold; /* if less than threshold children reads ga= ve the > @@ -82,6 +88,10 @@ typedef struct BDRVQuorumState { > */ > =20 > QuorumReadPattern read_pattern; > + BdrvDirtyBitmap *dirty_bitmap; > + uint8_t *sync_buf; > + HBitmapIter hbi; > + int64_t sector_num; > } BDRVQuorumState; > =20 > typedef struct QuorumAIOCB QuorumAIOCB; > @@ -290,12 +300,11 @@ static void quorum_copy_qiov(QEMUIOVector *dest, = QEMUIOVector *source) > } > } > =20 > -static int next_fifo_child(QuorumAIOCB *acb) > +static int get_good_child(BDRVQuorumState *s, int iter) > { > - BDRVQuorumState *s =3D acb->common.bs->opaque; > int i; > =20 > - for (i =3D acb->child_iter; i < s->num_children; i++) { > + for (i =3D iter; i < s->num_children; i++) { > if (!s->bs[i]->broken) { > break; > } > @@ -306,6 +315,13 @@ static int next_fifo_child(QuorumAIOCB *acb) > return i; > } > =20 > +static int next_fifo_child(QuorumAIOCB *acb) > +{ > + BDRVQuorumState *s =3D acb->common.bs->opaque; > + > + return get_good_child(s, acb->child_iter); > +} > + > static void quorum_aio_cb(void *opaque, int ret) > { > QuorumChildRequest *sacb =3D opaque; > @@ -951,6 +967,171 @@ static int parse_read_pattern(const char *opt) > return -EINVAL; > } > =20 > +static void sync_prepare(BDRVQuorumState *qs, int64_t *num) > +{ > + int64_t nb, total =3D bdrv_nb_sectors(qs->mybs); > + > + qs->sector_num =3D hbitmap_iter_next(&qs->hbi); > + /* Wrap around if previous bits get dirty while syncing */ > + if (qs->sector_num < 0) { > + bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi); > + qs->sector_num =3D hbitmap_iter_next(&qs->hbi); > + assert(qs->sector_num >=3D 0); > + } > + > + for (nb =3D 1; nb < SECTORS_PER_CHUNK && qs->sector_num + nb < tot= al; > + nb++) { > + if (!bdrv_get_dirty(qs->mybs, qs->dirty_bitmap, qs->sector_num= + nb)) { > + break; > + } > + } > + *num =3D nb; > +} > + > +static void sync_finish(BDRVQuorumState *qs, int64_t num) > +{ > + int64_t i; > + > + for (i =3D 0; i < num; i++) { > + /* We need to advance the iterator manually */ > + hbitmap_iter_next(&qs->hbi); > + } > + bdrv_reset_dirty(qs->mybs, qs->sector_num, num); > +} > + > +static int quorum_sync_iteration(BDRVQuorumState *qs, BlockDriverState= *target) > +{ > + BlockDriverState *source; > + QEMUIOVector qiov; > + int ret, good; > + int64_t nb_sectors; > + struct iovec iov; > + const char *sname, *tname =3D bdrv_get_filename(target); > + > + good =3D get_good_child(qs, 0); > + if (good < 0) { > + error_report("No good device available."); > + return -1; > + } > + source =3D qs->bs[good]; > + sname =3D bdrv_get_filename(source); > + sync_prepare(qs, &nb_sectors); > + iov.iov_base =3D qs->sync_buf; > + iov.iov_len =3D nb_sectors * BDRV_SECTOR_SIZE; > + qemu_iovec_init_external(&qiov, &iov, 1); > + > + trace_quorum_sync_iteration(sname, tname, qs->sector_num, nb_secto= rs); > + ret =3D bdrv_co_readv(source, qs->sector_num, nb_sectors, &qiov); > + if (ret < 0) { > + error_report("Read source %s failed.", sname); I didn't read this patch throughfully but in quorum if you need to name a= child BDS you must use bs->node_name. bs->node_name was introduced to be able to merge quorum and uniquely iden= tify a given node of the BDS graph. Best regards Beno=EEt > + return ret; > + } > + ret =3D bdrv_co_writev(target, qs->sector_num, nb_sectors, &qiov); > + if (ret < 0) { > + error_report("Write target %s failed.", tname); > + return ret; > + } > + sync_finish(qs, nb_sectors); > + > + return 0; > +} > + > +static int quorum_sync_device(BDRVQuorumState *qs, BlockDriverState *t= arget) > +{ > + uint64_t last_pause_ns; > + > + bdrv_dirty_iter_init(qs->mybs, qs->dirty_bitmap, &qs->hbi); > + last_pause_ns =3D qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > + for (;;) { > + int64_t cnt; > + > + cnt =3D bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap); > + if (cnt =3D=3D 0) { > + break; > + } > + error_report("count %ld", cnt); > + if (quorum_sync_iteration(qs, target) < 0) { > + return -1; > + } > + cnt =3D bdrv_get_dirty_count(qs->mybs, qs->dirty_bitmap); > + if (cnt =3D=3D 0) { > + break; > + } > + > + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns >=3D > + SLICE_TIME) { > + co_aio_sleep_ns(bdrv_get_aio_context(target), QEMU_CLOCK_R= EALTIME, > + SLICE_TIME); > + last_pause_ns =3D qemu_clock_get_ns(QEMU_CLOCK_REALTIME); > + } > + } > + > + return 0; > +} > + > +static BlockDriverState *file_to_bs(BDRVQuorumState *qs, BlockDriverSt= ate *file) > +{ > + int i; > + > + for (i =3D 0; i < qs->num_children; i++) { > + BlockDriverState *f =3D bdrv_get_file(qs->bs[i]); > + > + if (f =3D=3D file) { > + return qs->bs[i]; > + } > + } > + > + error_report("Can't find driver state for %s", bdrv_get_filename(f= ile)); > + abort(); > +} > + > +static void quorum_driver_reconnect(BlockDriverState *file) > +{ > + BDRVQuorumState *qs =3D file->drv_opaque; > + BlockDriverState *bs =3D file_to_bs(qs, file); > + const char *name =3D bdrv_get_filename(bs); > + > + trace_quorum_driver_reconnect(name); > + assert(bs->broken =3D=3D true); > + if (quorum_sync_device(qs, bs) < 0) { > + error_report("Failed to sync device %s", name); > + return; > + } > + > + bdrv_release_dirty_bitmap(qs->mybs, qs->dirty_bitmap); > + qemu_vfree(qs->sync_buf); > + bs->broken =3D false; > +} > + > +static void quorum_driver_disconnect(BlockDriverState *file) > +{ > + BDRVQuorumState *qs =3D file->drv_opaque; > + BlockDriverState *bs =3D file_to_bs(qs, file); > + const char *name =3D bdrv_get_filename(bs); > + > + trace_quorum_driver_disconnect(name); > + /* > + * If we are disconnected while being syncing, we expect to reconn= ect to the > + * target again and resume the data sync from the last synced poin= t. > + */ > + if (bs->broken) { > + return; > + } > + > + bs->broken =3D true; > + qs->dirty_bitmap =3D bdrv_create_dirty_bitmap(qs->mybs, BDRV_SECTO= R_SIZE, > + NULL); > + if (!qs->dirty_bitmap) { > + abort(); > + } > + qs->sync_buf =3D qemu_blockalign(bs, CHUNK_SIZE); > +} > + > +static const BlockDrvOps quorum_block_drv_ops =3D { > + .driver_reconnect =3D quorum_driver_reconnect, > + .driver_disconnect =3D quorum_driver_disconnect, > +}; > + > static int quorum_open(BlockDriverState *bs, QDict *options, int flags= , > Error **errp) > { > @@ -975,6 +1156,7 @@ static int quorum_open(BlockDriverState *bs, QDict= *options, int flags, > goto exit; > } > =20 > + s->mybs =3D bs; > /* count how many different children are present */ > s->num_children =3D qlist_size(list); > if (s->num_children < 2) { > @@ -1061,6 +1243,7 @@ static int quorum_open(BlockDriverState *bs, QDic= t *options, int flags, > goto close_exit; > } > opened[i] =3D true; > + bdrv_set_drv_ops(bdrv_get_file(s->bs[i]), &quorum_block_drv_op= s, s); > } > =20 > g_free(opened); > diff --git a/trace-events b/trace-events > index 81bc915..8da0a13 100644 > --- a/trace-events > +++ b/trace-events > @@ -572,6 +572,11 @@ qed_aio_write_prefill(void *s, void *acb, uint64_t= start, size_t len, uint64_t o > qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len,= uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 > qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_= t len) "s %p acb %p ret %d offset %"PRIu64" len %zu" > =20 > +# block/quorum.c > +quorum_sync_iteration(const char *source, const char *target, int64_t = sector, int num) "%s -> %s, sector %"PRId64" nb_sectors %d" > +quorum_driver_reconnect(const char *target) "%s" > +quorum_driver_disconnect(const char *target) "%s" > + > # hw/display/g364fb.c > g364fb_read(uint64_t addr, uint32_t val) "read addr=3D0x%"PRIx64": 0x%= x" > g364fb_write(uint64_t addr, uint32_t new) "write addr=3D0x%"PRIx64": 0= x%x" > --=20 > 1.9.1 >=20