From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:34311) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X0bWo-0005AK-3A for qemu-devel@nongnu.org; Fri, 27 Jun 2014 15:09:47 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1X0bWi-0006v8-Tw for qemu-devel@nongnu.org; Fri, 27 Jun 2014 15:09:41 -0400 Received: from mx1.redhat.com ([209.132.183.28]:8638) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X0bWi-0006v0-MM for qemu-devel@nongnu.org; Fri, 27 Jun 2014 15:09:36 -0400 Received: from int-mx13.intmail.prod.int.phx2.redhat.com (int-mx13.intmail.prod.int.phx2.redhat.com [10.5.11.26]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id s5RJ9aZt005923 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK) for ; Fri, 27 Jun 2014 15:09:36 -0400 From: Kevin Wolf Date: Fri, 27 Jun 2014 21:08:38 +0200 Message-Id: <1403896146-3063-20-git-send-email-kwolf@redhat.com> In-Reply-To: <1403896146-3063-1-git-send-email-kwolf@redhat.com> References: <1403896146-3063-1-git-send-email-kwolf@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: quoted-printable Subject: [Qemu-devel] [PULL 19/47] quorum: Add the rewrite-corrupted parameter to quorum List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: kwolf@redhat.com From: Beno=C3=AEt Canet On read operations, when this parameter is set and some replicas are corrupted while quorum can still be reached, quorum will proceed to rewrite the correct version of the data to fix the corrupted replicas. This will shine with SSDs, where the FTL will remap the same block to another place on rewrite.
Signed-off-by: Benoit Canet Reviewed-by: Max Reitz Signed-off-by: Kevin Wolf --- block/quorum.c | 97 ++++++++++++++++++++++++++++++++++++++++= +++--- qapi/block-core.json | 5 ++- tests/qemu-iotests/081 | 15 ++++++- tests/qemu-iotests/081.out | 10 +++++ 4 files changed, 119 insertions(+), 8 deletions(-) diff --git a/block/quorum.c b/block/quorum.c index 86802d3..d5ee9c0 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -23,6 +23,7 @@ =20 #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" #define QUORUM_OPT_BLKVERIFY "blkverify" +#define QUORUM_OPT_REWRITE "rewrite-corrupted" =20 /* This union holds a vote hash value */ typedef union QuorumVoteValue { @@ -70,6 +71,9 @@ typedef struct BDRVQuorumState { * It is useful to debug other block drivers = by * comparing them with a reference one. */ + bool rewrite_corrupted;/* true if the driver must rewrite-on-read co= rrupted + * block if Quorum is reached. + */ } BDRVQuorumState; =20 typedef struct QuorumAIOCB QuorumAIOCB; @@ -105,13 +109,17 @@ struct QuorumAIOCB { int count; /* number of completed AIOCB */ int success_count; /* number of successfully completed AIOC= B */ =20 + int rewrite_count; /* number of replica to rewrite: count d= own to + * zero once writes are fired + */ + QuorumVotes votes; =20 bool is_read; int vote_ret; }; =20 -static void quorum_vote(QuorumAIOCB *acb); +static bool quorum_vote(QuorumAIOCB *acb); =20 static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) { @@ -183,6 +191,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s= , acb->qcrs =3D g_new0(QuorumChildRequest, s->num_children); acb->count =3D 0; acb->success_count =3D 0; + acb->rewrite_count =3D 0; acb->votes.compare =3D quorum_sha256_compare; QLIST_INIT(&acb->votes.vote_list); acb->is_read =3D false; @@ -232,11 +241,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIO= CB *acb) return false; } =20 +static void quorum_rewrite_aio_cb(void *opaque, int ret) +{ + QuorumAIOCB *acb =3D opaque; + + /* one less rewrite to do */ + 
acb->rewrite_count--; + + /* wait until all rewrite callbacks have completed */ + if (acb->rewrite_count) { + return; + } + + quorum_aio_finalize(acb); +} + static void quorum_aio_cb(void *opaque, int ret) { QuorumChildRequest *sacb =3D opaque; QuorumAIOCB *acb =3D sacb->parent; BDRVQuorumState *s =3D acb->common.bs->opaque; + bool rewrite =3D false; =20 sacb->ret =3D ret; acb->count++; @@ -253,12 +278,15 @@ static void quorum_aio_cb(void *opaque, int ret) =20 /* Do the vote on read */ if (acb->is_read) { - quorum_vote(acb); + rewrite =3D quorum_vote(acb); } else { quorum_has_too_much_io_failed(acb); } =20 - quorum_aio_finalize(acb); + /* if no rewrite is done the code will finish right away */ + if (!rewrite) { + quorum_aio_finalize(acb); + } } =20 static void quorum_report_bad_versions(BDRVQuorumState *s, @@ -278,6 +306,43 @@ static void quorum_report_bad_versions(BDRVQuorumSta= te *s, } } =20 +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB = *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + int count =3D 0; + + /* first count the number of bad versions: done first to avoid concu= rrency + * issues. 
+ */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + count++; + } + } + + /* quorum_rewrite_aio_cb will count down this to zero */ + acb->rewrite_count =3D count; + + /* now fire the correcting rewrites */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qi= ov, + acb->nb_sectors, quorum_rewrite_aio_cb, acb)= ; + } + } + + /* return true if any rewrite is done else false */ + return count; +} + static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) { int i; @@ -468,16 +533,17 @@ static int quorum_vote_error(QuorumAIOCB *acb) return ret; } =20 -static void quorum_vote(QuorumAIOCB *acb) +static bool quorum_vote(QuorumAIOCB *acb) { bool quorum =3D true; + bool rewrite =3D false; int i, j, ret; QuorumVoteValue hash; BDRVQuorumState *s =3D acb->common.bs->opaque; QuorumVoteVersion *winner; =20 if (quorum_has_too_much_io_failed(acb)) { - return; + return false; } =20 /* get the index of the first successful read */ @@ -505,7 +571,7 @@ static void quorum_vote(QuorumAIOCB *acb) /* Every successful read agrees */ if (quorum) { quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); - return; + return false; } =20 /* compute hashes for each successful read, also store indexes */ @@ -538,9 +604,15 @@ static void quorum_vote(QuorumAIOCB *acb) /* some versions are bad print them */ quorum_report_bad_versions(s, acb, &winner->value); =20 + /* corruption correction is enabled */ + if (s->rewrite_corrupted) { + rewrite =3D quorum_rewrite_bad_versions(s, acb, &winner->value); + } + free_exit: /* free lists */ quorum_free_vote_list(&acb->votes); + return rewrite; } =20 static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, @@ -705,6 +777,11 @@ static QemuOptsList 
quorum_runtime_opts =3D { .type =3D QEMU_OPT_BOOL, .help =3D "Trigger block verify mode if set", }, + { + .name =3D QUORUM_OPT_REWRITE, + .type =3D QEMU_OPT_BOOL, + .help =3D "Rewrite corrupted block on read quorum", + }, { /* end of list */ } }, }; @@ -766,6 +843,14 @@ static int quorum_open(BlockDriverState *bs, QDict *= options, int flags, "and using two files with vote_threshold=3D2\n"); } =20 + s->rewrite_corrupted =3D qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE,= false); + if (s->rewrite_corrupted && s->is_blkverify) { + error_setg(&local_err, + "rewrite-corrupted=3Don cannot be used with blkverify= =3Don"); + ret =3D -EINVAL; + goto exit; + } + /* allocate the children BlockDriverState array */ s->bs =3D g_new0(BlockDriverState *, s->num_children); opened =3D g_new0(bool, s->num_children); diff --git a/qapi/block-core.json b/qapi/block-core.json index af6b436..de31f9f 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1329,12 +1329,15 @@ # # @vote-threshold: the vote limit under which a read will fail # +# @rewrite-corrupted: #optional rewrite corrupted data when quorum is re= ached +# (Since 2.1) +# # Since: 2.0 ## { 'type': 'BlockdevOptionsQuorum', 'data': { '*blkverify': 'bool', 'children': [ 'BlockdevRef' ], - 'vote-threshold': 'int' } } + 'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } } =20 ## # @BlockdevOptions diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081 index b512d00..7ae4be2 100755 --- a/tests/qemu-iotests/081 +++ b/tests/qemu-iotests/081 @@ -134,15 +134,28 @@ run_qemu -drive "file=3D$TEST_DIR/2.raw,format=3D$I= MGFMT,if=3Dnone,id=3Ddrive2" <