From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:38049) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1WNSoe-0001zB-0D for qemu-devel@nongnu.org; Tue, 11 Mar 2014 15:58:25 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1WNSoY-0003b0-Q4 for qemu-devel@nongnu.org; Tue, 11 Mar 2014 15:58:19 -0400 Received: from mx1.redhat.com ([209.132.183.28]:8416) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1WNSoY-0003ar-Hx for qemu-devel@nongnu.org; Tue, 11 Mar 2014 15:58:14 -0400 Message-ID: <531F6ACE.7030900@redhat.com> Date: Tue, 11 Mar 2014 20:58:06 +0100 From: Max Reitz MIME-Version: 1.0 References: <1394555780-27665-1-git-send-email-benoit.canet@irqsave.net> <1394555780-27665-2-git-send-email-benoit.canet@irqsave.net> In-Reply-To: <1394555780-27665-2-git-send-email-benoit.canet@irqsave.net> Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: quoted-printable Subject: Re: [Qemu-devel] [PATCH 1/3] quorum: Add the rewrite-corrupted parameter to quorum. List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: =?ISO-8859-1?Q?Beno=EEt_Canet?= , qemu-devel@nongnu.org Cc: kwolf@redhat.com, Benoit Canet , stefanha@redhat.com On 11.03.2014 17:36, Beno=EEt Canet wrote: > On read operations when this parameter is set and some replicas are cor= rupted > while quorum can be reached quorum will proceed to rewrite the correct = version > of the data to fix the corrupted replicas. > > This will shine with SSD where the FTL will remap the same block at ano= ther > place on rewrite. > > Signed-off-by: Benoit Canet > --- > block/quorum.c | 97 +++++++++++++++++++++++++++++++++++++= ++++++--- > qapi-schema.json | 5 ++- > tests/qemu-iotests/081 | 14 +++++++ > tests/qemu-iotests/081.out | 11 +++++- > 4 files changed, 119 insertions(+), 8 deletions(-) > > diff --git a/block/quorum.c b/block/quorum.c > index bd997b7..af4fd3c 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -22,6 +22,7 @@ > =20 > #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" > #define QUORUM_OPT_BLKVERIFY "blkverify" > +#define QUORUM_OPT_REWRITE "rewrite-corrupted" > =20 > /* This union holds a vote hash value */ > typedef union QuorumVoteValue { > @@ -69,6 +70,9 @@ typedef struct BDRVQuorumState { > * It is useful to debug other block drive= rs by > * comparing them with a reference one. > */ > + bool rewrite_corrupted;/* true if the driver must rewrite-on-read = corrupted > + * block if Quorum is reached. > + */ > } BDRVQuorumState; > =20 > typedef struct QuorumAIOCB QuorumAIOCB; > @@ -104,13 +108,17 @@ struct QuorumAIOCB { > int count; /* number of completed AIOCB */ > int success_count; /* number of successfully completed A= IOCB */ > =20 > + int rewrite_count; /* number of replica to rewrite: count= down to > + * zero once writes are fired > + */ > + > QuorumVotes votes; > =20 > bool is_read; > int vote_ret; > }; > =20 > -static void quorum_vote(QuorumAIOCB *acb); > +static bool quorum_vote(QuorumAIOCB *acb); > =20 > static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) > { > @@ -182,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState = *s, > acb->qcrs =3D g_new0(QuorumChildRequest, s->num_children); > acb->count =3D 0; > acb->success_count =3D 0; > + acb->rewrite_count =3D 0; > acb->votes.compare =3D quorum_sha256_compare; > QLIST_INIT(&acb->votes.vote_list); > acb->is_read =3D false; > @@ -241,11 +250,27 @@ static bool quorum_has_too_much_io_failed(QuorumA= IOCB *acb) > return false; > } > =20 > +static void quorum_rewrite_aio_cb(void *opaque, int ret) > +{ > + QuorumAIOCB *acb =3D opaque; > + > + /* one less rewrite to do */ > + acb->rewrite_count--; > + > + /* wait until all rewrite callback have completed */ *callbacks > + if (acb->rewrite_count) { > + return; > + } > + > + quorum_aio_finalize(acb); > +} > + > static void quorum_aio_cb(void *opaque, int ret) > { > QuorumChildRequest *sacb =3D opaque; > QuorumAIOCB *acb =3D sacb->parent; > BDRVQuorumState *s =3D acb->common.bs->opaque; > + bool rewrite =3D false; > =20 > sacb->ret =3D ret; > acb->count++; > @@ -262,12 +287,15 @@ static void quorum_aio_cb(void *opaque, int ret) > =20 > /* Do the vote on read */ > if (acb->is_read) { > - quorum_vote(acb); > + rewrite =3D quorum_vote(acb); > } else { > quorum_has_too_much_io_failed(acb); > } > =20 > - quorum_aio_finalize(acb); > + /* if no rewrite is done the code will finish right away */ > + if (!rewrite) { > + quorum_aio_finalize(acb); > + } > } > =20 > static void quorum_report_bad_versions(BDRVQuorumState *s, > @@ -287,6 +315,43 @@ static void quorum_report_bad_versions(BDRVQuorumS= tate *s, > } > } > =20 > +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOC= B *acb, > + QuorumVoteValue *value) > +{ > + QuorumVoteVersion *version; > + QuorumVoteItem *item; > + int count =3D 0; > + > + /* first count the number of bad versions: done first to avoid con= curency *concurrency > + * issues. > + */ > + QLIST_FOREACH(version, &acb->votes.vote_list, next) { > + if (acb->votes.compare(&version->value, value)) { > + continue; > + } > + QLIST_FOREACH(item, &version->items, next) { > + count++; > + } > + } > + > + /* quorum_rewrite_aio_cb will count down this to zero */ > + acb->rewrite_count =3D count; > + > + /* now fire the correcting rewrites */ > + QLIST_FOREACH(version, &acb->votes.vote_list, next) { > + if (acb->votes.compare(&version->value, value)) { > + continue; > + } > + QLIST_FOREACH(item, &version->items, next) { > + bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->= qiov, > + acb->nb_sectors, quorum_rewrite_aio_cb, ac= b); > + } > + } > + > + /* return true if any rewrite is done else false */ > + return count; > +} > + > static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source= ) > { > int i; > @@ -477,16 +542,17 @@ static int quorum_vote_error(QuorumAIOCB *acb) > return ret; > } > =20 > -static void quorum_vote(QuorumAIOCB *acb) > +static bool quorum_vote(QuorumAIOCB *acb) > { > bool quorum =3D true; > + bool rewrite =3D false; > int i, j, ret; > QuorumVoteValue hash; > BDRVQuorumState *s =3D acb->common.bs->opaque; > QuorumVoteVersion *winner; > =20 > if (quorum_has_too_much_io_failed(acb)) { > - return; > + return false;; Two semicolons here. > } > =20 > /* get the index of the first successful read */ > @@ -514,7 +580,7 @@ static void quorum_vote(QuorumAIOCB *acb) > /* Every successful read agrees */ > if (quorum) { > quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); > - return; > + return false;; And here. > } > =20 > /* compute hashes for each successful read, also store indexes */ > @@ -547,9 +613,15 @@ static void quorum_vote(QuorumAIOCB *acb) > /* some versions are bad print them */ > quorum_report_bad_versions(s, acb, &winner->value); > =20 > + /* corruption correction is actived */ I'd vote for "enabled" rather than "actived". But I like "corruption=20 correction". :-) > + if (s->rewrite_corrupted) { > + rewrite =3D quorum_rewrite_bad_versions(s, acb, &winner->value= ); > + } > + > free_exit: > /* free lists */ > quorum_free_vote_list(&acb->votes); > + return rewrite; > } > =20 > static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > @@ -709,6 +781,11 @@ static QemuOptsList quorum_runtime_opts =3D { > .type =3D QEMU_OPT_BOOL, > .help =3D "Trigger block verify mode if set", > }, > + { > + .name =3D QUORUM_OPT_REWRITE, > + .type =3D QEMU_OPT_BOOL, > + .help =3D "Rewrite corrupted block on read quorum", > + }, > { /* end of list */ } > }, > }; > @@ -770,6 +847,14 @@ static int quorum_open(BlockDriverState *bs, QDict= *options, int flags, > "and using two files with vote_threshold=3D2\n"); > } > =20 > + s->rewrite_corrupted =3D qemu_opt_get_bool(opts, QUORUM_OPT_REWRIT= E, false); > + if (s->rewrite_corrupted && s->is_blkverify) { > + error_setg(&local_err, > + "rewrite-corrupted=3Don cannot be used with blkveri= fy=3Don"); > + ret =3D -EINVAL; > + goto exit; > + } > + > /* allocate the children BlockDriverState array */ > s->bs =3D g_new0(BlockDriverState *, s->num_children); > opened =3D g_new0(bool, s->num_children); > diff --git a/qapi-schema.json b/qapi-schema.json > index 6476d4a..f5a8767 100644 > --- a/qapi-schema.json > +++ b/qapi-schema.json > @@ -4542,12 +4542,15 @@ > # > # @vote-threshold: the vote limit under which a read will fail > # > +# @rewrite-corrupted: #optional rewrite corrupted data when quorum is = reached > +# (Since 2.1) > +# > # Since: 2.0 > ## > { 'type': 'BlockdevOptionsQuorum', > 'data': { '*blkverify': 'bool', > 'children': [ 'BlockdevRef' ], > - 'vote-threshold': 'int' } } > + 'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } } > =20 > ## > # @BlockdevOptions > diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081 > index b512d00..bb211d6 100755 > --- a/tests/qemu-iotests/081 > +++ b/tests/qemu-iotests/081 > @@ -95,6 +95,18 @@ echo "=3D=3D checking quorum correction =3D=3D" > $QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qem= u_io > =20 > echo > +echo "=3D=3D using quorum rewrite corrupted mode =3D=3D" > + > +quorum=3D"$quorum,file.rewrite-corrupted=3Don" > + > +$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu= _io > + > +echo > +echo "=3D=3D checking that quorum has corrected the corrupted file =3D= =3D" > + > +$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io > + I'd prefer it if you could put these new test cases below the "checking=20 mixed reference/option specification" test, just because I'd like that=20 test to output the quorum warning - especially there is no other point=20 in the test where we can see that QMP message. But that's up to you and if you at least fix the double semicolons: Reviewed-by: Max Reitz > +echo > echo "=3D=3D checking mixed reference/option specification =3D=3D" > =20 > run_qemu -drive "file=3D$TEST_DIR/2.raw,format=3D$IMGFMT,if=3Dnone,id= =3Ddrive2" < @@ -137,6 +149,8 @@ echo > echo "=3D=3D breaking quorum =3D=3D" > =20 > $QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_= io > +$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_i= o > + > echo > echo "=3D=3D checking that quorum is broken =3D=3D" > =20 > diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out > index 84aeb0c..2d8b290 100644 > --- a/tests/qemu-iotests/081.out > +++ b/tests/qemu-iotests/081.out > @@ -25,12 +25,19 @@ wrote 10485760/10485760 bytes at offset 0 > read 10485760/10485760 bytes at offset 0 > 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > =20 > +=3D=3D using quorum rewrite corrupted mode =3D=3D > +read 10485760/10485760 bytes at offset 0 > +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > + > +=3D=3D checking that quorum has corrected the corrupted file =3D=3D > +read 10485760/10485760 bytes at offset 0 > +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > + > =3D=3D checking mixed reference/option specification =3D=3D > Testing: -drive file=3DTEST_DIR/2.IMGFMT,format=3DIMGFMT,if=3Dnone,id= =3Ddrive2 > QMP_VERSION > {"return": {}} > {"return": {}} > -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "ev= ent": "QUORUM_REPORT_BAD", "data": {"node-name": "", "sectors-count": 204= 80, "sector-num": 0}} > read 10485760/10485760 bytes at offset 0 > 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > {"return": ""} > @@ -43,6 +50,8 @@ read 10485760/10485760 bytes at offset 0 > =3D=3D breaking quorum =3D=3D > wrote 10485760/10485760 bytes at offset 0 > 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > +wrote 10485760/10485760 bytes at offset 0 > +10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > =20 > =3D=3D checking that quorum is broken =3D=3D > qemu-io: can't open device (null): Could not read image for determini= ng its format: Input/output error