qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Kevin Wolf <kwolf@redhat.com>
To: qemu-devel@nongnu.org
Cc: kwolf@redhat.com
Subject: [Qemu-devel] [PULL 19/47] quorum: Add the rewrite-corrupted parameter to quorum
Date: Fri, 27 Jun 2014 21:08:38 +0200	[thread overview]
Message-ID: <1403896146-3063-20-git-send-email-kwolf@redhat.com> (raw)
In-Reply-To: <1403896146-3063-1-git-send-email-kwolf@redhat.com>

From: Benoît Canet <benoit.canet@irqsave.net>

On read operations, when this parameter is set and some replicas are corrupted
but quorum can still be reached, quorum will rewrite the correct version of the
data to fix the corrupted replicas.

This will shine with SSDs, where the FTL will remap the same block to another
place on rewrite.

Signed-off-by: Benoit Canet <benoit@irqsave.net>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/quorum.c             | 97 +++++++++++++++++++++++++++++++++++++++++++---
 qapi/block-core.json       |  5 ++-
 tests/qemu-iotests/081     | 15 ++++++-
 tests/qemu-iotests/081.out | 10 +++++
 4 files changed, 119 insertions(+), 8 deletions(-)

diff --git a/block/quorum.c b/block/quorum.c
index 86802d3..d5ee9c0 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -23,6 +23,7 @@
 
 #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
 #define QUORUM_OPT_BLKVERIFY      "blkverify"
+#define QUORUM_OPT_REWRITE        "rewrite-corrupted"
 
 /* This union holds a vote hash value */
 typedef union QuorumVoteValue {
@@ -70,6 +71,9 @@ typedef struct BDRVQuorumState {
                             * It is useful to debug other block drivers by
                             * comparing them with a reference one.
                             */
+    bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
+                            * block if Quorum is reached.
+                            */
 } BDRVQuorumState;
 
 typedef struct QuorumAIOCB QuorumAIOCB;
@@ -105,13 +109,17 @@ struct QuorumAIOCB {
     int count;                  /* number of completed AIOCB */
     int success_count;          /* number of successfully completed AIOCB */
 
+    int rewrite_count;          /* number of replica to rewrite: count down to
+                                 * zero once writes are fired
+                                 */
+
     QuorumVotes votes;
 
     bool is_read;
     int vote_ret;
 };
 
-static void quorum_vote(QuorumAIOCB *acb);
+static bool quorum_vote(QuorumAIOCB *acb);
 
 static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
 {
@@ -183,6 +191,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
     acb->qcrs = g_new0(QuorumChildRequest, s->num_children);
     acb->count = 0;
     acb->success_count = 0;
+    acb->rewrite_count = 0;
     acb->votes.compare = quorum_sha256_compare;
     QLIST_INIT(&acb->votes.vote_list);
     acb->is_read = false;
@@ -232,11 +241,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb)
     return false;
 }
 
+static void quorum_rewrite_aio_cb(void *opaque, int ret)
+{
+    QuorumAIOCB *acb = opaque;
+
+    /* one less rewrite to do */
+    acb->rewrite_count--;
+
+    /* wait until all rewrite callbacks have completed */
+    if (acb->rewrite_count) {
+        return;
+    }
+
+    quorum_aio_finalize(acb);
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
     QuorumChildRequest *sacb = opaque;
     QuorumAIOCB *acb = sacb->parent;
     BDRVQuorumState *s = acb->common.bs->opaque;
+    bool rewrite = false;
 
     sacb->ret = ret;
     acb->count++;
@@ -253,12 +278,15 @@ static void quorum_aio_cb(void *opaque, int ret)
 
     /* Do the vote on read */
     if (acb->is_read) {
-        quorum_vote(acb);
+        rewrite = quorum_vote(acb);
     } else {
         quorum_has_too_much_io_failed(acb);
     }
 
-    quorum_aio_finalize(acb);
+    /* if no rewrite is done the code will finish right away */
+    if (!rewrite) {
+        quorum_aio_finalize(acb);
+    }
 }
 
 static void quorum_report_bad_versions(BDRVQuorumState *s,
@@ -278,6 +306,43 @@ static void quorum_report_bad_versions(BDRVQuorumState *s,
     }
 }
 
+static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
+                                        QuorumVoteValue *value)
+{
+    QuorumVoteVersion *version;
+    QuorumVoteItem *item;
+    int count = 0;
+
+    /* first count the number of bad versions: done first to avoid concurrency
+     * issues.
+     */
+    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
+        if (acb->votes.compare(&version->value, value)) {
+            continue;
+        }
+        QLIST_FOREACH(item, &version->items, next) {
+            count++;
+        }
+    }
+
+    /* quorum_rewrite_aio_cb will count down this to zero */
+    acb->rewrite_count = count;
+
+    /* now fire the correcting rewrites */
+    QLIST_FOREACH(version, &acb->votes.vote_list, next) {
+        if (acb->votes.compare(&version->value, value)) {
+            continue;
+        }
+        QLIST_FOREACH(item, &version->items, next) {
+            bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov,
+                            acb->nb_sectors, quorum_rewrite_aio_cb, acb);
+        }
+    }
+
+    /* return true if any rewrite is done else false */
+    return count;
+}
+
 static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
 {
     int i;
@@ -468,16 +533,17 @@ static int quorum_vote_error(QuorumAIOCB *acb)
     return ret;
 }
 
-static void quorum_vote(QuorumAIOCB *acb)
+static bool quorum_vote(QuorumAIOCB *acb)
 {
     bool quorum = true;
+    bool rewrite = false;
     int i, j, ret;
     QuorumVoteValue hash;
     BDRVQuorumState *s = acb->common.bs->opaque;
     QuorumVoteVersion *winner;
 
     if (quorum_has_too_much_io_failed(acb)) {
-        return;
+        return false;
     }
 
     /* get the index of the first successful read */
@@ -505,7 +571,7 @@ static void quorum_vote(QuorumAIOCB *acb)
     /* Every successful read agrees */
     if (quorum) {
         quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov);
-        return;
+        return false;
     }
 
     /* compute hashes for each successful read, also store indexes */
@@ -538,9 +604,15 @@ static void quorum_vote(QuorumAIOCB *acb)
     /* some versions are bad print them */
     quorum_report_bad_versions(s, acb, &winner->value);
 
+    /* corruption correction is enabled */
+    if (s->rewrite_corrupted) {
+        rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value);
+    }
+
 free_exit:
     /* free lists */
     quorum_free_vote_list(&acb->votes);
+    return rewrite;
 }
 
 static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
@@ -705,6 +777,11 @@ static QemuOptsList quorum_runtime_opts = {
             .type = QEMU_OPT_BOOL,
             .help = "Trigger block verify mode if set",
         },
+        {
+            .name = QUORUM_OPT_REWRITE,
+            .type = QEMU_OPT_BOOL,
+            .help = "Rewrite corrupted block on read quorum",
+        },
         { /* end of list */ }
     },
 };
@@ -766,6 +843,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
                 "and using two files with vote_threshold=2\n");
     }
 
+    s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
+    if (s->rewrite_corrupted && s->is_blkverify) {
+        error_setg(&local_err,
+                   "rewrite-corrupted=on cannot be used with blkverify=on");
+        ret = -EINVAL;
+        goto exit;
+    }
+
     /* allocate the children BlockDriverState array */
     s->bs = g_new0(BlockDriverState *, s->num_children);
     opened = g_new0(bool, s->num_children);
diff --git a/qapi/block-core.json b/qapi/block-core.json
index af6b436..de31f9f 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1329,12 +1329,15 @@
 #
 # @vote-threshold: the vote limit under which a read will fail
 #
+# @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached
+#                     (Since 2.1)
+#
 # Since: 2.0
 ##
 { 'type': 'BlockdevOptionsQuorum',
   'data': { '*blkverify': 'bool',
             'children': [ 'BlockdevRef' ],
-            'vote-threshold': 'int' } }
+            'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } }
 
 ##
 # @BlockdevOptions
diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081
index b512d00..7ae4be2 100755
--- a/tests/qemu-iotests/081
+++ b/tests/qemu-iotests/081
@@ -134,15 +134,28 @@ run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" <<EOF
 EOF
 
 echo
+echo "== using quorum rewrite corrupted mode =="
+
+quorum="$quorum,file.rewrite-corrupted=on"
+
+$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io
+
+echo
+echo "== checking that quorum has corrected the corrupted file =="
+
+$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io
+
+echo
 echo "== breaking quorum =="
 
 $QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_io
+$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io
+
 echo
 echo "== checking that quorum is broken =="
 
 $QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io
 
-
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out
index 2241cec..073515e 100644
--- a/tests/qemu-iotests/081.out
+++ b/tests/qemu-iotests/081.out
@@ -40,9 +40,19 @@ read 10485760/10485760 bytes at offset 0
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": "DEVICE_TRAY_MOVED", "data": {"device": "floppy0", "tray-open": true}}
 
 
+== using quorum rewrite corrupted mode ==
+read 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+== checking that quorum has corrected the corrupted file ==
+read 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
 == breaking quorum ==
 wrote 10485760/10485760 bytes at offset 0
 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 10485760/10485760 bytes at offset 0
+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
 == checking that quorum is broken ==
 qemu-io: can't open: Could not read image for determining its format: Input/output error
-- 
1.8.3.1

  parent reply	other threads:[~2014-06-27 19:09 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-06-27 19:08 [Qemu-devel] [PULL 00/47] Block patches for 2.1.0-rc0 Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 01/47] blockjob: Add block_job_yield() Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 02/47] mirror: Go through ready -> complete process for 0 len image Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 03/47] qemu-iotests: Test BLOCK_JOB_READY event for 0Kb image active commit Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 04/47] qemu-iotests: Test 0-length image for mirror Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 05/47] block/nfs: fix url parameter checking Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 06/47] block/nfs: add knob to set readahead Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 07/47] block: Create bdrv_fill_options() Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 08/47] block: Move bdrv_fill_options() call to bdrv_open() Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 09/47] block: Move json: parsing to bdrv_fill_options() Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 10/47] block: Always pass driver name through options QDict Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 11/47] block: Use common driver selection code for bdrv_open_file() Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 12/47] block: Inline bdrv_file_open() Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 13/47] block: Remove second bdrv_open() recursion Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 14/47] block: Catch backing files assigned to non-COW drivers Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 15/47] block: Remove a special case for protocols Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 16/47] qemu_opts_append: Play nicely with QemuOptsList's head Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 17/47] block: check for RESIZE blocker in the QMP command, not bdrv_truncate() Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 18/47] block: add qemu-iotest for resize base during live commit Kevin Wolf
2014-06-27 19:08 ` Kevin Wolf [this message]
2014-06-27 19:08 ` [Qemu-devel] [PULL 20/47] block: Add node-name argument to drive-mirror Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 21/47] virtio-blk: Move VirtIOBlockReq to header Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 22/47] virtio-blk: Convert VirtIOBlockReq.elem to pointer Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 23/47] virtio-blk: Drop bounce buffer from dataplane code Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 24/47] virtio-blk: Drop VirtIOBlockRequest.read Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 25/47] virtio-blk: Replace VirtIOBlockRequest with VirtIOBlockReq Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 26/47] virtio-blk: Use VirtIOBlockReq.in to drop VirtIOBlockReq.inhdr Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 27/47] virtio-blk: Convert VirtIOBlockReq.out to structrue Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 28/47] virtio-blk: Fill in VirtIOBlockReq.out in dataplane code Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 29/47] virtio-blk: Fix and clean up the in_sg and out_sg check Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 30/47] block: make bdrv_query_stats() static Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 31/47] block: acquire AioContext in qmp_query_blockstats() Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 32/47] virtio-blk: Make request completion function virtual Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 33/47] virtio-blk: Export request handling functions to dataplane Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 34/47] virtio-blk: Schedule BH in the right context Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 35/47] virtio-blk: Unify {non-, }dataplane's request handlings Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 36/47] virtio-blk: Rename complete_request_early to complete_request_vring Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 37/47] blockjob: Fix recent BLOCK_JOB_READY regression Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 38/47] blockjob: Fix recent BLOCK_JOB_ERROR regression Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 39/47] block: Add replaces argument to drive-mirror Kevin Wolf
2014-06-27 19:08 ` [Qemu-devel] [PULL 40/47] qemu-iotests: Add TestRepairQuorum to 041 to test drive-mirror node-name mode Kevin Wolf
2014-06-27 19:09 ` [Qemu-devel] [PULL 41/47] block.c: Don't return success for bdrv_append_temp_snapshot() failure Kevin Wolf
2014-06-27 19:09 ` [Qemu-devel] [PULL 42/47] iotests: Allow out-of-tree run Kevin Wolf
2014-06-27 19:09 ` [Qemu-devel] [PULL 43/47] configure: Enable out-of-tree iotests Kevin Wolf
2014-06-27 19:09 ` [Qemu-devel] [PULL 44/47] iotests: Source common.env Kevin Wolf
2014-06-27 19:09 ` [Qemu-devel] [PULL 45/47] iotests: Use $PYTHON for Python scripts Kevin Wolf
2014-06-27 19:09 ` [Qemu-devel] [PULL 46/47] iotests: Drop Python version from 065's Shebang Kevin Wolf
2014-06-27 19:09 ` [Qemu-devel] [PULL 47/47] iotests: Fix 083 for out-of-tree builds Kevin Wolf
2014-06-29 15:15 ` [Qemu-devel] [PULL 00/47] Block patches for 2.1.0-rc0 Peter Maydell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1403896146-3063-20-git-send-email-kwolf@redhat.com \
    --to=kwolf@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).