From: Kevin Wolf <kwolf@redhat.com>
To: anthony@codemonkey.ws
Cc: kwolf@redhat.com, qemu-devel@nongnu.org
Subject: [Qemu-devel] [PATCH 35/38] qed: Consistency check support
Date: Fri, 17 Dec 2010 18:44:50 +0100 [thread overview]
Message-ID: <1292607893-13461-36-git-send-email-kwolf@redhat.com> (raw)
In-Reply-To: <1292607893-13461-1-git-send-email-kwolf@redhat.com>
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
This patch adds support for the qemu-img check command. It also
introduces a dirty bit in the qed header to mark modified images as
needing a check. This bit is cleared when the image file is closed
cleanly.
If an image file is opened and it has the dirty bit set, a consistency
check will run and try to fix corrupted table offsets. These
corruptions may occur if there is power loss while an allocating write
is performed. Once the image is fixed it opens as normal again.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/qed-check.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++++
block/qed.c | 125 +++++++++++++++++++++++++++++++-
block/qed.h | 4 +
3 files changed, 336 insertions(+), 3 deletions(-)
create mode 100644 block/qed-check.c
diff --git a/block/qed-check.c b/block/qed-check.c
new file mode 100644
index 0000000..4600932
--- /dev/null
+++ b/block/qed-check.c
@@ -0,0 +1,210 @@
+/*
+ * QEMU Enhanced Disk Format Consistency Check
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qed.h"
+
+typedef struct {
+ BDRVQEDState *s;
+ BdrvCheckResult *result;
+ bool fix; /* whether to fix invalid offsets */
+
+ size_t nclusters;
+ uint32_t *used_clusters; /* referenced cluster bitmap */
+
+ QEDRequest request;
+} QEDCheck;
+
+static bool qed_test_bit(uint32_t *bitmap, uint64_t n) {
+ return !!(bitmap[n / 32] & (1 << (n % 32)));
+}
+
+static void qed_set_bit(uint32_t *bitmap, uint64_t n) {
+ bitmap[n / 32] |= 1 << (n % 32);
+}
+
+/**
+ * Set bitmap bits for clusters
+ *
+ * @check: Check structure
+ * @offset: Starting offset in bytes
+ * @n: Number of clusters
+ */
+static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset,
+ unsigned int n)
+{
+ uint64_t cluster = qed_bytes_to_clusters(check->s, offset);
+ unsigned int corruptions = 0;
+
+ while (n-- != 0) {
+ /* Clusters should only be referenced once */
+ if (qed_test_bit(check->used_clusters, cluster)) {
+ corruptions++;
+ }
+
+ qed_set_bit(check->used_clusters, cluster);
+ cluster++;
+ }
+
+ check->result->corruptions += corruptions;
+ return corruptions == 0;
+}
+
+/**
+ * Check an L2 table
+ *
+ * @ret: Number of invalid cluster offsets
+ */
+static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
+{
+ BDRVQEDState *s = check->s;
+ unsigned int i, num_invalid = 0;
+
+ for (i = 0; i < s->table_nelems; i++) {
+ uint64_t offset = table->offsets[i];
+
+ if (!offset) {
+ continue;
+ }
+
+ /* Detect invalid cluster offset */
+ if (!qed_check_cluster_offset(s, offset)) {
+ if (check->fix) {
+ table->offsets[i] = 0;
+ } else {
+ check->result->corruptions++;
+ }
+
+ num_invalid++;
+ continue;
+ }
+
+ qed_set_used_clusters(check, offset, 1);
+ }
+
+ return num_invalid;
+}
+
+/**
+ * Descend tables and check each cluster is referenced once only
+ */
+static int qed_check_l1_table(QEDCheck *check, QEDTable *table)
+{
+ BDRVQEDState *s = check->s;
+ unsigned int i, num_invalid_l1 = 0;
+ int ret, last_error = 0;
+
+ /* Mark L1 table clusters used */
+ qed_set_used_clusters(check, s->header.l1_table_offset,
+ s->header.table_size);
+
+ for (i = 0; i < s->table_nelems; i++) {
+ unsigned int num_invalid_l2;
+ uint64_t offset = table->offsets[i];
+
+ if (!offset) {
+ continue;
+ }
+
+ /* Detect invalid L2 offset */
+ if (!qed_check_table_offset(s, offset)) {
+ /* Clear invalid offset */
+ if (check->fix) {
+ table->offsets[i] = 0;
+ } else {
+ check->result->corruptions++;
+ }
+
+ num_invalid_l1++;
+ continue;
+ }
+
+ if (!qed_set_used_clusters(check, offset, s->header.table_size)) {
+ continue; /* skip an invalid table */
+ }
+
+ ret = qed_read_l2_table_sync(s, &check->request, offset);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ continue;
+ }
+
+ num_invalid_l2 = qed_check_l2_table(check,
+ check->request.l2_table->table);
+
+ /* Write out fixed L2 table */
+ if (num_invalid_l2 > 0 && check->fix) {
+ ret = qed_write_l2_table_sync(s, &check->request, 0,
+ s->table_nelems, false);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ continue;
+ }
+ }
+ }
+
+ /* Drop reference to final table */
+ qed_unref_l2_cache_entry(check->request.l2_table);
+ check->request.l2_table = NULL;
+
+ /* Write out fixed L1 table */
+ if (num_invalid_l1 > 0 && check->fix) {
+ ret = qed_write_l1_table_sync(s, 0, s->table_nelems);
+ if (ret) {
+ check->result->check_errors++;
+ last_error = ret;
+ }
+ }
+
+ return last_error;
+}
+
+/**
+ * Check for unreferenced (leaked) clusters
+ */
+static void qed_check_for_leaks(QEDCheck *check)
+{
+ BDRVQEDState *s = check->s;
+ size_t i;
+
+ for (i = s->header.header_size; i < check->nclusters; i++) {
+ if (!qed_test_bit(check->used_clusters, i)) {
+ check->result->leaks++;
+ }
+ }
+}
+
+int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
+{
+ QEDCheck check = {
+ .s = s,
+ .result = result,
+ .nclusters = qed_bytes_to_clusters(s, s->file_size),
+ .request = { .l2_table = NULL },
+ .fix = fix,
+ };
+ int ret;
+
+ check.used_clusters = qemu_mallocz(((check.nclusters + 31) / 32) *
+ sizeof(check.used_clusters[0]));
+
+ ret = qed_check_l1_table(&check, s->l1_table);
+ if (ret == 0) {
+ /* Only check for leaks if entire image was scanned successfully */
+ qed_check_for_leaks(&check);
+ }
+
+ qemu_free(check.used_clusters);
+ return ret;
+}
diff --git a/block/qed.c b/block/qed.c
index 8e65d18..085c4f2 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -99,6 +99,81 @@ static int qed_write_header_sync(BDRVQEDState *s)
return 0;
}
+typedef struct {
+ GenericCB gencb;
+ BDRVQEDState *s;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ int nsectors;
+ uint8_t *buf;
+} QEDWriteHeaderCB;
+
+static void qed_write_header_cb(void *opaque, int ret)
+{
+ QEDWriteHeaderCB *write_header_cb = opaque;
+
+ qemu_vfree(write_header_cb->buf);
+ gencb_complete(write_header_cb, ret);
+}
+
+static void qed_write_header_read_cb(void *opaque, int ret)
+{
+ QEDWriteHeaderCB *write_header_cb = opaque;
+ BDRVQEDState *s = write_header_cb->s;
+ BlockDriverAIOCB *acb;
+
+ if (ret) {
+ qed_write_header_cb(write_header_cb, ret);
+ return;
+ }
+
+ /* Update header */
+ qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);
+
+ acb = bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
+ write_header_cb->nsectors, qed_write_header_cb,
+ write_header_cb);
+ if (!acb) {
+ qed_write_header_cb(write_header_cb, -EIO);
+ }
+}
+
+/**
+ * Update header in-place (does not rewrite backing filename or other strings)
+ *
+ * This function only updates known header fields in-place and does not affect
+ * extra data after the QED header.
+ */
+static void qed_write_header(BDRVQEDState *s, BlockDriverCompletionFunc cb,
+ void *opaque)
+{
+ /* We must write full sectors for O_DIRECT but cannot necessarily generate
+ * the data following the header if an unrecognized compat feature is
+ * active. Therefore, first read the sectors containing the header, update
+ * them, and write back.
+ */
+
+ BlockDriverAIOCB *acb;
+ int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) /
+ BDRV_SECTOR_SIZE;
+ size_t len = nsectors * BDRV_SECTOR_SIZE;
+ QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb),
+ cb, opaque);
+
+ write_header_cb->s = s;
+ write_header_cb->nsectors = nsectors;
+ write_header_cb->buf = qemu_blockalign(s->bs, len);
+ write_header_cb->iov.iov_base = write_header_cb->buf;
+ write_header_cb->iov.iov_len = len;
+ qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1);
+
+ acb = bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors,
+ qed_write_header_read_cb, write_header_cb);
+ if (!acb) {
+ qed_write_header_cb(write_header_cb, -EIO);
+ }
+}
+
static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size)
{
uint64_t table_entries;
@@ -310,6 +385,32 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
ret = qed_read_l1_table_sync(s);
if (ret) {
+ goto out;
+ }
+
+ /* If image was not closed cleanly, check consistency */
+ if (s->header.features & QED_F_NEED_CHECK) {
+ /* Read-only images cannot be fixed. There is no risk of corruption
+ * since write operations are not possible. Therefore, allow
+ * potentially inconsistent images to be opened read-only. This can
+ * aid data recovery from an otherwise inconsistent image.
+ */
+ if (!bdrv_is_read_only(bs->file)) {
+ BdrvCheckResult result = {0};
+
+ ret = qed_check(s, &result, true);
+ if (!ret && !result.corruptions && !result.check_errors) {
+ /* Ensure fixes reach storage before clearing check bit */
+ bdrv_flush(s->bs);
+
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header_sync(s);
+ }
+ }
+ }
+
+out:
+ if (ret) {
qed_free_l2_cache(&s->l2_cache);
qemu_vfree(s->l1_table);
}
@@ -320,6 +421,15 @@ static void bdrv_qed_close(BlockDriverState *bs)
{
BDRVQEDState *s = bs->opaque;
+ /* Ensure writes reach stable storage */
+ bdrv_flush(bs->file);
+
+ /* Clean shutdown, no check required on next open */
+ if (s->header.features & QED_F_NEED_CHECK) {
+ s->header.features &= ~QED_F_NEED_CHECK;
+ qed_write_header_sync(s);
+ }
+
qed_free_l2_cache(&s->l2_cache);
qemu_vfree(s->l1_table);
}
@@ -885,8 +995,15 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
- /* Write new cluster */
- qed_aio_write_prefill(acb, 0);
+ /* Write new cluster if the image is already marked dirty */
+ if (s->header.features & QED_F_NEED_CHECK) {
+ qed_aio_write_prefill(acb, 0);
+ return;
+ }
+
+ /* Mark the image dirty before writing the new cluster */
+ s->header.features |= QED_F_NEED_CHECK;
+ qed_write_header(s, qed_aio_write_prefill, acb);
}
/**
@@ -1172,7 +1289,9 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs,
static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result)
{
- return -ENOTSUP;
+ BDRVQEDState *s = bs->opaque;
+
+ return qed_check(s, result, false);
}
static QEMUOptionParameter qed_create_options[] = {
diff --git a/block/qed.h b/block/qed.h
index 046a410..2925e37 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -50,11 +50,15 @@ enum {
/* The image supports a backing file */
QED_F_BACKING_FILE = 0x01,
+ /* The image needs a consistency check before use */
+ QED_F_NEED_CHECK = 0x02,
+
/* The backing file format must not be probed, treat as raw image */
QED_F_BACKING_FORMAT_NO_PROBE = 0x04,
/* Feature bits must be used when the on-disk format changes */
QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */
+ QED_F_NEED_CHECK |
QED_F_BACKING_FORMAT_NO_PROBE,
QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */
QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */
--
1.7.2.3
next prev parent reply other threads:[~2010-12-17 17:47 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-12-17 17:44 [Qemu-devel] [PULL 00/38] Block patches Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 01/38] blockdev: check dinfo ptr before using Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 02/38] block: Introduce path_has_protocol() function Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 03/38] block: Fix the use of protocols in backing files Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 04/38] Introduce strtosz_suffix() Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 05/38] qemu-img.c: Clean up handling of image size in img_create() Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 06/38] ide: split ide command interpretation off Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 07/38] ide: fix whitespace gap in ide_exec_cmd Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 08/38] ide: Split out BMDMA code from ATA core Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 09/38] ide: move transfer_start after variable modification Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 10/38] ide: add ncq identify data for ahci sata drives Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 11/38] pci: add storage class for sata Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 12/38] pci: add ich9 pci id Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 13/38] ahci: add ahci emulation Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 14/38] config: move ide core and pci to pci.mak Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 15/38] config: add ahci for pci capable machines Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 16/38] ahci: set SATA Mode Select Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 17/38] ide: honor ncq for atapi Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 18/38] qemu-img: Call error_set_progname Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 19/38] qemu-img.c: Re-factor img_create() Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 20/38] Introduce do_snapshot_blkdev() and monitor command to handle it Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 21/38] Prevent creating an image with the same filename as backing file Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 22/38] bdrv_img_create() use proper errno return values Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 23/38] qemu.img.c: Use error_report() instead of own error() implementation Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 24/38] Remove NULL checks for bdrv_new return value Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 25/38] ide: Register vm change state handler once only Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 26/38] block: add discard support Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 27/38] scsi-disk: support WRITE SAME (16) with unmap bit Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 28/38] qemu-io: Add discard command Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 29/38] raw-posix: add discard support Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 30/38] qemu-io: Fix typo in help texts Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 31/38] docs: Add QED image format specification Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 32/38] qed: Add QEMU Enhanced Disk image format Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 33/38] qed: Table, L2 cache, and cluster functions Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 34/38] qed: Read/write support Kevin Wolf
2010-12-17 17:44 ` Kevin Wolf [this message]
2010-12-17 17:44 ` [Qemu-devel] [PATCH 36/38] block/qcow2.c: rename qcow_ functions to qcow2_ Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 37/38] Add proper -errno error return values to qcow2_open() Kevin Wolf
2010-12-17 17:44 ` [Qemu-devel] [PATCH 38/38] docs: Fix missing carets in QED specification Kevin Wolf
2010-12-17 17:52 ` [Qemu-devel] Re: [PULL 00/38] Block patches Anthony Liguori
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1292607893-13461-36-git-send-email-kwolf@redhat.com \
--to=kwolf@redhat.com \
--cc=anthony@codemonkey.ws \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).