From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43) id 1M1kAK-0006JM-Fg for qemu-devel@nongnu.org; Wed, 06 May 2009 12:40:16 -0400 Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43) id 1M1kAH-0006Ir-3W for qemu-devel@nongnu.org; Wed, 06 May 2009 12:40:16 -0400 Received: from [199.232.76.173] (port=50829 helo=monty-python.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1M1kAG-0006In-KQ for qemu-devel@nongnu.org; Wed, 06 May 2009 12:40:12 -0400 Received: from mx2.redhat.com ([66.187.237.31]:39251) by monty-python.gnu.org with esmtp (Exim 4.60) (envelope-from ) id 1M1kAG-0000ZV-16 for qemu-devel@nongnu.org; Wed, 06 May 2009 12:40:12 -0400 From: Kevin Wolf Date: Wed, 6 May 2009 18:39:10 +0200 Message-Id: <1241627950-22195-1-git-send-email-kwolf@redhat.com> Subject: [Qemu-devel] [PATCH] qcow2/virtio corruption: Don't allocate the same cluster twice List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: Kevin Wolf , markmc@redhat.com, avi@redhat.com A write on a qcow2 image that results in the allocation of a new cluster can be divided into two parts: There is a part which happens before the actual data is written, this is about allocating clusters in the image file. The second part happens in the AIO callback handler and is about making L2 entries for the newly allocated clusters. When two AIO requests happen to touch the same free cluster, there is a chance that the second request still sees the cluster as free because the callback of the first request has not yet run. This means that it reserves another cluster for the same virtual hard disk offset and hooks it up in its own callback, overwriting what the first callback has done. Long story cut short: Bad Things happen. This patch maintains a list of in-flight requests that have allocated new clusters. A second request touching the same cluster doesn't find an entry yet in the L2 table, but can look it up in the list now. The second request is limited so that it either doesn't touch the allocation of the first request (so it can have a non-overlapping allocation) or that it doesn't exceed the end of the allocation of the first request (so it can reuse this allocation and doesn't need to do anything itself). Signed-off-by: Kevin Wolf --- block-qcow2.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 files changed, 40 insertions(+), 0 deletions(-) diff --git a/block-qcow2.c b/block-qcow2.c index 1f33125..d78d1e7 100644 --- a/block-qcow2.c +++ b/block-qcow2.c @@ -140,6 +140,7 @@ typedef struct BDRVQcowState { uint8_t *cluster_cache; uint8_t *cluster_data; uint64_t cluster_cache_offset; + LIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; uint64_t *refcount_table; uint64_t refcount_table_offset; @@ -351,6 +352,8 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) if (refcount_init(bs) < 0) goto fail; + LIST_INIT(&s->cluster_allocs); + /* read qcow2 extensions */ if (header.backing_file_offset) ext_end = header.backing_file_offset; @@ -953,9 +956,11 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs, typedef struct QCowL2Meta { uint64_t offset; + uint64_t cluster_offset; int n_start; int nb_available; int nb_clusters; + LIST_ENTRY(QCowL2Meta) next; } QCowL2Meta; static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset, @@ -1007,6 +1012,9 @@ static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset, for (i = 0; i < j; i++) free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1); + /* Take the request off the list of running requests */ + LIST_REMOVE(m, next); + ret = 0; err: qemu_free(old_cluster); @@ -1035,6 +1043,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, int l2_index, ret; uint64_t l2_offset, *l2_table, cluster_offset; int nb_clusters, i = 0; + QCowL2Meta *old_alloc; ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index); if (ret == 0) @@ -1083,6 +1092,36 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, } nb_clusters = i; + /* + * Check if there already is an AIO write request in flight which allocates + * the same cluster. In this case take the cluster_offset which was + * allocated for the previous write. + */ + LIST_FOREACH(old_alloc, &s->cluster_allocs, next) { + + uint64_t end_offset = offset + nb_clusters * s->cluster_size; + uint64_t old_offset = old_alloc->offset; + uint64_t old_end_offset = old_alloc->offset + + old_alloc->nb_clusters * s->cluster_size; + + if (end_offset < old_offset || offset > old_end_offset) { + /* No intersection */ + } else if (offset < old_offset) { + /* Stop at the start of a running allocation */ + nb_clusters = (old_offset - offset) >> s->cluster_bits; + } else { + /* Reuse the previously allocated clusters */ + if (end_offset > old_end_offset) { + nb_clusters = (old_end_offset - offset) >> s->cluster_bits; + } + cluster_offset = old_alloc->cluster_offset + (offset - old_offset); + m->nb_clusters = 0; + goto out; + } + } + + LIST_INSERT_HEAD(&s->cluster_allocs, m, next); + /* allocate a new cluster */ cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size); @@ -1091,6 +1130,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs, m->offset = offset; m->n_start = n_start; m->nb_clusters = nb_clusters; + m->cluster_offset = cluster_offset; out: m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end); -- 1.6.0.6