From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43)
	id 1M1kAK-0006JM-Fg
	for qemu-devel@nongnu.org; Wed, 06 May 2009 12:40:16 -0400
Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43)
	id 1M1kAH-0006Ir-3W
	for qemu-devel@nongnu.org; Wed, 06 May 2009 12:40:16 -0400
Received: from [199.232.76.173] (port=50829 helo=monty-python.gnu.org)
	by lists.gnu.org with esmtp (Exim 4.43) id 1M1kAG-0006In-KQ
	for qemu-devel@nongnu.org; Wed, 06 May 2009 12:40:12 -0400
Received: from mx2.redhat.com ([66.187.237.31]:39251)
	by monty-python.gnu.org with esmtp (Exim 4.60)
	(envelope-from <kwolf@redhat.com>) id 1M1kAG-0000ZV-16
	for qemu-devel@nongnu.org; Wed, 06 May 2009 12:40:12 -0400
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed,  6 May 2009 18:39:10 +0200
Message-Id: <1241627950-22195-1-git-send-email-kwolf@redhat.com>
Subject: [Qemu-devel] [PATCH] qcow2/virtio corruption: Don't allocate the
	same cluster twice
List-Id: qemu-devel.nongnu.org
List-Unsubscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>
List-Archive: <http://lists.gnu.org/pipermail/qemu-devel>
List-Post: <mailto:qemu-devel@nongnu.org>
List-Help: <mailto:qemu-devel-request@nongnu.org?subject=help>
List-Subscribe: <http://lists.nongnu.org/mailman/listinfo/qemu-devel>,
	<mailto:qemu-devel-request@nongnu.org?subject=subscribe>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>, markmc@redhat.com, avi@redhat.com

A write on a qcow2 image that results in the allocation of a new cluster can be
divided into two parts: There is a part which happens before the actual data is
written, this is about allocating clusters in the image file. The second part
happens in the AIO callback handler and is about making L2 entries for the
newly allocated clusters.

When two AIO requests happen to touch the same free cluster, there is a chance
that the second request still sees the cluster as free because the callback of
the first request has not yet run. This means that it reserves another cluster
for the same virtual hard disk offset and hooks it up in its own callback,
overwriting what the first callback has done. Long story cut short: Bad Things
happen.

This patch maintains a list of in-flight requests that have allocated new
clusters. A second request touching the same cluster doesn't find an entry yet
in the L2 table, but can look it up in the list now. The second request is
limited so that it either doesn't touch the allocation of the first request
(so it can have a non-overlapping allocation) or that it doesn't exceed the
end of the allocation of the first request (so it can reuse this allocation
and doesn't need to do anything itself).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block-qcow2.c |   40 ++++++++++++++++++++++++++++++++++++++++
 1 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/block-qcow2.c b/block-qcow2.c
index 1f33125..d78d1e7 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -140,6 +140,7 @@ typedef struct BDRVQcowState {
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
     uint64_t cluster_cache_offset;
+    LIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs;
 
     uint64_t *refcount_table;
     uint64_t refcount_table_offset;
@@ -351,6 +352,8 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
     if (refcount_init(bs) < 0)
         goto fail;
 
+    LIST_INIT(&s->cluster_allocs);
+
     /* read qcow2 extensions */
     if (header.backing_file_offset)
         ext_end = header.backing_file_offset;
@@ -953,9 +956,11 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
 typedef struct QCowL2Meta
 {
     uint64_t offset;
+    uint64_t cluster_offset;
     int n_start;
     int nb_available;
     int nb_clusters;
+    LIST_ENTRY(QCowL2Meta) next;
 } QCowL2Meta;
 
 static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
@@ -1007,6 +1012,9 @@ static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
     for (i = 0; i < j; i++)
         free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1);
 
+    /* Take the request off the list of running requests */
+    LIST_REMOVE(m, next);
+
     ret = 0;
 err:
     qemu_free(old_cluster);
@@ -1035,6 +1043,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
     int l2_index, ret;
     uint64_t l2_offset, *l2_table, cluster_offset;
     int nb_clusters, i = 0;
+    QCowL2Meta *old_alloc;
 
     ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
     if (ret == 0)
@@ -1083,6 +1092,36 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
     }
     nb_clusters = i;
 
+    /*
+     * Check if there already is an AIO write request in flight which allocates
+     * the same cluster. In this case take the cluster_offset which was
+     * allocated for the previous write.
+     */
+    LIST_FOREACH(old_alloc, &s->cluster_allocs, next) {
+
+        uint64_t end_offset = offset + nb_clusters * s->cluster_size;
+        uint64_t old_offset = old_alloc->offset;
+        uint64_t old_end_offset = old_alloc->offset +
+            old_alloc->nb_clusters * s->cluster_size;
+
+        if (end_offset < old_offset || offset > old_end_offset) {
+            /* No intersection */
+        } else if (offset < old_offset) {
+            /* Stop at the start of a running allocation */
+            nb_clusters = (old_offset - offset) >> s->cluster_bits;
+        } else {
+            /* Reuse the previously allocated clusters */
+            if (end_offset > old_end_offset) {
+                nb_clusters = (old_end_offset - offset) >> s->cluster_bits;
+            }
+            cluster_offset = old_alloc->cluster_offset + (offset - old_offset);
+            m->nb_clusters = 0;
+            goto out;
+        }
+    }
+
+    LIST_INSERT_HEAD(&s->cluster_allocs, m, next);
+
     /* allocate a new cluster */
 
     cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
@@ -1091,6 +1130,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
     m->offset = offset;
     m->n_start = n_start;
     m->nb_clusters = nb_clusters;
+    m->cluster_offset = cluster_offset;
 
 out:
     m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
-- 
1.6.0.6