qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Kevin Wolf <kwolf@redhat.com>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf <kwolf@redhat.com>, markmc@redhat.com, avi@redhat.com
Subject: [Qemu-devel] [PATCH] qcow2/virtio corruption: Don't allocate the same cluster twice
Date: Wed,  6 May 2009 18:39:10 +0200	[thread overview]
Message-ID: <1241627950-22195-1-git-send-email-kwolf@redhat.com> (raw)

A write on a qcow2 image that results in the allocation of a new cluster can be
divided into two parts: There is a part which happens before the actual data is
written, this is about allocating clusters in the image file. The second part
happens in the AIO callback handler and is about making L2 entries for the
newly allocated clusters.

When two AIO requests happen to touch the same free cluster, there is a chance
that the second request still sees the cluster as free because the callback of
the first request has not yet run. This means that it reserves another cluster
for the same virtual hard disk offset and hooks it up in its own callback,
overwriting what the first callback has done. Long story cut short: Bad Things
happen.

This patch maintains a list of in-flight requests that have allocated new
clusters. A second request touching the same cluster doesn't find an entry yet
in the L2 table, but can look it up in the list now. The second request is
limited so that it either doesn't touch the allocation of the first request
(so it can have a non-overlapping allocation) or that it doesn't exceed the
end of the allocation of the first request (so it can reuse this allocation
and doesn't need to do anything itself).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block-qcow2.c |   40 ++++++++++++++++++++++++++++++++++++++++
 1 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/block-qcow2.c b/block-qcow2.c
index 1f33125..d78d1e7 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -140,6 +140,7 @@ typedef struct BDRVQcowState {
     uint8_t *cluster_cache;
     uint8_t *cluster_data;
     uint64_t cluster_cache_offset;
+    LIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs;
 
     uint64_t *refcount_table;
     uint64_t refcount_table_offset;
@@ -351,6 +352,8 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
     if (refcount_init(bs) < 0)
         goto fail;
 
+    LIST_INIT(&s->cluster_allocs);
+
     /* read qcow2 extensions */
     if (header.backing_file_offset)
         ext_end = header.backing_file_offset;
@@ -953,9 +956,11 @@ static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
 typedef struct QCowL2Meta
 {
     uint64_t offset;
+    uint64_t cluster_offset;
     int n_start;
     int nb_available;
     int nb_clusters;
+    LIST_ENTRY(QCowL2Meta) next;
 } QCowL2Meta;
 
 static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
@@ -1007,6 +1012,9 @@ static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
     for (i = 0; i < j; i++)
         free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1);
 
+    /* Take the request off the list of running requests */
+    LIST_REMOVE(m, next);
+
     ret = 0;
 err:
     qemu_free(old_cluster);
@@ -1035,6 +1043,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
     int l2_index, ret;
     uint64_t l2_offset, *l2_table, cluster_offset;
     int nb_clusters, i = 0;
+    QCowL2Meta *old_alloc;
 
     ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
     if (ret == 0)
@@ -1083,6 +1092,36 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
     }
     nb_clusters = i;
 
+    /*
+     * Check if there already is an AIO write request in flight which allocates
+     * the same cluster. In this case take the cluster_offset which was
+     * allocated for the previous write.
+     */
+    LIST_FOREACH(old_alloc, &s->cluster_allocs, next) {
+
+        uint64_t end_offset = offset + nb_clusters * s->cluster_size;
+        uint64_t old_offset = old_alloc->offset;
+        uint64_t old_end_offset = old_alloc->offset +
+            old_alloc->nb_clusters * s->cluster_size;
+
+        if (end_offset < old_offset || offset > old_end_offset) {
+            /* No intersection */
+        } else if (offset < old_offset) {
+            /* Stop at the start of a running allocation */
+            nb_clusters = (old_offset - offset) >> s->cluster_bits;
+        } else {
+            /* Reuse the previously allocated clusters */
+            if (end_offset > old_end_offset) {
+                nb_clusters = (old_end_offset - offset) >> s->cluster_bits;
+            }
+            cluster_offset = old_alloc->cluster_offset + (offset - old_offset);
+            m->nb_clusters = 0;
+            goto out;
+        }
+    }
+
+    LIST_INSERT_HEAD(&s->cluster_allocs, m, next);
+
     /* allocate a new cluster */
 
     cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
@@ -1091,6 +1130,7 @@ static uint64_t alloc_cluster_offset(BlockDriverState *bs,
     m->offset = offset;
     m->n_start = n_start;
     m->nb_clusters = nb_clusters;
+    m->cluster_offset = cluster_offset;
 
 out:
     m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
-- 
1.6.0.6

             reply	other threads:[~2009-05-06 16:40 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-05-06 16:39 Kevin Wolf [this message]
2009-05-06 16:54 ` [Qemu-devel] Re: [PATCH] qcow2/virtio corruption: Don't allocate the same cluster twice Avi Kivity
2009-05-06 17:03   ` Kevin Wolf
2009-05-06 17:08     ` Avi Kivity
2009-05-06 17:52       ` Kevin Wolf
2009-05-06 18:31         ` Avi Kivity
2009-05-07  7:32         ` Gleb Natapov
2009-05-07  7:54           ` Avi Kivity
2009-05-07  8:01           ` Kevin Wolf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1241627950-22195-1-git-send-email-kwolf@redhat.com \
    --to=kwolf@redhat.com \
    --cc=avi@redhat.com \
    --cc=markmc@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).