From: Kevin Wolf <kwolf@redhat.com>
To: qemu-devel@nongnu.org
Cc: kwolf@redhat.com
Subject: [Qemu-devel] [RFC PATCH 15/16] qcow2: Cancel COW when overwritten
Date: Tue, 18 Sep 2012 13:40:41 +0200
Message-ID: <1347968442-8860-16-git-send-email-kwolf@redhat.com>
In-Reply-To: <1347968442-8860-1-git-send-email-kwolf@redhat.com>

This is the first part of an optimisation to improve the performance of
sequential cluster allocations.

Typically, write requests aren't aligned to cluster boundaries, so with
sequential allocation every other request has to wait for the COW of the
previous request to complete. We can do better: instead of waiting for
the COW, cancel it and overwrite the affected area with the data of the
second write request.
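
To make the offset arithmetic concrete, here is a small standalone
sketch (not part of the patch) that models what handle_dependencies()
does when a sequential write overwrites the tail COW region of an
in-flight allocation. The 64 KiB cluster size, the host offset and the
request sizes are made-up values for illustration only; the real code
works on QCowL2Meta and BDRVQcowState rather than plain integers.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define BDRV_SECTOR_BITS 9
#define CLUSTER_BITS     16                        /* 64 KiB clusters */
#define CLUSTER_SIZE     (1 << CLUSTER_BITS)
#define CLUSTER_SECTORS  (CLUSTER_SIZE >> BDRV_SECTOR_BITS)

static uint64_t start_of_cluster(uint64_t offset)
{
    return offset & ~(uint64_t)(CLUSTER_SIZE - 1);
}

static uint64_t offset_into_cluster(uint64_t offset)
{
    return offset & (CLUSTER_SIZE - 1);
}

int main(void)
{
    /* First request: 40 KiB of data at guest offset 0, one cluster
     * allocated at host offset 0x50000. Its tail COW region starts
     * 40 KiB into the cluster and covers the remaining 24 KiB. */
    uint64_t old_offset       = 0;
    uint64_t old_alloc_offset = 0x50000;
    uint64_t old_cow_end_off  = 40 * 1024;         /* cow_end.offset */
    uint64_t old_cow_end      = old_offset + old_cow_end_off;

    /* Second, sequential request: 16 KiB at guest offset 40 KiB. */
    uint64_t guest_offset = 40 * 1024;
    uint64_t bytes        = 16 * 1024;

    /* Shorten the old request's tail COW so that it ends where the new
     * data begins (zero sectors here, the writes are contiguous). */
    int old_cow_end_sectors = (guest_offset - old_cow_end) >> BDRV_SECTOR_BITS;

    /* The new data goes into the same host cluster; only the area after
     * it still needs COW, described by the new meta's cow_end. */
    uint64_t subcluster_offset = offset_into_cluster(guest_offset);
    int nb_sectors = (subcluster_offset + bytes) >> BDRV_SECTOR_BITS;
    if (nb_sectors > CLUSTER_SECTORS) {
        nb_sectors = CLUSTER_SECTORS;
    }
    uint64_t host_offset =
        start_of_cluster(old_alloc_offset + old_cow_end_off);

    printf("old cow_end shortened to %d sectors\n", old_cow_end_sectors);
    printf("new request reuses host cluster at 0x%" PRIx64 "\n", host_offset);
    printf("new cow_end: offset %d, %d sectors\n",
           nb_sectors << BDRV_SECTOR_BITS, CLUSTER_SECTORS - nb_sectors);
    return 0;
}

With these numbers the old COW shrinks to zero sectors, the new request
reuses the cluster at host offset 0x50000, and only the last 16 sectors
(8 KiB) of that cluster are left for the new request's own tail COW.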

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c |  127 +++++++++++++++++++++++++++++++++++++++++++------
 block/qcow2.c         |   21 ++++++++
 block/qcow2.h         |   47 ++++++++++++++++++
 3 files changed, 180 insertions(+), 15 deletions(-)
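
The locking added below is easier to see outside of diff context. The
following is a minimal model (also not part of the patch) of the
l2_writeback_lock protocol, with a POSIX rwlock standing in for QEMU's
CoRwlock and a thread standing in for the child coroutine; build with
-pthread. The names and sleeps are illustrative only.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t l2_writeback_lock = PTHREAD_RWLOCK_INITIALIZER;

/* A child request that overwrites part of the parent's COW region. */
static void *child_request(void *arg)
{
    (void)arg;

    /* Taken in handle_dependencies() when the child is set up. */
    pthread_rwlock_rdlock(&l2_writeback_lock);

    printf("child: writing guest data and shortened COW\n");
    usleep(100 * 1000);                  /* stand-in for the actual I/O */

    /* Released once the child's data and COW are on disk. */
    pthread_rwlock_unlock(&l2_writeback_lock);
    return NULL;
}

int main(void)
{
    pthread_t child;

    pthread_create(&child, NULL, child_request, NULL);
    usleep(10 * 1000);                   /* let the child take the lock */

    /* The parent in qcow2_alloc_cluster_link_l2() blocks here until all
     * children have finished, and only then writes the L2 entry. */
    pthread_rwlock_wrlock(&l2_writeback_lock);
    printf("parent: children done, updating L2 entry\n");
    pthread_rwlock_unlock(&l2_writeback_lock);

    pthread_join(child, NULL);
    return 0;
}

The real code has one extra subtlety that a plain rwlock does not
capture: releasing the CoRwlock only schedules a BH to wake the parent,
so the unlock and the re-queuing in qcow2.c happen without an
intervening yield.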

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 440fdbf..ff22992 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -659,6 +659,8 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
     BDRVQcowState *s = bs->opaque;
     int ret;
 
+    r->final = true;
+
     if (r->nb_sectors == 0) {
         return 0;
     }
@@ -689,6 +691,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     int i, j = 0, l2_index, ret;
     uint64_t *old_cluster, *l2_table;
     uint64_t cluster_offset = m->alloc_offset;
+    bool has_wr_lock = false;
 
     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
     assert(m->nb_clusters > 0);
@@ -707,6 +710,16 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     }
 
     /* Update L2 table. */
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_co_rwlock_wrlock(&m->l2_writeback_lock);
+    has_wr_lock = true;
+    qemu_co_mutex_lock(&s->lock);
+
+    if (m->no_l2_update) {
+        ret = 0;
+        goto err;
+    }
+
     if (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS) {
         qcow2_mark_dirty(bs);
     }
@@ -753,6 +766,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 
     ret = 0;
 err:
+    if (has_wr_lock) {
+        qemu_co_rwlock_unlock(&m->l2_writeback_lock);
+    }
     g_free(old_cluster);
     return ret;
  }
@@ -825,7 +841,8 @@ static void kick_l2meta(QCowL2Meta *m)
  * request has completed and updated the L2 table accordingly.
  */
 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
-    uint64_t bytes, unsigned int *nb_clusters)
+    uint64_t *host_offset, uint64_t bytes, unsigned int *nb_clusters,
+    QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
     QCowL2Meta *old_alloc;
@@ -840,22 +857,96 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
         if (end <= old_start || start >= old_end) {
             /* No intersection */
         } else {
+            uint64_t new_bytes;
+            uint64_t old_cow_end;
+
+            /*
+             * Shorten the request to stop at the start of a running
+             * allocation.
+             */
             if (start < old_start) {
-                /* Stop at the start of a running allocation */
-                bytes = old_start - start;
+                new_bytes = old_start - start;
             } else {
-                bytes = 0;
+                new_bytes = 0;
+            }
+
+            if (new_bytes > 0) {
+                bytes = new_bytes;
+                continue;
+            }
+
+            /*
+             * Check if we're just overwriting some COW of the old allocation
+             * that is safe to be replaced by the data of this request.
+             */
+            old_cow_end = old_alloc->offset + old_alloc->cow_end.offset;
+
+            if ((old_end & (s->cluster_size - 1)) == 0
+                && start >= old_cow_end
+                && !old_alloc->cow_end.final)
+            {
+                uint64_t subcluster_offset;
+                int nb_sectors;
+
+                *nb_clusters = 1;
+                subcluster_offset = offset_into_cluster(s, guest_offset);
+                nb_sectors = (subcluster_offset + bytes) >> BDRV_SECTOR_BITS;
+
+                /* Move forward cluster by cluster when overwriting COW areas,
+                 * or we'd have to deal with multiple overlapping requests and
+                 * things would become complicated. */
+                nb_sectors = MIN(s->cluster_sectors, nb_sectors);
+
+                /* Shorten the COW area at the end of the old request */
+                old_alloc->cow_end.nb_sectors =
+                    (guest_offset - old_cow_end) >> BDRV_SECTOR_BITS;
+
+                /* The new data region starts in the same cluster where the COW
+                 * region at the end of the old request starts. */
+                *host_offset = start_of_cluster(s,
+                    old_alloc->alloc_offset + old_alloc->cow_end.offset);
+
+                /* Create new l2meta that doesn't actually allocate new L2
+                 * entries, but describes the new data area so that reads
+                 * access the right cluster */
+                *m = g_malloc0(sizeof(**m));
+                **m = (QCowL2Meta) {
+                    .parent         = old_alloc,
+                    .alloc_offset   = *host_offset,
+                    .offset         = guest_offset & ~(s->cluster_size - 1),
+                    .nb_clusters    = 1,
+                    .nb_available   = nb_sectors,
+                    .no_l2_update   = true,
+
+                    .cow_start = {
+                        .offset     = subcluster_offset,
+                        .nb_sectors = 0,
+                    },
+                    .cow_end = {
+                        .offset     = nb_sectors << BDRV_SECTOR_BITS,
+                        .nb_sectors = s->cluster_sectors - nb_sectors,
+                    },
+                };
+                qemu_co_queue_init(&(*m)->dependent_requests);
+                qemu_co_rwlock_init(&(*m)->l2_writeback_lock);
+                QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+                /* Ensure that the L2 isn't written before COW has completed */
+                assert(!old_alloc->l2_writeback_lock.writer);
+                qemu_co_rwlock_rdlock(&old_alloc->l2_writeback_lock);
+
+                return 0;
             }
 
-            if (bytes == 0) {
-                /* Wait for the dependency to complete. We need to recheck
-                 * the free/allocated clusters when we continue. */
-                qemu_co_mutex_unlock(&s->lock);
+            /* Wait for the dependency to complete. We need to recheck
+             * the free/allocated clusters when we continue. */
+            qemu_co_mutex_unlock(&s->lock);
+            if (old_alloc->sleeping) {
                 kick_l2meta(old_alloc);
-                qemu_co_queue_wait(&old_alloc->dependent_requests);
-                qemu_co_mutex_lock(&s->lock);
-                return -EAGAIN;
             }
+            qemu_co_queue_wait(&old_alloc->dependent_requests);
+            qemu_co_mutex_lock(&s->lock);
+            return -EAGAIN;
         }
     }
 
@@ -942,7 +1033,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     int l2_index, ret, sectors;
     uint64_t *l2_table;
     unsigned int nb_clusters, keep_clusters;
-    uint64_t cluster_offset;
+    uint64_t cluster_offset = 0;
 
     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
                                       n_start, n_end);
@@ -969,7 +1060,7 @@ again:
      *         contiguous clusters (the situation could have changed while we
      *         were sleeping)
      *
-     *      c) TODO: Request starts in the same cluster as the in-flight
+     *      c) Request starts in the same cluster as the in-flight
      *         allocation ends. Shorten the COW of the in-flight allocation, set
      *         cluster_offset to write to the same cluster and set up the right
      *         synchronisation between the in-flight request and the new one.
@@ -980,13 +1071,17 @@ again:
      * 3. If the request still hasn't completed, allocate new clusters,
      *    considering any cluster_offset of steps 1c or 2.
      */
-    ret = handle_dependencies(bs, offset,
+    ret = handle_dependencies(bs, offset, &cluster_offset,
                               (n_end - n_start) * BDRV_SECTOR_SIZE,
-                              &nb_clusters);
+                              &nb_clusters, m);
     if (ret == -EAGAIN) {
         goto again;
     } else if (ret < 0) {
         return ret;
+    } else if (*m) {
+        keep_clusters = 1;
+        nb_clusters = 0;
+        goto done;
     }
 
     /* Find L2 entry for the first involved cluster */
@@ -1105,11 +1200,13 @@ again:
                 },
             };
             qemu_co_queue_init(&(*m)->dependent_requests);
+            qemu_co_rwlock_init(&(*m)->l2_writeback_lock);
             QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
         }
     }
 
     /* Some cleanup work */
+done:
     sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9);
     if (sectors > n_end) {
         sectors = n_end;
diff --git a/block/qcow2.c b/block/qcow2.c
index 88a2020..abc3de3 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -779,6 +779,11 @@ again:
          * to be recoverable e.g. after the block device has been grown or the
          * network connection restored. Sleep until the next flush comes and
          * then retry.
+         *
+         * FIXME:
+         * - Need to cancel any other requests to the same cluster (or at least
+         *   prevent the L2 entry from being updated) (Really? Don't they wait
+         *   automatically?)
          */
         s->flush_error = ret;
 
@@ -795,6 +800,22 @@ again:
 
     qemu_co_mutex_unlock(&s->lock);
 
+    if (m->parent) {
+        /*
+         * Allow the parent to write the L2 table entry now that both guest
+         * data write and COW have completed. Don't remove the request from the
+         * queue until the L2 is updated; we still need it for overriding
+         * the cluster_offset of requests to the same area.
+         *
+         * The release of the l2_writeback_lock and the queuing must be
+         * performed atomically to avoid a race. This code implements this
+         * correctly because unlocking only schedules a BH to wake the parent,
+         * but doesn't yield yet.
+         */
+        qemu_co_rwlock_unlock(&m->parent->l2_writeback_lock);
+        qemu_co_queue_wait(&m->parent->dependent_requests);
+    }
+
     /* Take the request off the list of running requests */
     if (m->nb_clusters != 0) {
         QLIST_REMOVE(m, next_in_flight);
diff --git a/block/qcow2.h b/block/qcow2.h
index 06ca195..7ed082b 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -216,6 +216,12 @@ typedef struct Qcow2COWRegion {
 
     /** Number of sectors to copy */
     int         nb_sectors;
+
+    /**
+     * Becomes true when offset and nb_sectors cannot be modified any more
+     * because the COW is already being processed.
+     */
+    bool        final;
 } Qcow2COWRegion;
 
 /**
@@ -254,6 +260,12 @@ typedef struct QCowL2Meta
     bool sleeping;
     bool error;
 
+    /**
+     * true indicates that this is not an actual cluster allocation, but reuses
+     * the allocation of an earlier request, so don't update the L2 table.
+     */
+    bool no_l2_update;
+
     /** Coroutine that handles delayed COW and updates L2 entry */
     Coroutine *co;
 
@@ -275,6 +287,31 @@ typedef struct QCowL2Meta
      */
     Qcow2COWRegion cow_end;
 
+    /**
+     * The L2 table must only be updated when all COWs to the same sector have
+     * completed. A request that shortens part of the COW takes a reader lock
+     * until its data and COW are written.
+     */
+    CoRwlock l2_writeback_lock;
+
+    /**
+     * QCowL2Metas can form a tree. The root of this tree is a request that
+     * actually allocated the host cluster in the image file. Children are
+     * requests that overwrite (part of) a COW region of their parent, so that
+     * part of the parent's COW has been cancelled.
+     *
+     * Important rules to bear in mind:
+     *
+     * - The parent must not write its L2 entry before all children have
+     *   written both their guest data and COW regions. If it did, the image on
+     *   disk would still have incomplete COW and data would be corrupted.
+     *
+     * - The children must stay in s->cluster_allocs until the parent has
+     *   written the L2 entry, so that read requests correctly return the new
+     *   data even though the L2 table is not updated yet.
+     */
+    struct QCowL2Meta *parent;
+
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
 
@@ -291,6 +328,16 @@ enum {
 
 #define REFT_OFFSET_MASK 0xffffffffffffff00ULL
 
+static inline int start_of_cluster(BDRVQcowState *s, int64_t offset)
+{
+    return offset & ~(s->cluster_size - 1);
+}
+
+static inline int offset_into_cluster(BDRVQcowState *s, int64_t offset)
+{
+    return offset & (s->cluster_size - 1);
+}
+
 static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
 {
     return (size + (s->cluster_size - 1)) >> s->cluster_bits;
-- 
1.7.6.5

Thread overview: 31+ messages
2012-09-18 11:40 [Qemu-devel] [RFC PATCH 00/16] qcow2: Delayed COW Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 01/16] qcow2: Round QCowL2Meta.offset down to cluster boundary Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 02/16] qcow2: Introduce Qcow2COWRegion Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 03/16] qcow2: Allocate l2meta dynamically Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 04/16] qcow2: Drop l2meta.cluster_offset Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 05/16] qcow2: Allocate l2meta only for cluster allocations Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 06/16] qcow2: Enable dirty flag in qcow2_alloc_cluster_link_l2 Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 07/16] qcow2: Factor out handle_dependencies() Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 08/16] qcow2: Reading from areas not in L2 tables yet Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 09/16] qcow2: Move COW and L2 update into own coroutine Kevin Wolf
2012-09-18 14:24   ` Paolo Bonzini
2012-09-18 14:44     ` Kevin Wolf
2012-09-18 14:59       ` Paolo Bonzini
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 10/16] qcow2: Delay the COW Kevin Wolf
2012-09-18 14:27   ` Paolo Bonzini
2012-09-18 14:49     ` Kevin Wolf
2012-09-19 18:47   ` Blue Swirl
2012-09-20  6:58     ` Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 11/16] qcow2: Add error handling to the l2meta coroutine Kevin Wolf
2012-09-18 14:29   ` Paolo Bonzini
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 12/16] qcow2: Handle dependencies earlier Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 13/16] qcow2: Change handle_dependency to byte granularity Kevin Wolf
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 14/16] qcow2: Execute run_dependent_requests() without lock Kevin Wolf
2012-09-18 14:33   ` Paolo Bonzini
2012-09-18 14:54     ` Kevin Wolf
2012-09-18 11:40 ` Kevin Wolf [this message]
2012-09-18 14:44   ` [Qemu-devel] [RFC PATCH 15/16] qcow2: Cancel COW when overwritten Paolo Bonzini
2012-09-18 15:02     ` Kevin Wolf
2012-09-18 15:05       ` Paolo Bonzini
2012-09-18 15:08         ` Paolo Bonzini
2012-09-18 11:40 ` [Qemu-devel] [RFC PATCH 16/16] [BROKEN] qcow2: Overwrite COW and allocate new cluster at the same time Kevin Wolf
