From: Paolo Bonzini <pbonzini@redhat.com>
To: qemu-devel@nongnu.org
Cc: kwolf@redhat.com, jcody@redhat.com, stefanha@redhat.com
Subject: [Qemu-devel] [PATCH 11/20] mirror: support more than one in-flight AIO operation
Date: Wed, 12 Dec 2012 14:46:30 +0100 [thread overview]
Message-ID: <1355319999-30627-12-git-send-email-pbonzini@redhat.com> (raw)
In-Reply-To: <1355319999-30627-1-git-send-email-pbonzini@redhat.com>
With AIO support in place, we can start copying more than one chunk
in parallel. This patch introduces the required infrastructure for
this: the buffer is split into multiple granularity-sized chunks,
and there is a free list to access them.
Because of copy-on-write, a single operation may already require
multiple chunks to be available on the free list.
In addition, two different iterations on the HBitmap may want to
copy the same cluster. We avoid this by keeping a bitmap of in-flight
I/O operations, and blocking until the previous iteration completes.
This should be a pretty rare occurrence, though; as long as there is
no overlap the next iteration can start before the previous one finishes.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
block/mirror.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
trace-events | 4 ++-
2 files changed, 100 insertions(+), 13 deletions(-)
diff --git a/block/mirror.c b/block/mirror.c
index ed56b86..f9caaea 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -17,7 +17,15 @@
#include "qemu/ratelimit.h"
#include "bitmap.h"
-#define SLICE_TIME 100000000ULL /* ns */
+#define SLICE_TIME 100000000ULL /* ns */
+#define MAX_IN_FLIGHT 16
+
+/* The mirroring buffer is a list of granularity-sized chunks.
+ * Free chunks are organized in a list.
+ */
+typedef struct MirrorBuffer {
+ QSIMPLEQ_ENTRY(MirrorBuffer) next;
+} MirrorBuffer;
typedef struct MirrorBlockJob {
BlockJob common;
@@ -33,7 +41,10 @@ typedef struct MirrorBlockJob {
unsigned long *cow_bitmap;
HBitmapIter hbi;
uint8_t *buf;
+ QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
+ int buf_free_count;
+ unsigned long *in_flight_bitmap;
int in_flight;
int ret;
} MirrorBlockJob;
@@ -41,7 +52,6 @@ typedef struct MirrorBlockJob {
typedef struct MirrorOp {
MirrorBlockJob *s;
QEMUIOVector qiov;
- struct iovec iov;
int64_t sector_num;
int nb_sectors;
} MirrorOp;
@@ -62,8 +72,22 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
static void mirror_iteration_done(MirrorOp *op)
{
MirrorBlockJob *s = op->s;
+ struct iovec *iov;
+ int64_t cluster_num;
+ int i, nb_chunks;
s->in_flight--;
+ iov = op->qiov.iov;
+ for (i = 0; i < op->qiov.niov; i++) {
+ MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
+ QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
+ s->buf_free_count++;
+ }
+
+ cluster_num = op->sector_num / s->granularity;
+ nb_chunks = op->nb_sectors / s->granularity;
+ bitmap_clear(s->in_flight_bitmap, cluster_num, nb_chunks);
+
trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors);
g_slice_free(MirrorOp, op);
qemu_coroutine_enter(s->common.co, NULL);
@@ -110,8 +134,8 @@ static void mirror_read_complete(void *opaque, int ret)
static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
BlockDriverState *source = s->common.bs;
- int nb_sectors, nb_sectors_chunk;
- int64_t end, sector_num, cluster_num;
+ int nb_sectors, nb_sectors_chunk, nb_chunks;
+ int64_t end, sector_num, cluster_num, next_sector, hbitmap_next_sector;
MirrorOp *op;
s->sector_num = hbitmap_iter_next(&s->hbi);
@@ -122,6 +146,8 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
assert(s->sector_num >= 0);
}
+ hbitmap_next_sector = s->sector_num;
+
/* If we have no backing file yet in the destination, and the cluster size
* is very large, we need to do COW ourselves. The first time a cluster is
* copied, copy it entirely.
@@ -137,21 +163,58 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
bdrv_round_to_clusters(s->target,
sector_num, nb_sectors_chunk,
&sector_num, &nb_sectors);
- bitmap_set(s->cow_bitmap, sector_num / nb_sectors_chunk,
- nb_sectors / nb_sectors_chunk);
+
+ /* The rounding may make us copy sectors before the
+ * first dirty one.
+ */
+ cluster_num = sector_num / nb_sectors_chunk;
+ }
+
+ /* Wait for I/O to this cluster (from a previous iteration) to be done. */
+ while (test_bit(cluster_num, s->in_flight_bitmap)) {
+ trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+ qemu_coroutine_yield();
}
end = s->common.len >> BDRV_SECTOR_BITS;
nb_sectors = MIN(nb_sectors, end - sector_num);
+ nb_chunks = (nb_sectors + nb_sectors_chunk - 1) / nb_sectors_chunk;
+ while (s->buf_free_count < nb_chunks) {
+ trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
+ qemu_coroutine_yield();
+ }
+
+ /* We have enough free space to copy these sectors. */
+ if (s->cow_bitmap) {
+ bitmap_set(s->cow_bitmap, cluster_num, nb_chunks);
+ }
/* Allocate a MirrorOp that is used as an AIO callback. */
op = g_slice_new(MirrorOp);
op->s = s;
- op->iov.iov_base = s->buf;
- op->iov.iov_len = nb_sectors * 512;
op->sector_num = sector_num;
op->nb_sectors = nb_sectors;
- qemu_iovec_init_external(&op->qiov, &op->iov, 1);
+
+ /* Now make a QEMUIOVector taking enough granularity-sized chunks
+ * from s->buf_free.
+ */
+ qemu_iovec_init(&op->qiov, nb_chunks);
+ next_sector = sector_num;
+ while (nb_chunks-- > 0) {
+ MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
+ QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
+ s->buf_free_count--;
+ qemu_iovec_add(&op->qiov, buf, s->granularity);
+
+ /* Advance the HBitmapIter in parallel, so that we do not examine
+ * the same sector twice.
+ */
+ if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) {
+ hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
+ }
+
+ next_sector += nb_sectors_chunk;
+ }
bdrv_reset_dirty(source, sector_num, nb_sectors);
@@ -162,6 +225,23 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
mirror_read_complete, op);
}
+static void mirror_free_init(MirrorBlockJob *s)
+{
+ int granularity = s->granularity;
+ size_t buf_size = s->buf_size;
+ uint8_t *buf = s->buf;
+
+ assert(s->buf_free_count == 0);
+ QSIMPLEQ_INIT(&s->buf_free);
+ while (buf_size != 0) {
+ MirrorBuffer *cur = (MirrorBuffer *)buf;
+ QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
+ s->buf_free_count++;
+ buf_size -= granularity;
+ buf += granularity;
+ }
+}
+
static void mirror_drain(MirrorBlockJob *s)
{
while (s->in_flight > 0) {
@@ -190,6 +270,9 @@ static void coroutine_fn mirror_run(void *opaque)
return;
}
+ length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
+ s->in_flight_bitmap = bitmap_new(length);
+
/* If we have no backing file yet in the destination, we cannot let
* the destination do COW. Instead, we copy sectors around the
* dirty data if needed. We need a bitmap to do that.
@@ -200,7 +283,6 @@ static void coroutine_fn mirror_run(void *opaque)
bdrv_get_info(s->target, &bdi);
if (s->buf_size < bdi.cluster_size) {
s->buf_size = bdi.cluster_size;
- length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
s->cow_bitmap = bitmap_new(length);
}
}
@@ -208,6 +290,7 @@ static void coroutine_fn mirror_run(void *opaque)
end = s->common.len >> BDRV_SECTOR_BITS;
s->buf = qemu_blockalign(bs, s->buf_size);
nb_sectors_chunk = s->granularity >> BDRV_SECTOR_BITS;
+ mirror_free_init(s);
if (s->mode != MIRROR_SYNC_MODE_NONE) {
/* First part, loop on the sectors and initialize the dirty bitmap. */
@@ -253,8 +336,9 @@ static void coroutine_fn mirror_run(void *opaque)
*/
if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME &&
s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
- if (s->in_flight > 0) {
- trace_mirror_yield(s, s->in_flight, cnt);
+ if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
+ (cnt == 0 && s->in_flight > 0)) {
+ trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
qemu_coroutine_yield();
continue;
} else if (cnt != 0) {
@@ -346,6 +430,7 @@ immediate_exit:
assert(s->in_flight == 0);
g_free(s->buf);
g_free(s->cow_bitmap);
+ g_free(s->in_flight_bitmap);
bdrv_set_dirty_tracking(bs, 0);
bdrv_iostatus_disable(s->target);
if (s->should_complete && ret == 0) {
diff --git a/trace-events b/trace-events
index ca50812..42068dd 100644
--- a/trace-events
+++ b/trace-events
@@ -86,7 +86,9 @@ mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64
mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
mirror_cow(void *s, int64_t sector_num) "s %p sector_num %"PRId64
mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
-mirror_yield(void *s, int64_t cnt, int in_flight) "s %p dirty count %"PRId64" in_flight %d"
+mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
+mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
+mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
# blockdev.c
qmp_block_job_cancel(void *job) "job %p"
--
1.8.0.1
next prev parent reply other threads:[~2012-12-12 13:47 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-12-12 13:46 [Qemu-devel] [PATCH 00/20] Block device mirroring enhancements, 12-12-12 edition Paolo Bonzini
2012-12-12 13:46 ` [Qemu-devel] [PATCH 01/20] host-utils: add ffsl Paolo Bonzini
2012-12-12 23:41 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 02/20] add hierarchical bitmap data type and test cases Paolo Bonzini
2012-12-14 0:04 ` Eric Blake
2013-01-11 18:27 ` Stefan Hajnoczi
2012-12-12 13:46 ` [Qemu-devel] [PATCH 03/20] block: implement dirty bitmap using HBitmap Paolo Bonzini
2012-12-14 0:27 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 04/20] block: make round_to_clusters public Paolo Bonzini
2012-12-14 20:13 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 05/20] mirror: perform COW if the cluster size is bigger than the granularity Paolo Bonzini
2012-12-14 20:21 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 06/20] block: return count of dirty sectors, not chunks Paolo Bonzini
2012-12-14 20:49 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 07/20] block: allow customizing the granularity of the dirty bitmap Paolo Bonzini
2012-12-14 21:27 ` Eric Blake
2012-12-15 9:11 ` Paolo Bonzini
2012-12-12 13:46 ` [Qemu-devel] [PATCH 08/20] mirror: allow customizing the granularity Paolo Bonzini
2012-12-14 22:01 ` Eric Blake
2013-01-14 11:28 ` Stefan Hajnoczi
2012-12-12 13:46 ` [Qemu-devel] [PATCH 09/20] mirror: switch mirror_iteration to AIO Paolo Bonzini
2012-12-14 22:11 ` Eric Blake
2012-12-15 9:09 ` Paolo Bonzini
2012-12-15 13:05 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 10/20] mirror: add buf-size argument to drive-mirror Paolo Bonzini
2012-12-14 22:22 ` Eric Blake
2012-12-15 9:09 ` Paolo Bonzini
2013-01-14 11:41 ` Stefan Hajnoczi
2012-12-12 13:46 ` Paolo Bonzini [this message]
2012-12-14 22:32 ` [Qemu-devel] [PATCH 11/20] mirror: support more than one in-flight AIO operation Eric Blake
2013-01-14 12:56 ` Stefan Hajnoczi
2013-01-14 13:28 ` Paolo Bonzini
2012-12-12 13:46 ` [Qemu-devel] [PATCH 12/20] mirror: support arbitrarily-sized iterations Paolo Bonzini
2012-12-14 22:39 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 13/20] oslib: add a wrapper for mmap/munmap Paolo Bonzini
2012-12-14 22:54 ` Eric Blake
2012-12-15 9:06 ` Paolo Bonzini
2012-12-12 13:46 ` [Qemu-devel] [PATCH 14/20] hbitmap: add hbitmap_alloc_with_data and hbitmap_required_size Paolo Bonzini
2012-12-17 17:14 ` Eric Blake
2012-12-17 17:18 ` Paolo Bonzini
2012-12-12 13:46 ` [Qemu-devel] [PATCH 15/20] hbitmap: add hbitmap_copy Paolo Bonzini
2012-12-17 18:25 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 16/20] block: split bdrv_enable_dirty_tracking and bdrv_disable_dirty_tracking Paolo Bonzini
2012-12-20 18:26 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 17/20] block: support a persistent dirty bitmap Paolo Bonzini
2012-12-20 23:03 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 18/20] mirror: add support for " Paolo Bonzini
2012-12-20 23:49 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 19/20] block: choose the default dirty bitmap granularity in bdrv_enable_dirty_tracking Paolo Bonzini
2012-12-20 23:53 ` Eric Blake
2012-12-12 13:46 ` [Qemu-devel] [PATCH 20/20] monitor: add commands to start/stop dirty bitmap Paolo Bonzini
2012-12-21 18:30 ` Eric Blake
2013-01-14 13:02 ` [Qemu-devel] [PATCH 00/20] Block device mirroring enhancements, 12-12-12 edition Stefan Hajnoczi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1355319999-30627-12-git-send-email-pbonzini@redhat.com \
--to=pbonzini@redhat.com \
--cc=jcody@redhat.com \
--cc=kwolf@redhat.com \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).