From: Anthony Liguori <anthony@codemonkey.ws>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Jes.Sorensen@redhat.com, qemu-devel@nongnu.org, avi@redhat.com
Subject: [Qemu-devel] Re: [patch 2/3] Add support for live block copy
Date: Tue, 22 Feb 2011 14:50:09 -0600 [thread overview]
Message-ID: <4D642181.4080509@codemonkey.ws> (raw)
In-Reply-To: <20110222170115.710717278@redhat.com>
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
> Support live image copy + switch. That is, copy an image backing
> a guest hard disk to a destination image (destination image must
> be created separately), and switch to this copy.
>
> Command syntax:
>
> block_copy device filename [commit_filename] [-i] -- live block copy device to image
> optional commit filename
> -i for incremental copy (base image shared between src and destination)
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>
> Index: qemu/block-copy.c
> ===================================================================
> --- /dev/null
> +++ qemu/block-copy.c
> @@ -0,0 +1,741 @@
> +/*
> + * QEMU live block copy
> + *
> + * Copyright (C) 2010 Red Hat Inc.
> + *
> + * Authors: Marcelo Tosatti<mtosatti@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "blockdev.h"
> +#include "qemu-queue.h"
> +#include "qemu-timer.h"
> +#include "monitor.h"
> +#include "block-copy.h"
> +#include "migration.h"
> +#include "sysemu.h"
> +#include "qjson.h"
> +#include<assert.h>
> +
> +#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK<< BDRV_SECTOR_BITS)
> +#define MAX_IS_ALLOCATED_SEARCH 65536
> +
> +/*
> + * Stages:
> + *
> + * STAGE_BULK: bulk reads/writes in progress
> + * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
> + * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
> + * STAGE_SWITCH_FINISHED: switched to new image.
> + */
> +
> +enum BdrvCopyStage {
> + STAGE_BULK,
> + STAGE_BULK_FINISHED,
> + STAGE_DIRTY,
> + STAGE_SWITCH_FINISHED,
> +};
> +
> +typedef struct BdrvCopyState {
> + BlockDriverState *src;
> + BlockDriverState *dst;
> + bool shared_base;
> +
> + int64_t curr_sector;
> + int64_t completed_sectors;
> + int64_t nr_sectors;
> +
> + enum BdrvCopyStage stage;
> + int inflight_reads;
> + int error;
> + int failed;
> + int cancelled;
> + QLIST_HEAD(, BdrvCopyBlock) io_list;
> + unsigned long *aio_bitmap;
> + QEMUTimer *aio_timer;
> + QLIST_ENTRY(BdrvCopyState) list;
> +
> + int64_t blocks;
> + int64_t total_time;
> +
> + char src_device_name[32];
> + char dst_filename[1024];
> + int commit_fd;
> +} BdrvCopyState;
> +
> +typedef struct BdrvCopyBlock {
> + BdrvCopyState *state;
> + uint8_t *buf;
> + int64_t sector;
> + int64_t nr_sectors;
> + struct iovec iov;
> + QEMUIOVector qiov;
> + BlockDriverAIOCB *aiocb;
> + int64_t time;
> + QLIST_ENTRY(BdrvCopyBlock) list;
> +} BdrvCopyBlock;
> +
> +static QLIST_HEAD(, BdrvCopyState) block_copy_list =
> + QLIST_HEAD_INITIALIZER(block_copy_list);
> +
> +static void alloc_aio_bitmap(BdrvCopyState *s)
> +{
> + BlockDriverState *bs = s->src;
> + int64_t bitmap_size;
> +
> + bitmap_size = (bdrv_getlength(bs)>> BDRV_SECTOR_BITS) +
> + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> + bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
> +
> + s->aio_bitmap = qemu_mallocz(bitmap_size);
> +}
> +
> +static bool aio_inflight(BdrvCopyState *s, int64_t sector)
> +{
> + int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> + if (s->aio_bitmap&&
> + (sector<< BDRV_SECTOR_BITS)< bdrv_getlength(s->src)) {
> + return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)]&
> + (1UL<< (chunk % (sizeof(unsigned long) * 8))));
> + } else {
> + return 0;
> + }
> +}
> +
> +static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
> + int nb_sectors, int set)
> +{
> + int64_t start, end;
> + unsigned long val, idx, bit;
> +
> + start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
> + end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> + for (; start<= end; start++) {
> + idx = start / (sizeof(unsigned long) * 8);
> + bit = start % (sizeof(unsigned long) * 8);
> + val = s->aio_bitmap[idx];
> + if (set) {
> + if (!(val& (1UL<< bit))) {
> + val |= 1UL<< bit;
> + }
> + } else {
> + if (val& (1UL<< bit)) {
> + val&= ~(1UL<< bit);
> + }
> + }
> + s->aio_bitmap[idx] = val;
> + }
> +}
> +
> +static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
> +{
> + s->stage = stage;
> +
> + switch (stage) {
> + case STAGE_BULK:
> + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
> + break;
> + case STAGE_BULK_FINISHED:
> + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
> + break;
> + case STAGE_DIRTY:
> + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
> + break;
> + case STAGE_SWITCH_FINISHED:
> + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
> + break;
> + default:
> + break;
> + }
> +}
> +
> +static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
> +{
> + s->error = ret;
> + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
> +{
> + s->blocks++;
> + s->total_time += time;
> +}
> +
> +static void blk_copy_write_cb(void *opaque, int ret)
> +{
> + BdrvCopyBlock *blk = opaque;
> + BdrvCopyState *s = blk->state;
> +
> + if (ret< 0) {
> + QLIST_REMOVE(blk, list);
> + qemu_free(blk->buf);
> + qemu_free(blk);
> + blk_copy_handle_cb_error(s, ret);
> + return;
> + }
> +
> + QLIST_REMOVE(blk, list);
> + add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
> +
> + /* schedule switch to STAGE_DIRTY on last bulk write completion */
> + if (blk->state->stage == STAGE_BULK_FINISHED) {
> + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> + }
> +
> + if (blk->state->stage> STAGE_BULK_FINISHED) {
> + set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0);
> + }
> +
> + qemu_free(blk->buf);
> + qemu_free(blk);
> +}
> +
> +static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
> +{
> + BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> + blk->state = s;
> + blk->sector = read_blk->sector;
> + blk->nr_sectors = read_blk->nr_sectors;
> + blk->time = read_blk->time;
> + blk->buf = read_blk->buf;
> + QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> + blk->iov.iov_base = read_blk->buf;
> + blk->iov.iov_len = read_blk->iov.iov_len;
> + qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
> +
> + BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
> + blk->aiocb = bdrv_aio_writev(s->dst, blk->sector,&blk->qiov,
> + blk->iov.iov_len / BDRV_SECTOR_SIZE,
> + blk_copy_write_cb, blk);
> + if (!blk->aiocb) {
> + s->error = 1;
> + goto error;
> + }
> +
> + return;
> +
> +error:
> + QLIST_REMOVE(blk, list);
> + qemu_free(read_blk->buf);
> + qemu_free(blk);
> +}
> +
> +static void blk_copy_read_cb(void *opaque, int ret)
> +{
> + BdrvCopyBlock *blk = opaque;
> + BdrvCopyState *s = blk->state;
> +
> + s->inflight_reads--;
> + if (ret< 0) {
> + QLIST_REMOVE(blk, list);
> + qemu_free(blk->buf);
> + qemu_free(blk);
> + blk_copy_handle_cb_error(s, ret);
> + return;
> + }
> + blk_copy_issue_write(s, blk);
> + QLIST_REMOVE(blk, list);
> + qemu_free(blk);
> + qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
> + int nr_sectors)
> +{
> + BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> + blk->buf = qemu_mallocz(BLOCK_SIZE);
> + blk->state = s;
> + blk->sector = sector;
> + blk->nr_sectors = nr_sectors;
> + QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> + blk->iov.iov_base = blk->buf;
> + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> + qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
> +
> + s->inflight_reads++;
> + blk->time = qemu_get_clock_ns(rt_clock);
> + blk->aiocb = bdrv_aio_readv(s->src, sector,&blk->qiov, nr_sectors,
> + blk_copy_read_cb, blk);
> + if (!blk->aiocb) {
> + s->error = 1;
> + goto error;
> + }
> +
> + return;
> +
> +error:
> + s->inflight_reads--;
> + QLIST_REMOVE(blk, list);
> + qemu_free(blk->buf);
> + qemu_free(blk);
> +}
> +
> +static bool blkcopy_can_switch(BdrvCopyState *s)
> +{
> + int64_t remaining_dirty;
> + int64_t avg_transfer_time;
> +
> + remaining_dirty = bdrv_get_dirty_count(s->src);
> + if (remaining_dirty == 0 || s->blocks == 0) {
> + return true;
> + }
> +
> + avg_transfer_time = s->total_time / s->blocks;
> + if ((remaining_dirty * avg_transfer_time)<= migrate_max_downtime()) {
> + return true;
> + }
> + return false;
> +}
> +
> +static int blk_issue_reads_dirty(BdrvCopyState *s)
> +{
> + int64_t sector;
> +
> + for (sector = s->curr_sector; sector< s->nr_sectors;) {
> + if (bdrv_get_dirty(s->src, sector)&& !aio_inflight(s, sector)) {
> + int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
> + BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> + blk_copy_issue_read(s, sector, nr_sectors);
> + bdrv_reset_dirty(s->src, sector, nr_sectors);
> + set_aio_inflight(s, sector, nr_sectors, 1);
> + break;
> + }
> +
> + sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
> + s->curr_sector = sector;
> + }
> +
> + if (sector>= s->nr_sectors) {
> + s->curr_sector = 0;
> + }
> + return 0;
> +}
> +
> +static int blk_issue_reads_bulk(BdrvCopyState *s)
> +{
> + int nr_sectors;
> + int64_t curr_sector = s->curr_sector;
> +
> + if (s->shared_base) {
> + while (curr_sector< s->nr_sectors&&
> + !bdrv_is_allocated(s->src, curr_sector,
> + MAX_IS_ALLOCATED_SEARCH,&nr_sectors)) {
> + curr_sector += nr_sectors;
> + }
> + }
> +
> + if (curr_sector>= s->nr_sectors) {
> + s->curr_sector = 0;
> + return 1;
> + }
> +
> + curr_sector&= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
> + nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> + blk_copy_issue_read(s, s->curr_sector, nr_sectors);
> + s->curr_sector += nr_sectors;
> + s->completed_sectors = curr_sector;
> + return 0;
> +}
> +
> +static void blkcopy_finish(BdrvCopyState *s)
> +{
> + int64_t sector;
> + uint8_t *buf;
> +
> + buf = qemu_malloc(BLOCK_SIZE);
> +
> + /* FIXME: speed up loop, get_next_dirty_block? */
> + for (sector = 0; sector< s->nr_sectors;
> + sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
> + if (bdrv_get_dirty(s->src, sector)) {
> + int nr_sectors = MIN(s->nr_sectors - sector,
> + BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> + memset(buf, 0, BLOCK_SIZE);
> + if (bdrv_read(s->src, sector, buf, nr_sectors)< 0) {
> + goto error;
> + }
> + if (bdrv_write(s->dst, sector, buf, nr_sectors)< 0) {
> + goto error;
> + }
> + bdrv_reset_dirty(s->src, sector, nr_sectors);
> + }
> +
> + if (bdrv_get_dirty_count(s->src) == 0)
> + break;
> + }
> + qemu_free(buf);
> + return;
> +
> +error:
> + qemu_free(buf);
> + s->error = 1;
> +}
> +
> +static int write_commit_file(BdrvCopyState *s)
> +{
> + char commit_msg[1400];
> + const char *buf = commit_msg;
> + int len, ret;
> +
> + sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name,
> + s->dst_filename);
> +
> + len = strlen(commit_msg);
> + while (len> 0) {
> + ret = write(s->commit_fd, buf, len);
> + if (ret == -1&& errno == EINTR) {
> + continue;
> + }
> + if (ret<= 0) {
> + return -errno;
> + }
> + buf += ret;
> + len -= ret;
> + }
> +
> + if (fsync(s->commit_fd) == -1) {
> + return -errno;
> + }
>
>
This is more or less black magic. What is this commit file used for and
why aren't we using something like a QMP event?
Regards,
Anthony Liguori
next prev parent reply other threads:[~2011-02-22 20:50 UTC|newest]
Thread overview: 76+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-02-22 17:00 [Qemu-devel] [patch 0/3] live block copy (v2) Marcelo Tosatti
2011-02-22 17:00 ` [Qemu-devel] [patch 1/3] add migration_active function Marcelo Tosatti
2011-02-22 17:00 ` [Qemu-devel] [patch 2/3] Add support for live block copy Marcelo Tosatti
2011-02-22 20:50 ` Anthony Liguori [this message]
2011-02-22 21:07 ` [Qemu-devel] " Marcelo Tosatti
2011-02-22 21:11 ` Anthony Liguori
2011-02-22 23:09 ` Marcelo Tosatti
2011-02-22 23:14 ` Anthony Liguori
2011-02-23 13:01 ` Avi Kivity
2011-02-23 14:35 ` Anthony Liguori
2011-02-23 15:31 ` Avi Kivity
2011-02-23 16:01 ` Anthony Liguori
2011-02-23 16:14 ` Avi Kivity
2011-02-23 16:28 ` Anthony Liguori
2011-02-23 17:18 ` Avi Kivity
2011-02-23 20:18 ` Anthony Liguori
2011-02-23 20:44 ` Marcelo Tosatti
2011-02-23 21:41 ` Anthony Liguori
2011-02-24 14:39 ` Marcelo Tosatti
2011-02-24 7:37 ` Markus Armbruster
2011-02-24 8:54 ` Avi Kivity
2011-02-24 15:00 ` Anthony Liguori
2011-02-24 15:22 ` Avi Kivity
2011-02-24 17:58 ` Anthony Liguori
2011-02-27 9:10 ` Avi Kivity
2011-02-27 9:55 ` Dor Laor
2011-02-27 13:49 ` Anthony Liguori
2011-02-27 16:02 ` Dor Laor
2011-02-27 17:25 ` Anthony Liguori
2011-02-28 8:58 ` Dor Laor
2011-02-27 14:00 ` Anthony Liguori
2011-02-27 15:31 ` Avi Kivity
2011-02-27 17:41 ` Anthony Liguori
2011-02-28 8:38 ` Avi Kivity
2011-02-28 12:45 ` Anthony Liguori
2011-02-28 13:21 ` Avi Kivity
2011-02-28 17:33 ` Anthony Liguori
2011-02-28 17:47 ` Avi Kivity
2011-02-28 18:12 ` Anthony Liguori
[not found] ` <4D6CB556.5060401@redhat.com>
[not found] ` <4D6CBECF.8090805@redhat.com>
2011-03-01 8:59 ` Dor Laor
2011-03-02 12:39 ` Anthony Liguori
2011-03-02 13:00 ` Avi Kivity
2011-03-02 15:07 ` Anthony Liguori
2011-03-01 9:39 ` Avi Kivity
2011-03-01 15:51 ` Anthony Liguori
2011-03-01 22:27 ` Dor Laor
2011-03-02 16:30 ` Avi Kivity
2011-03-02 21:55 ` Anthony Liguori
2011-02-28 18:56 ` Marcelo Tosatti
2011-03-01 9:45 ` Avi Kivity
2011-02-23 16:17 ` Peter Maydell
2011-02-23 16:30 ` Anthony Liguori
2011-02-24 5:41 ` [Qemu-devel] Unsubsribing James Brown
2011-02-24 10:00 ` Stefan Hajnoczi
2011-02-23 17:26 ` [Qemu-devel] Re: [patch 2/3] Add support for live block copy Markus Armbruster
2011-02-23 20:06 ` Anthony Liguori
2011-02-24 12:15 ` Markus Armbruster
2011-02-25 7:16 ` Stefan Hajnoczi
2011-02-23 17:49 ` Marcelo Tosatti
2011-02-24 8:58 ` Avi Kivity
2011-02-24 15:14 ` Marcelo Tosatti
2011-02-24 15:28 ` Avi Kivity
2011-02-24 16:39 ` Marcelo Tosatti
2011-02-24 17:32 ` Avi Kivity
2011-02-24 17:45 ` Anthony Liguori
2011-02-27 9:22 ` Avi Kivity
2011-02-23 12:46 ` Avi Kivity
2011-02-22 20:50 ` Anthony Liguori
2011-02-22 21:16 ` [Qemu-devel] " Anthony Liguori
2011-02-23 19:06 ` Anthony Liguori
2011-02-26 0:02 ` Marcelo Tosatti
2011-02-26 13:45 ` Anthony Liguori
2011-02-28 19:09 ` Marcelo Tosatti
2011-03-01 2:35 ` Marcelo Tosatti
2011-02-26 15:32 ` Anthony Liguori
2011-02-22 17:00 ` [Qemu-devel] [patch 3/3] do not allow migration if block copy in progress Marcelo Tosatti
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4D642181.4080509@codemonkey.ws \
--to=anthony@codemonkey.ws \
--cc=Jes.Sorensen@redhat.com \
--cc=avi@redhat.com \
--cc=mtosatti@redhat.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).