From: Anthony Liguori
To: Marcelo Tosatti
Cc: Jes.Sorensen@redhat.com, qemu-devel@nongnu.org, avi@redhat.com
Subject: [Qemu-devel] Re: [patch 2/3] Add support for live block copy
Date: Tue, 22 Feb 2011 14:50:09 -0600
Message-ID: <4D642181.4080509@codemonkey.ws>
In-Reply-To: <20110222170115.710717278@redhat.com>
References: <20110222170004.808373778@redhat.com> <20110222170115.710717278@redhat.com>

On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
> Support live image copy + switch. That is, copy an image backing
> a guest hard disk to a destination image (destination image must
> be created separately), and switch to this copy.
>
> Command syntax:
>
> block_copy device filename [commit_filename] [-i] -- live block copy device to image
>                            optional commit filename
>                            -i for incremental copy (base image shared between src and destination)
>
> Signed-off-by: Marcelo Tosatti
>
> Index: qemu/block-copy.c
> ===================================================================
> --- /dev/null
> +++ qemu/block-copy.c
> @@ -0,0 +1,741 @@
> +/*
> + * QEMU live block copy
> + *
> + * Copyright (C) 2010 Red Hat Inc.
> + *
> + * Authors: Marcelo Tosatti
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2. See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "blockdev.h"
> +#include "qemu-queue.h"
> +#include "qemu-timer.h"
> +#include "monitor.h"
> +#include "block-copy.h"
> +#include "migration.h"
> +#include "sysemu.h"
> +#include "qjson.h"
> +#include
> +
> +#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> +#define MAX_IS_ALLOCATED_SEARCH 65536
> +
> +/*
> + * Stages:
> + *
> + * STAGE_BULK: bulk reads/writes in progress
> + * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
> + * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
> + * STAGE_SWITCH_FINISHED: switched to new image.
> + */
> +
> +enum BdrvCopyStage {
> +    STAGE_BULK,
> +    STAGE_BULK_FINISHED,
> +    STAGE_DIRTY,
> +    STAGE_SWITCH_FINISHED,
> +};
> +
> +typedef struct BdrvCopyState {
> +    BlockDriverState *src;
> +    BlockDriverState *dst;
> +    bool shared_base;
> +
> +    int64_t curr_sector;
> +    int64_t completed_sectors;
> +    int64_t nr_sectors;
> +
> +    enum BdrvCopyStage stage;
> +    int inflight_reads;
> +    int error;
> +    int failed;
> +    int cancelled;
> +    QLIST_HEAD(, BdrvCopyBlock) io_list;
> +    unsigned long *aio_bitmap;
> +    QEMUTimer *aio_timer;
> +    QLIST_ENTRY(BdrvCopyState) list;
> +
> +    int64_t blocks;
> +    int64_t total_time;
> +
> +    char src_device_name[32];
> +    char dst_filename[1024];
> +    int commit_fd;
> +} BdrvCopyState;
> +
> +typedef struct BdrvCopyBlock {
> +    BdrvCopyState *state;
> +    uint8_t *buf;
> +    int64_t sector;
> +    int64_t nr_sectors;
> +    struct iovec iov;
> +    QEMUIOVector qiov;
> +    BlockDriverAIOCB *aiocb;
> +    int64_t time;
> +    QLIST_ENTRY(BdrvCopyBlock) list;
> +} BdrvCopyBlock;
> +
> +static QLIST_HEAD(, BdrvCopyState) block_copy_list =
> +    QLIST_HEAD_INITIALIZER(block_copy_list);
> +
> +static void alloc_aio_bitmap(BdrvCopyState *s)
> +{
> +    BlockDriverState *bs = s->src;
> +    int64_t bitmap_size;
> +
> +    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
> +        BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> +    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
> +
> +    s->aio_bitmap = qemu_mallocz(bitmap_size);
> +}
> +
> +static bool aio_inflight(BdrvCopyState *s, int64_t sector)
> +{
> +    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    if (s->aio_bitmap &&
> +        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(s->src)) {
> +        return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
> +            (1UL << (chunk % (sizeof(unsigned long) * 8))));
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
> +                             int nb_sectors, int set)
> +{
> +    int64_t start, end;
> +    unsigned long val, idx, bit;
> +
> +    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    for (; start <= end; start++) {
> +        idx = start / (sizeof(unsigned long) * 8);
> +        bit = start % (sizeof(unsigned long) * 8);
> +        val = s->aio_bitmap[idx];
> +        if (set) {
> +            if (!(val & (1UL << bit))) {
> +                val |= 1UL << bit;
> +            }
> +        } else {
> +            if (val & (1UL << bit)) {
> +                val &= ~(1UL << bit);
> +            }
> +        }
> +        s->aio_bitmap[idx] = val;
> +    }
> +}
> +
> +static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
> +{
> +    s->stage = stage;
> +
> +    switch (stage) {
> +    case STAGE_BULK:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
> +        break;
> +    case STAGE_BULK_FINISHED:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
> +        break;
> +    case STAGE_DIRTY:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
> +        break;
> +    case STAGE_SWITCH_FINISHED:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
> +        break;
> +    default:
> +        break;
> +    }
> +}
> +
> +static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
> +{
> +    s->error = ret;
> +    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
> +{
> +    s->blocks++;
> +    s->total_time += time;
> +}
> +
> +static void blk_copy_write_cb(void *opaque, int ret)
> +{
> +    BdrvCopyBlock *blk = opaque;
> +    BdrvCopyState *s = blk->state;
> +
> +    if (ret < 0) {
> +        QLIST_REMOVE(blk, list);
> +        qemu_free(blk->buf);
> +        qemu_free(blk);
> +        blk_copy_handle_cb_error(s, ret);
> +        return;
> +    }
> +
> +    QLIST_REMOVE(blk, list);
> +    add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
> +
> +    /* schedule switch to STAGE_DIRTY on last bulk write completion */
> +    if (blk->state->stage == STAGE_BULK_FINISHED) {
> +        qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +    }
> +
> +    if (blk->state->stage > STAGE_BULK_FINISHED) {
> +        set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0);
> +    }
> +
> +    qemu_free(blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->state = s;
> +    blk->sector = read_blk->sector;
> +    blk->nr_sectors = read_blk->nr_sectors;
> +    blk->time = read_blk->time;
> +    blk->buf = read_blk->buf;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = read_blk->buf;
> +    blk->iov.iov_len = read_blk->iov.iov_len;
> +    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
> +
> +    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
> +    blk->aiocb = bdrv_aio_writev(s->dst, blk->sector, &blk->qiov,
> +                                 blk->iov.iov_len / BDRV_SECTOR_SIZE,
> +                                 blk_copy_write_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(read_blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static void blk_copy_read_cb(void *opaque, int ret)
> +{
> +    BdrvCopyBlock *blk = opaque;
> +    BdrvCopyState *s = blk->state;
> +
> +    s->inflight_reads--;
> +    if (ret < 0) {
> +        QLIST_REMOVE(blk, list);
> +        qemu_free(blk->buf);
> +        qemu_free(blk);
> +        blk_copy_handle_cb_error(s, ret);
> +        return;
> +    }
> +    blk_copy_issue_write(s, blk);
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk);
> +    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
> +                                int nr_sectors)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->buf = qemu_mallocz(BLOCK_SIZE);
> +    blk->state = s;
> +    blk->sector = sector;
> +    blk->nr_sectors = nr_sectors;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = blk->buf;
> +    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
> +
> +    s->inflight_reads++;
> +    blk->time = qemu_get_clock_ns(rt_clock);
> +    blk->aiocb = bdrv_aio_readv(s->src, sector, &blk->qiov, nr_sectors,
> +                                blk_copy_read_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    s->inflight_reads--;
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static bool blkcopy_can_switch(BdrvCopyState *s)
> +{
> +    int64_t remaining_dirty;
> +    int64_t avg_transfer_time;
> +
> +    remaining_dirty = bdrv_get_dirty_count(s->src);
> +    if (remaining_dirty == 0 || s->blocks == 0) {
> +        return true;
> +    }
> +
> +    avg_transfer_time = s->total_time / s->blocks;
> +    if ((remaining_dirty * avg_transfer_time) <= migrate_max_downtime()) {
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static int blk_issue_reads_dirty(BdrvCopyState *s)
> +{
> +    int64_t sector;
> +
> +    for (sector = s->curr_sector; sector < s->nr_sectors;) {
> +        if (bdrv_get_dirty(s->src, sector) && !aio_inflight(s, sector)) {
> +            int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
> +                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> +            blk_copy_issue_read(s, sector, nr_sectors);
> +            bdrv_reset_dirty(s->src, sector,
> +                             nr_sectors);
> +            set_aio_inflight(s, sector, nr_sectors, 1);
> +            break;
> +        }
> +
> +        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
> +        s->curr_sector = sector;
> +    }
> +
> +    if (sector >= s->nr_sectors) {
> +        s->curr_sector = 0;
> +    }
> +    return 0;
> +}
> +
> +static int blk_issue_reads_bulk(BdrvCopyState *s)
> +{
> +    int nr_sectors;
> +    int64_t curr_sector = s->curr_sector;
> +
> +    if (s->shared_base) {
> +        while (curr_sector < s->nr_sectors &&
> +               !bdrv_is_allocated(s->src, curr_sector,
> +                                  MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
> +            curr_sector += nr_sectors;
> +        }
> +    }
> +
> +    if (curr_sector >= s->nr_sectors) {
> +        s->curr_sector = 0;
> +        return 1;
> +    }
> +
> +    curr_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
> +    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    blk_copy_issue_read(s, s->curr_sector, nr_sectors);
> +    s->curr_sector += nr_sectors;
> +    s->completed_sectors = curr_sector;
> +    return 0;
> +}
> +
> +static void blkcopy_finish(BdrvCopyState *s)
> +{
> +    int64_t sector;
> +    uint8_t *buf;
> +
> +    buf = qemu_malloc(BLOCK_SIZE);
> +
> +    /* FIXME: speed up loop, get_next_dirty_block? */
> +    for (sector = 0; sector < s->nr_sectors;
> +         sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
> +        if (bdrv_get_dirty(s->src, sector)) {
> +            int nr_sectors = MIN(s->nr_sectors - sector,
> +                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> +            memset(buf, 0, BLOCK_SIZE);
> +            if (bdrv_read(s->src, sector, buf, nr_sectors) < 0) {
> +                goto error;
> +            }
> +            if (bdrv_write(s->dst, sector, buf, nr_sectors) < 0) {
> +                goto error;
> +            }
> +            bdrv_reset_dirty(s->src, sector, nr_sectors);
> +        }
> +
> +        if (bdrv_get_dirty_count(s->src) == 0)
> +            break;
> +    }
> +    qemu_free(buf);
> +    return;
> +
> +error:
> +    qemu_free(buf);
> +    s->error = 1;
> +}
> +
> +static int write_commit_file(BdrvCopyState *s)
> +{
> +    char commit_msg[1400];
> +    const char *buf = commit_msg;
> +    int len, ret;
> +
> +    sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name,
> +            s->dst_filename);
> +
> +    len = strlen(commit_msg);
> +    while (len > 0) {
> +        ret = write(s->commit_fd, buf, len);
> +        if (ret == -1 && errno == EINTR) {
> +            continue;
> +        }
> +        if (ret <= 0) {
> +            return -errno;
> +        }
> +        buf += ret;
> +        len -= ret;
> +    }
> +
> +    if (fsync(s->commit_fd) == -1) {
> +        return -errno;
> +    }

This is more or less black magic. What is this commit file used for and
why aren't we using something like a QMP event?

Regards,

Anthony Liguori
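
For concreteness, a minimal sketch of what the event-based alternative could
look like, assuming a new QEVENT_BLOCK_COPY_COMPLETED MonitorEvent value were
added (the event name and payload are purely illustrative, not an existing
API); only the monitor_protocol_event()/qobject_from_jsonf() plumbing, already
pulled in by this patch via monitor.h and qjson.h, is existing code:

    /* Hypothetical sketch: raise a QMP event instead of writing a commit
     * file.  QEVENT_BLOCK_COPY_COMPLETED is an assumed new MonitorEvent
     * value; the call pattern mirrors how BLOCK_IO_ERROR is emitted. */
    static void blkcopy_report_completed(BdrvCopyState *s)
    {
        QObject *data;

        data = qobject_from_jsonf("{ 'device': %s, 'target': %s }",
                                  s->src_device_name, s->dst_filename);
        monitor_protocol_event(QEVENT_BLOCK_COPY_COMPLETED, data);
        qobject_decref(data);
    }

Management tools would then subscribe to the event over QMP rather than
polling a commit file on disk.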