qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Anthony Liguori <anthony@codemonkey.ws>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Jes.Sorensen@redhat.com, qemu-devel@nongnu.org, avi@redhat.com
Subject: [Qemu-devel] Re: [patch 2/3] Add support for live block copy
Date: Tue, 22 Feb 2011 14:50:09 -0600	[thread overview]
Message-ID: <4D642181.4080509@codemonkey.ws> (raw)
In-Reply-To: <20110222170115.710717278@redhat.com>

On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
> Support live image copy + switch. That is, copy an image backing
> a guest hard disk to a destination image (destination image must
> be created separately), and switch to this copy.
>
> Command syntax:
>
> block_copy device filename [commit_filename] [-i] -- live block copy device to image
>               optional commit filename
>               -i for incremental copy (base image shared between src and destination)
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>
> Index: qemu/block-copy.c
> ===================================================================
> --- /dev/null
> +++ qemu/block-copy.c
> @@ -0,0 +1,741 @@
> +/*
> + * QEMU live block copy
> + *
> + * Copyright (C) 2010 Red Hat Inc.
> + *
> + * Authors: Marcelo Tosatti<mtosatti@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "blockdev.h"
> +#include "qemu-queue.h"
> +#include "qemu-timer.h"
> +#include "monitor.h"
> +#include "block-copy.h"
> +#include "migration.h"
> +#include "sysemu.h"
> +#include "qjson.h"
> +#include<assert.h>
> +
> +#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK<<  BDRV_SECTOR_BITS)
> +#define MAX_IS_ALLOCATED_SEARCH 65536
> +
> +/*
> + * Stages:
> + *
> + * STAGE_BULK: bulk reads/writes in progress
> + * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
> + * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
> + * STAGE_SWITCH_FINISHED: switched to new image.
> + */
> +
> +enum BdrvCopyStage {
> +    STAGE_BULK,
> +    STAGE_BULK_FINISHED,
> +    STAGE_DIRTY,
> +    STAGE_SWITCH_FINISHED,
> +};
> +
> +typedef struct BdrvCopyState {
> +    BlockDriverState *src;
> +    BlockDriverState *dst;
> +    bool shared_base;
> +
> +    int64_t curr_sector;
> +    int64_t completed_sectors;
> +    int64_t nr_sectors;
> +
> +    enum BdrvCopyStage stage;
> +    int inflight_reads;
> +    int error;
> +    int failed;
> +    int cancelled;
> +    QLIST_HEAD(, BdrvCopyBlock) io_list;
> +    unsigned long *aio_bitmap;
> +    QEMUTimer *aio_timer;
> +    QLIST_ENTRY(BdrvCopyState) list;
> +
> +    int64_t blocks;
> +    int64_t total_time;
> +
> +    char src_device_name[32];
> +    char dst_filename[1024];
> +    int commit_fd;
> +} BdrvCopyState;
> +
> +typedef struct BdrvCopyBlock {
> +    BdrvCopyState *state;
> +    uint8_t *buf;
> +    int64_t sector;
> +    int64_t nr_sectors;
> +    struct iovec iov;
> +    QEMUIOVector qiov;
> +    BlockDriverAIOCB *aiocb;
> +    int64_t time;
> +    QLIST_ENTRY(BdrvCopyBlock) list;
> +} BdrvCopyBlock;
> +
> +static QLIST_HEAD(, BdrvCopyState) block_copy_list =
> +    QLIST_HEAD_INITIALIZER(block_copy_list);
> +
> +static void alloc_aio_bitmap(BdrvCopyState *s)
> +{
> +    BlockDriverState *bs = s->src;
> +    int64_t bitmap_size;
> +
> +    bitmap_size = (bdrv_getlength(bs)>>  BDRV_SECTOR_BITS) +
> +            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> +    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
> +
> +    s->aio_bitmap = qemu_mallocz(bitmap_size);
> +}
> +
> +static bool aio_inflight(BdrvCopyState *s, int64_t sector)
> +{
> +    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    if (s->aio_bitmap&&
> +        (sector<<  BDRV_SECTOR_BITS)<  bdrv_getlength(s->src)) {
> +        return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)]&
> +            (1UL<<  (chunk % (sizeof(unsigned long) * 8))));
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
> +                             int nb_sectors, int set)
> +{
> +    int64_t start, end;
> +    unsigned long val, idx, bit;
> +
> +    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    for (; start<= end; start++) {
> +        idx = start / (sizeof(unsigned long) * 8);
> +        bit = start % (sizeof(unsigned long) * 8);
> +        val = s->aio_bitmap[idx];
> +        if (set) {
> +            if (!(val&  (1UL<<  bit))) {
> +                val |= 1UL<<  bit;
> +            }
> +        } else {
> +            if (val&  (1UL<<  bit)) {
> +                val&= ~(1UL<<  bit);
> +            }
> +        }
> +        s->aio_bitmap[idx] = val;
> +    }
> +}
> +
> +static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
> +{
> +    s->stage = stage;
> +
> +    switch (stage) {
> +    case STAGE_BULK:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
> +        break;
> +    case STAGE_BULK_FINISHED:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
> +        break;
> +    case STAGE_DIRTY:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
> +        break;
> +    case STAGE_SWITCH_FINISHED:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
> +        break;
> +    default:
> +        break;
> +    }
> +}
> +
> +static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
> +{
> +    s->error = ret;
> +    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
> +{
> +    s->blocks++;
> +    s->total_time += time;
> +}
> +
> +static void blk_copy_write_cb(void *opaque, int ret)
> +{
> +    BdrvCopyBlock *blk = opaque;
> +    BdrvCopyState *s = blk->state;
> +
> +    if (ret<  0) {
> +        QLIST_REMOVE(blk, list);
> +        qemu_free(blk->buf);
> +        qemu_free(blk);
> +        blk_copy_handle_cb_error(s, ret);
> +        return;
> +    }
> +
> +    QLIST_REMOVE(blk, list);
> +    add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
> +
> +    /* schedule switch to STAGE_DIRTY on last bulk write completion */
> +    if (blk->state->stage == STAGE_BULK_FINISHED) {
> +        qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +    }
> +
> +    if (blk->state->stage>  STAGE_BULK_FINISHED) {
> +        set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0);
> +    }
> +
> +    qemu_free(blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->state = s;
> +    blk->sector = read_blk->sector;
> +    blk->nr_sectors = read_blk->nr_sectors;
> +    blk->time = read_blk->time;
> +    blk->buf = read_blk->buf;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = read_blk->buf;
> +    blk->iov.iov_len = read_blk->iov.iov_len;
> +    qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
> +
> +    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
> +    blk->aiocb = bdrv_aio_writev(s->dst, blk->sector,&blk->qiov,
> +                                 blk->iov.iov_len / BDRV_SECTOR_SIZE,
> +                                 blk_copy_write_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(read_blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static void blk_copy_read_cb(void *opaque, int ret)
> +{
> +    BdrvCopyBlock *blk = opaque;
> +    BdrvCopyState *s = blk->state;
> +
> +    s->inflight_reads--;
> +    if (ret<  0) {
> +        QLIST_REMOVE(blk, list);
> +        qemu_free(blk->buf);
> +        qemu_free(blk);
> +        blk_copy_handle_cb_error(s, ret);
> +        return;
> +    }
> +    blk_copy_issue_write(s, blk);
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk);
> +    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
> +                                int nr_sectors)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->buf = qemu_mallocz(BLOCK_SIZE);
> +    blk->state = s;
> +    blk->sector = sector;
> +    blk->nr_sectors = nr_sectors;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = blk->buf;
> +    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
> +
> +    s->inflight_reads++;
> +    blk->time = qemu_get_clock_ns(rt_clock);
> +    blk->aiocb = bdrv_aio_readv(s->src, sector,&blk->qiov, nr_sectors,
> +                                blk_copy_read_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    s->inflight_reads--;
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static bool blkcopy_can_switch(BdrvCopyState *s)
> +{
> +    int64_t remaining_dirty;
> +    int64_t avg_transfer_time;
> +
> +    remaining_dirty = bdrv_get_dirty_count(s->src);
> +    if (remaining_dirty == 0 || s->blocks == 0) {
> +        return true;
> +    }
> +
> +    avg_transfer_time = s->total_time / s->blocks;
> +    if ((remaining_dirty * avg_transfer_time)<= migrate_max_downtime()) {
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static int blk_issue_reads_dirty(BdrvCopyState *s)
> +{
> +    int64_t sector;
> +
> +    for (sector = s->curr_sector; sector<  s->nr_sectors;) {
> +        if (bdrv_get_dirty(s->src, sector)&&  !aio_inflight(s, sector)) {
> +            int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
> +                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> +            blk_copy_issue_read(s, sector, nr_sectors);
> +            bdrv_reset_dirty(s->src, sector, nr_sectors);
> +            set_aio_inflight(s, sector, nr_sectors, 1);
> +            break;
> +        }
> +
> +        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
> +        s->curr_sector = sector;
> +    }
> +
> +    if (sector>= s->nr_sectors) {
> +        s->curr_sector = 0;
> +    }
> +    return 0;
> +}
> +
> +static int blk_issue_reads_bulk(BdrvCopyState *s)
> +{
> +    int nr_sectors;
> +    int64_t curr_sector = s->curr_sector;
> +
> +    if (s->shared_base) {
> +        while (curr_sector<  s->nr_sectors&&
> +                !bdrv_is_allocated(s->src, curr_sector,
> +                                   MAX_IS_ALLOCATED_SEARCH,&nr_sectors)) {
> +                curr_sector += nr_sectors;
> +        }
> +    }
> +
> +    if (curr_sector>= s->nr_sectors) {
> +        s->curr_sector = 0;
> +        return 1;
> +    }
> +
> +    curr_sector&= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
> +    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    blk_copy_issue_read(s, s->curr_sector, nr_sectors);
> +    s->curr_sector += nr_sectors;
> +    s->completed_sectors = curr_sector;
> +    return 0;
> +}
> +
> +static void blkcopy_finish(BdrvCopyState *s)
> +{
> +    int64_t sector;
> +    uint8_t *buf;
> +
> +    buf = qemu_malloc(BLOCK_SIZE);
> +
> +    /* FIXME: speed up loop, get_next_dirty_block? */
> +    for (sector = 0; sector<  s->nr_sectors;
> +         sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
> +        if (bdrv_get_dirty(s->src, sector)) {
> +            int nr_sectors = MIN(s->nr_sectors - sector,
> +                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> +            memset(buf, 0, BLOCK_SIZE);
> +            if (bdrv_read(s->src, sector, buf, nr_sectors)<  0) {
> +                goto error;
> +            }
> +            if (bdrv_write(s->dst, sector, buf, nr_sectors)<  0) {
> +                goto error;
> +            }
> +            bdrv_reset_dirty(s->src, sector, nr_sectors);
> +        }
> +
> +        if (bdrv_get_dirty_count(s->src) == 0)
> +            break;
> +    }
> +    qemu_free(buf);
> +    return;
> +
> +error:
> +    qemu_free(buf);
> +    s->error = 1;
> +}
> +
> +static int write_commit_file(BdrvCopyState *s)
> +{
> +    char commit_msg[1400];
> +    const char *buf = commit_msg;
> +    int len, ret;
> +
> +    sprintf(commit_msg, "commit QEMU block_copy %s ->  %s\n", s->src_device_name,
> +                        s->dst_filename);
> +
> +    len = strlen(commit_msg);
> +    while (len>  0) {
> +        ret = write(s->commit_fd, buf, len);
> +        if (ret == -1&&  errno == EINTR) {
> +            continue;
> +        }
> +        if (ret<= 0) {
> +            return -errno;
> +        }
> +        buf += ret;
> +        len -= ret;
> +    }
> +
> +    if (fsync(s->commit_fd) == -1) {
> +        return -errno;
> +    }
>
>    

This is more or less black magic.  What is this commit file used for and 
why aren't we using something like a QMP event?

Regards,

Anthony Liguori

  reply	other threads:[~2011-02-22 20:50 UTC|newest]

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-02-22 17:00 [Qemu-devel] [patch 0/3] live block copy (v2) Marcelo Tosatti
2011-02-22 17:00 ` [Qemu-devel] [patch 1/3] add migration_active function Marcelo Tosatti
2011-02-22 17:00 ` [Qemu-devel] [patch 2/3] Add support for live block copy Marcelo Tosatti
2011-02-22 20:50   ` Anthony Liguori [this message]
2011-02-22 21:07     ` [Qemu-devel] " Marcelo Tosatti
2011-02-22 21:11       ` Anthony Liguori
2011-02-22 23:09         ` Marcelo Tosatti
2011-02-22 23:14           ` Anthony Liguori
2011-02-23 13:01             ` Avi Kivity
2011-02-23 14:35               ` Anthony Liguori
2011-02-23 15:31                 ` Avi Kivity
2011-02-23 16:01                   ` Anthony Liguori
2011-02-23 16:14                     ` Avi Kivity
2011-02-23 16:28                       ` Anthony Liguori
2011-02-23 17:18                         ` Avi Kivity
2011-02-23 20:18                           ` Anthony Liguori
2011-02-23 20:44                             ` Marcelo Tosatti
2011-02-23 21:41                               ` Anthony Liguori
2011-02-24 14:39                                 ` Marcelo Tosatti
2011-02-24  7:37                             ` Markus Armbruster
2011-02-24  8:54                             ` Avi Kivity
2011-02-24 15:00                               ` Anthony Liguori
2011-02-24 15:22                                 ` Avi Kivity
2011-02-24 17:58                                   ` Anthony Liguori
2011-02-27  9:10                                     ` Avi Kivity
2011-02-27  9:55                                       ` Dor Laor
2011-02-27 13:49                                         ` Anthony Liguori
2011-02-27 16:02                                           ` Dor Laor
2011-02-27 17:25                                             ` Anthony Liguori
2011-02-28  8:58                                               ` Dor Laor
2011-02-27 14:00                                       ` Anthony Liguori
2011-02-27 15:31                                         ` Avi Kivity
2011-02-27 17:41                                           ` Anthony Liguori
2011-02-28  8:38                                             ` Avi Kivity
2011-02-28 12:45                                               ` Anthony Liguori
2011-02-28 13:21                                                 ` Avi Kivity
2011-02-28 17:33                                                   ` Anthony Liguori
2011-02-28 17:47                                                     ` Avi Kivity
2011-02-28 18:12                                                       ` Anthony Liguori
     [not found]                                                         ` <4D6CB556.5060401@redhat.com>
     [not found]                                                         ` <4D6CBECF.8090805@redhat.com>
2011-03-01  8:59                                                         ` Dor Laor
2011-03-02 12:39                                                           ` Anthony Liguori
2011-03-02 13:00                                                             ` Avi Kivity
2011-03-02 15:07                                                               ` Anthony Liguori
2011-03-01  9:39                                                         ` Avi Kivity
2011-03-01 15:51                                                           ` Anthony Liguori
2011-03-01 22:27                                                             ` Dor Laor
2011-03-02 16:30                                                             ` Avi Kivity
2011-03-02 21:55                                                               ` Anthony Liguori
2011-02-28 18:56                                                       ` Marcelo Tosatti
2011-03-01  9:45                                                         ` Avi Kivity
2011-02-23 16:17                     ` Peter Maydell
2011-02-23 16:30                       ` Anthony Liguori
2011-02-24  5:41                         ` [Qemu-devel] Unsubsribing James Brown
2011-02-24 10:00                           ` Stefan Hajnoczi
2011-02-23 17:26                   ` [Qemu-devel] Re: [patch 2/3] Add support for live block copy Markus Armbruster
2011-02-23 20:06                     ` Anthony Liguori
2011-02-24 12:15                       ` Markus Armbruster
2011-02-25  7:16                   ` Stefan Hajnoczi
2011-02-23 17:49               ` Marcelo Tosatti
2011-02-24  8:58                 ` Avi Kivity
2011-02-24 15:14                   ` Marcelo Tosatti
2011-02-24 15:28                     ` Avi Kivity
2011-02-24 16:39                       ` Marcelo Tosatti
2011-02-24 17:32                         ` Avi Kivity
2011-02-24 17:45                         ` Anthony Liguori
2011-02-27  9:22                           ` Avi Kivity
2011-02-23 12:46         ` Avi Kivity
2011-02-22 20:50   ` Anthony Liguori
2011-02-22 21:16   ` [Qemu-devel] " Anthony Liguori
2011-02-23 19:06   ` Anthony Liguori
2011-02-26  0:02     ` Marcelo Tosatti
2011-02-26 13:45       ` Anthony Liguori
2011-02-28 19:09         ` Marcelo Tosatti
2011-03-01  2:35         ` Marcelo Tosatti
2011-02-26 15:32       ` Anthony Liguori
2011-02-22 17:00 ` [Qemu-devel] [patch 3/3] do not allow migration if block copy in progress Marcelo Tosatti

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4D642181.4080509@codemonkey.ws \
    --to=anthony@codemonkey.ws \
    --cc=Jes.Sorensen@redhat.com \
    --cc=avi@redhat.com \
    --cc=mtosatti@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).