All of lore.kernel.org
 help / color / mirror / Atom feed
From: Anthony Liguori <anthony@codemonkey.ws>
To: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Jes.Sorensen@redhat.com, qemu-devel@nongnu.org, avi@redhat.com
Subject: [Qemu-devel] Re: [patch 2/3] Add support for live block copy
Date: Tue, 22 Feb 2011 14:50:09 -0600	[thread overview]
Message-ID: <4D642181.4080509@codemonkey.ws> (raw)
In-Reply-To: <20110222170115.710717278@redhat.com>

On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
> Support live image copy + switch. That is, copy an image backing
> a guest hard disk to a destination image (destination image must
> be created separately), and switch to this copy.
>
> Command syntax:
>
> block_copy device filename [commit_filename] [-i] -- live block copy device to image
>               optional commit filename
>               -i for incremental copy (base image shared between src and destination)
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>
> Index: qemu/block-copy.c
> ===================================================================
> --- /dev/null
> +++ qemu/block-copy.c
> @@ -0,0 +1,741 @@
> +/*
> + * QEMU live block copy
> + *
> + * Copyright (C) 2010 Red Hat Inc.
> + *
> + * Authors: Marcelo Tosatti<mtosatti@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "blockdev.h"
> +#include "qemu-queue.h"
> +#include "qemu-timer.h"
> +#include "monitor.h"
> +#include "block-copy.h"
> +#include "migration.h"
> +#include "sysemu.h"
> +#include "qjson.h"
> +#include<assert.h>
> +
> +#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK<<  BDRV_SECTOR_BITS)
> +#define MAX_IS_ALLOCATED_SEARCH 65536
> +
> +/*
> + * Stages:
> + *
> + * STAGE_BULK: bulk reads/writes in progress
> + * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
> + * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
> + * STAGE_SWITCH_FINISHED: switched to new image.
> + */
> +
> +enum BdrvCopyStage {
> +    STAGE_BULK,
> +    STAGE_BULK_FINISHED,
> +    STAGE_DIRTY,
> +    STAGE_SWITCH_FINISHED,
> +};
> +
> +typedef struct BdrvCopyState {
> +    BlockDriverState *src;
> +    BlockDriverState *dst;
> +    bool shared_base;
> +
> +    int64_t curr_sector;
> +    int64_t completed_sectors;
> +    int64_t nr_sectors;
> +
> +    enum BdrvCopyStage stage;
> +    int inflight_reads;
> +    int error;
> +    int failed;
> +    int cancelled;
> +    QLIST_HEAD(, BdrvCopyBlock) io_list;
> +    unsigned long *aio_bitmap;
> +    QEMUTimer *aio_timer;
> +    QLIST_ENTRY(BdrvCopyState) list;
> +
> +    int64_t blocks;
> +    int64_t total_time;
> +
> +    char src_device_name[32];
> +    char dst_filename[1024];
> +    int commit_fd;
> +} BdrvCopyState;
> +
> +typedef struct BdrvCopyBlock {
> +    BdrvCopyState *state;
> +    uint8_t *buf;
> +    int64_t sector;
> +    int64_t nr_sectors;
> +    struct iovec iov;
> +    QEMUIOVector qiov;
> +    BlockDriverAIOCB *aiocb;
> +    int64_t time;
> +    QLIST_ENTRY(BdrvCopyBlock) list;
> +} BdrvCopyBlock;
> +
> +static QLIST_HEAD(, BdrvCopyState) block_copy_list =
> +    QLIST_HEAD_INITIALIZER(block_copy_list);
> +
> +static void alloc_aio_bitmap(BdrvCopyState *s)
> +{
> +    BlockDriverState *bs = s->src;
> +    int64_t bitmap_size;
> +
> +    bitmap_size = (bdrv_getlength(bs)>>  BDRV_SECTOR_BITS) +
> +            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> +    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
> +
> +    s->aio_bitmap = qemu_mallocz(bitmap_size);
> +}
> +
> +static bool aio_inflight(BdrvCopyState *s, int64_t sector)
> +{
> +    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    if (s->aio_bitmap&&
> +        (sector<<  BDRV_SECTOR_BITS)<  bdrv_getlength(s->src)) {
> +        return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)]&
> +            (1UL<<  (chunk % (sizeof(unsigned long) * 8))));
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
> +                             int nb_sectors, int set)
> +{
> +    int64_t start, end;
> +    unsigned long val, idx, bit;
> +
> +    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    for (; start<= end; start++) {
> +        idx = start / (sizeof(unsigned long) * 8);
> +        bit = start % (sizeof(unsigned long) * 8);
> +        val = s->aio_bitmap[idx];
> +        if (set) {
> +            if (!(val&  (1UL<<  bit))) {
> +                val |= 1UL<<  bit;
> +            }
> +        } else {
> +            if (val&  (1UL<<  bit)) {
> +                val&= ~(1UL<<  bit);
> +            }
> +        }
> +        s->aio_bitmap[idx] = val;
> +    }
> +}
> +
> +static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
> +{
> +    s->stage = stage;
> +
> +    switch (stage) {
> +    case STAGE_BULK:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
> +        break;
> +    case STAGE_BULK_FINISHED:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
> +        break;
> +    case STAGE_DIRTY:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
> +        break;
> +    case STAGE_SWITCH_FINISHED:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
> +        break;
> +    default:
> +        break;
> +    }
> +}
> +
> +static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
> +{
> +    s->error = ret;
> +    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
> +{
> +    s->blocks++;
> +    s->total_time += time;
> +}
> +
> +static void blk_copy_write_cb(void *opaque, int ret)
> +{
> +    BdrvCopyBlock *blk = opaque;
> +    BdrvCopyState *s = blk->state;
> +
> +    if (ret<  0) {
> +        QLIST_REMOVE(blk, list);
> +        qemu_free(blk->buf);
> +        qemu_free(blk);
> +        blk_copy_handle_cb_error(s, ret);
> +        return;
> +    }
> +
> +    QLIST_REMOVE(blk, list);
> +    add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
> +
> +    /* schedule switch to STAGE_DIRTY on last bulk write completion */
> +    if (blk->state->stage == STAGE_BULK_FINISHED) {
> +        qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +    }
> +
> +    if (blk->state->stage>  STAGE_BULK_FINISHED) {
> +        set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0);
> +    }
> +
> +    qemu_free(blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->state = s;
> +    blk->sector = read_blk->sector;
> +    blk->nr_sectors = read_blk->nr_sectors;
> +    blk->time = read_blk->time;
> +    blk->buf = read_blk->buf;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = read_blk->buf;
> +    blk->iov.iov_len = read_blk->iov.iov_len;
> +    qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
> +
> +    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
> +    blk->aiocb = bdrv_aio_writev(s->dst, blk->sector,&blk->qiov,
> +                                 blk->iov.iov_len / BDRV_SECTOR_SIZE,
> +                                 blk_copy_write_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(read_blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static void blk_copy_read_cb(void *opaque, int ret)
> +{
> +    BdrvCopyBlock *blk = opaque;
> +    BdrvCopyState *s = blk->state;
> +
> +    s->inflight_reads--;
> +    if (ret<  0) {
> +        QLIST_REMOVE(blk, list);
> +        qemu_free(blk->buf);
> +        qemu_free(blk);
> +        blk_copy_handle_cb_error(s, ret);
> +        return;
> +    }
> +    blk_copy_issue_write(s, blk);
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk);
> +    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
> +                                int nr_sectors)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->buf = qemu_mallocz(BLOCK_SIZE);
> +    blk->state = s;
> +    blk->sector = sector;
> +    blk->nr_sectors = nr_sectors;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = blk->buf;
> +    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
> +
> +    s->inflight_reads++;
> +    blk->time = qemu_get_clock_ns(rt_clock);
> +    blk->aiocb = bdrv_aio_readv(s->src, sector,&blk->qiov, nr_sectors,
> +                                blk_copy_read_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    s->inflight_reads--;
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static bool blkcopy_can_switch(BdrvCopyState *s)
> +{
> +    int64_t remaining_dirty;
> +    int64_t avg_transfer_time;
> +
> +    remaining_dirty = bdrv_get_dirty_count(s->src);
> +    if (remaining_dirty == 0 || s->blocks == 0) {
> +        return true;
> +    }
> +
> +    avg_transfer_time = s->total_time / s->blocks;
> +    if ((remaining_dirty * avg_transfer_time)<= migrate_max_downtime()) {
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static int blk_issue_reads_dirty(BdrvCopyState *s)
> +{
> +    int64_t sector;
> +
> +    for (sector = s->curr_sector; sector<  s->nr_sectors;) {
> +        if (bdrv_get_dirty(s->src, sector)&&  !aio_inflight(s, sector)) {
> +            int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
> +                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> +            blk_copy_issue_read(s, sector, nr_sectors);
> +            bdrv_reset_dirty(s->src, sector, nr_sectors);
> +            set_aio_inflight(s, sector, nr_sectors, 1);
> +            break;
> +        }
> +
> +        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
> +        s->curr_sector = sector;
> +    }
> +
> +    if (sector>= s->nr_sectors) {
> +        s->curr_sector = 0;
> +    }
> +    return 0;
> +}
> +
> +static int blk_issue_reads_bulk(BdrvCopyState *s)
> +{
> +    int nr_sectors;
> +    int64_t curr_sector = s->curr_sector;
> +
> +    if (s->shared_base) {
> +        while (curr_sector<  s->nr_sectors&&
> +                !bdrv_is_allocated(s->src, curr_sector,
> +                                   MAX_IS_ALLOCATED_SEARCH,&nr_sectors)) {
> +                curr_sector += nr_sectors;
> +        }
> +    }
> +
> +    if (curr_sector>= s->nr_sectors) {
> +        s->curr_sector = 0;
> +        return 1;
> +    }
> +
> +    curr_sector&= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
> +    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    blk_copy_issue_read(s, s->curr_sector, nr_sectors);
> +    s->curr_sector += nr_sectors;
> +    s->completed_sectors = curr_sector;
> +    return 0;
> +}
> +
> +static void blkcopy_finish(BdrvCopyState *s)
> +{
> +    int64_t sector;
> +    uint8_t *buf;
> +
> +    buf = qemu_malloc(BLOCK_SIZE);
> +
> +    /* FIXME: speed up loop, get_next_dirty_block? */
> +    for (sector = 0; sector<  s->nr_sectors;
> +         sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
> +        if (bdrv_get_dirty(s->src, sector)) {
> +            int nr_sectors = MIN(s->nr_sectors - sector,
> +                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> +            memset(buf, 0, BLOCK_SIZE);
> +            if (bdrv_read(s->src, sector, buf, nr_sectors)<  0) {
> +                goto error;
> +            }
> +            if (bdrv_write(s->dst, sector, buf, nr_sectors)<  0) {
> +                goto error;
> +            }
> +            bdrv_reset_dirty(s->src, sector, nr_sectors);
> +        }
> +
> +        if (bdrv_get_dirty_count(s->src) == 0)
> +            break;
> +    }
> +    qemu_free(buf);
> +    return;
> +
> +error:
> +    qemu_free(buf);
> +    s->error = 1;
> +}
> +
> +static int write_commit_file(BdrvCopyState *s)
> +{
> +    char commit_msg[1400];
> +    const char *buf = commit_msg;
> +    int len, ret;
> +
> +    sprintf(commit_msg, "commit QEMU block_copy %s ->  %s\n", s->src_device_name,
> +                        s->dst_filename);
> +
> +    len = strlen(commit_msg);
> +    while (len>  0) {
> +        ret = write(s->commit_fd, buf, len);
> +        if (ret == -1&&  errno == EINTR) {
> +            continue;
> +        }
> +        if (ret<= 0) {
> +            return -errno;
> +        }
> +        buf += ret;
> +        len -= ret;
> +    }
> +
> +    if (fsync(s->commit_fd) == -1) {
> +        return -errno;
> +    }
>
>    

This is more or less black magic.  What is this commit file used for and 
why aren't we using something like a QMP event?

Regards,

Anthony Liguori

  reply	other threads:[~2011-02-22 20:50 UTC|newest]

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-02-22 17:00 [Qemu-devel] [patch 0/3] live block copy (v2) Marcelo Tosatti
2011-02-22 17:00 ` [Qemu-devel] [patch 1/3] add migration_active function Marcelo Tosatti
2011-02-22 17:00 ` [Qemu-devel] [patch 2/3] Add support for live block copy Marcelo Tosatti
2011-02-22 20:50   ` Anthony Liguori [this message]
2011-02-22 21:07     ` [Qemu-devel] " Marcelo Tosatti
2011-02-22 21:11       ` Anthony Liguori
2011-02-22 23:09         ` Marcelo Tosatti
2011-02-22 23:14           ` Anthony Liguori
2011-02-23 13:01             ` Avi Kivity
2011-02-23 14:35               ` Anthony Liguori
2011-02-23 15:31                 ` Avi Kivity
2011-02-23 16:01                   ` Anthony Liguori
2011-02-23 16:14                     ` Avi Kivity
2011-02-23 16:28                       ` Anthony Liguori
2011-02-23 17:18                         ` Avi Kivity
2011-02-23 20:18                           ` Anthony Liguori
2011-02-23 20:44                             ` Marcelo Tosatti
2011-02-23 21:41                               ` Anthony Liguori
2011-02-24 14:39                                 ` Marcelo Tosatti
2011-02-24  7:37                             ` Markus Armbruster
2011-02-24  8:54                             ` Avi Kivity
2011-02-24 15:00                               ` Anthony Liguori
2011-02-24 15:22                                 ` Avi Kivity
2011-02-24 17:58                                   ` Anthony Liguori
2011-02-27  9:10                                     ` Avi Kivity
2011-02-27  9:55                                       ` Dor Laor
2011-02-27 13:49                                         ` Anthony Liguori
2011-02-27 16:02                                           ` Dor Laor
2011-02-27 17:25                                             ` Anthony Liguori
2011-02-28  8:58                                               ` Dor Laor
2011-02-27 14:00                                       ` Anthony Liguori
2011-02-27 15:31                                         ` Avi Kivity
2011-02-27 17:41                                           ` Anthony Liguori
2011-02-28  8:38                                             ` Avi Kivity
2011-02-28 12:45                                               ` Anthony Liguori
2011-02-28 13:21                                                 ` Avi Kivity
2011-02-28 17:33                                                   ` Anthony Liguori
2011-02-28 17:47                                                     ` Avi Kivity
2011-02-28 18:12                                                       ` Anthony Liguori
     [not found]                                                         ` <4D6CBECF.8090805@redhat.com>
     [not found]                                                         ` <4D6CB556.5060401@redhat.com>
2011-03-01  8:59                                                         ` Dor Laor
2011-03-02 12:39                                                           ` Anthony Liguori
2011-03-02 13:00                                                             ` Avi Kivity
2011-03-02 15:07                                                               ` Anthony Liguori
2011-03-01  9:39                                                         ` Avi Kivity
2011-03-01 15:51                                                           ` Anthony Liguori
2011-03-01 22:27                                                             ` Dor Laor
2011-03-02 16:30                                                             ` Avi Kivity
2011-03-02 21:55                                                               ` Anthony Liguori
2011-02-28 18:56                                                       ` Marcelo Tosatti
2011-03-01  9:45                                                         ` Avi Kivity
2011-02-23 16:17                     ` Peter Maydell
2011-02-23 16:30                       ` Anthony Liguori
2011-02-24  5:41                         ` [Qemu-devel] Unsubsribing James Brown
2011-02-24 10:00                           ` Stefan Hajnoczi
2011-02-23 17:26                   ` [Qemu-devel] Re: [patch 2/3] Add support for live block copy Markus Armbruster
2011-02-23 20:06                     ` Anthony Liguori
2011-02-24 12:15                       ` Markus Armbruster
2011-02-25  7:16                   ` Stefan Hajnoczi
2011-02-23 17:49               ` Marcelo Tosatti
2011-02-24  8:58                 ` Avi Kivity
2011-02-24 15:14                   ` Marcelo Tosatti
2011-02-24 15:28                     ` Avi Kivity
2011-02-24 16:39                       ` Marcelo Tosatti
2011-02-24 17:32                         ` Avi Kivity
2011-02-24 17:45                         ` Anthony Liguori
2011-02-27  9:22                           ` Avi Kivity
2011-02-23 12:46         ` Avi Kivity
2011-02-22 20:50   ` Anthony Liguori
2011-02-22 21:16   ` [Qemu-devel] " Anthony Liguori
2011-02-23 19:06   ` Anthony Liguori
2011-02-26  0:02     ` Marcelo Tosatti
2011-02-26 13:45       ` Anthony Liguori
2011-02-28 19:09         ` Marcelo Tosatti
2011-03-01  2:35         ` Marcelo Tosatti
2011-02-26 15:32       ` Anthony Liguori
2011-02-22 17:00 ` [Qemu-devel] [patch 3/3] do not allow migration if block copy in progress Marcelo Tosatti

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4D642181.4080509@codemonkey.ws \
    --to=anthony@codemonkey.ws \
    --cc=Jes.Sorensen@redhat.com \
    --cc=avi@redhat.com \
    --cc=mtosatti@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.