* [Qemu-devel] [PATCH 01/13] qemu-config: },{ -> }, { to please checkpatch.pl
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 02/13] block: add -drive copy-on-read=on|off Stefan Hajnoczi
` (13 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
The checkpatch.pl coding style checker complains that there should be a
space after the comma in },{. Make it easy for others to change
qemu-config.c in the future by complying with the coding style here.
A follow-up patch adds new options to qemu-config.c and therefore
benefits from this fix.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
qemu-config.c | 160 ++++++++++++++++++++++++++++----------------------------
1 files changed, 80 insertions(+), 80 deletions(-)
diff --git a/qemu-config.c b/qemu-config.c
index c63741c..7558fa8 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -12,70 +12,70 @@ static QemuOptsList qemu_drive_opts = {
.name = "bus",
.type = QEMU_OPT_NUMBER,
.help = "bus number",
- },{
+ }, {
.name = "unit",
.type = QEMU_OPT_NUMBER,
.help = "unit number (i.e. lun for scsi)",
- },{
+ }, {
.name = "if",
.type = QEMU_OPT_STRING,
.help = "interface (ide, scsi, sd, mtd, floppy, pflash, virtio)",
- },{
+ }, {
.name = "index",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "cyls",
.type = QEMU_OPT_NUMBER,
.help = "number of cylinders (ide disk geometry)",
- },{
+ }, {
.name = "heads",
.type = QEMU_OPT_NUMBER,
.help = "number of heads (ide disk geometry)",
- },{
+ }, {
.name = "secs",
.type = QEMU_OPT_NUMBER,
.help = "number of sectors (ide disk geometry)",
- },{
+ }, {
.name = "trans",
.type = QEMU_OPT_STRING,
.help = "chs translation (auto, lba. none)",
- },{
+ }, {
.name = "media",
.type = QEMU_OPT_STRING,
.help = "media type (disk, cdrom)",
- },{
+ }, {
.name = "snapshot",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "file",
.type = QEMU_OPT_STRING,
.help = "disk image",
- },{
+ }, {
.name = "cache",
.type = QEMU_OPT_STRING,
.help = "host cache usage (none, writeback, writethrough, unsafe)",
- },{
+ }, {
.name = "aio",
.type = QEMU_OPT_STRING,
.help = "host AIO implementation (threads, native)",
- },{
+ }, {
.name = "format",
.type = QEMU_OPT_STRING,
.help = "disk format (raw, qcow2, ...)",
- },{
+ }, {
.name = "serial",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "rerror",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "werror",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "addr",
.type = QEMU_OPT_STRING,
.help = "pci address (virtio only)",
- },{
+ }, {
.name = "readonly",
.type = QEMU_OPT_BOOL,
},
@@ -91,64 +91,64 @@ static QemuOptsList qemu_chardev_opts = {
{
.name = "backend",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "path",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "host",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "port",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "localaddr",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "localport",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "to",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "ipv4",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "ipv6",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "wait",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "server",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "delay",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "telnet",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "width",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "height",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "cols",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "rows",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "mux",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "signal",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "name",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "debug",
.type = QEMU_OPT_NUMBER,
},
@@ -245,10 +245,10 @@ static QemuOptsList qemu_rtc_opts = {
{
.name = "base",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "clock",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "driftfix",
.type = QEMU_OPT_STRING,
},
@@ -263,10 +263,10 @@ static QemuOptsList qemu_global_opts = {
{
.name = "driver",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "property",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "value",
.type = QEMU_OPT_STRING,
},
@@ -282,13 +282,13 @@ static QemuOptsList qemu_mon_opts = {
{
.name = "mode",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "chardev",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "default",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "pretty",
.type = QEMU_OPT_BOOL,
},
@@ -318,40 +318,40 @@ static QemuOptsList qemu_cpudef_opts = {
{
.name = "name",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "level",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "vendor",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "family",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "model",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "stepping",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "feature_edx", /* cpuid 0000_0001.edx */
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "feature_ecx", /* cpuid 0000_0001.ecx */
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "extfeature_edx", /* cpuid 8000_0001.edx */
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "extfeature_ecx", /* cpuid 8000_0001.ecx */
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "xlevel",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "model_id",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "vendor_override",
.type = QEMU_OPT_NUMBER,
},
@@ -366,73 +366,73 @@ QemuOptsList qemu_spice_opts = {
{
.name = "port",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "tls-port",
.type = QEMU_OPT_NUMBER,
- },{
+ }, {
.name = "addr",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "ipv4",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "ipv6",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "password",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "disable-ticketing",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "disable-copy-paste",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "sasl",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "x509-dir",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "x509-key-file",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "x509-key-password",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "x509-cert-file",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "x509-cacert-file",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "x509-dh-key-file",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "tls-ciphers",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "tls-channel",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "plaintext-channel",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "image-compression",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "jpeg-wan-compression",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "zlib-glz-wan-compression",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "streaming-video",
.type = QEMU_OPT_STRING,
- },{
+ }, {
.name = "agent-mouse",
.type = QEMU_OPT_BOOL,
- },{
+ }, {
.name = "playback-compression",
.type = QEMU_OPT_BOOL,
},
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 02/13] block: add -drive copy-on-read=on|off
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 01/13] qemu-config: },{ -> }, { to please checkpatch.pl Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 03/13] qed: replace is_write with flags field Stefan Hajnoczi
` (12 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
This patch adds the -drive copy-on-read=on|off command-line option:
copy-on-read=on|off
copy-on-read is "on" or "off" and enables whether to copy read backing
file sectors into the image file. Copy-on-read avoids accessing the
same backing file sectors repeatedly and is useful when the backing file
is over a slow network. By default copy-on-read is off.
The new BlockDriverState.copy_on_read field indicates whether
copy-on-read is enabled. Block drivers can use this as a hint to copy
sectors read from the backing file into the image file. The point of
copy-on-read is to avoid accessing the backing file again in the future.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block.c | 5 +++++
block.h | 1 +
block_int.h | 1 +
blockdev.c | 6 ++++++
hmp-commands.hx | 5 +++--
qemu-config.c | 3 +++
qemu-options.hx | 9 ++++++++-
7 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/block.c b/block.c
index 24a25d5..99068c9 100644
--- a/block.c
+++ b/block.c
@@ -430,6 +430,11 @@ static int bdrv_open_common(BlockDriverState *bs, const char *filename,
/* buffer_alignment defaulted to 512, drivers can change this value */
bs->buffer_alignment = 512;
+ bs->copy_on_read = 0;
+ if (flags & BDRV_O_RDWR) {
+ bs->copy_on_read = !!(flags & BDRV_O_COPY_ON_READ);
+ }
+
pstrcpy(bs->filename, sizeof(bs->filename), filename);
if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
diff --git a/block.h b/block.h
index da7d39c..deb19ec 100644
--- a/block.h
+++ b/block.h
@@ -34,6 +34,7 @@ typedef struct QEMUSnapshotInfo {
#define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */
#define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */
#define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */
+#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH)
diff --git a/block_int.h b/block_int.h
index fa91337..135625a 100644
--- a/block_int.h
+++ b/block_int.h
@@ -152,6 +152,7 @@ struct BlockDriverState {
int encrypted; /* if true, the media is encrypted */
int valid_key; /* if true, a valid encryption key has been set */
int sg; /* if true, the device is a /dev/sg* */
+ int copy_on_read; /* if true, copy read backing sectors into image */
/* event callback when inserting/removing */
void (*change_cb)(void *opaque, int reason);
void *change_opaque;
diff --git a/blockdev.c b/blockdev.c
index 1502575..9dbd2fa 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -237,6 +237,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
const char *devaddr;
DriveInfo *dinfo;
int snapshot = 0;
+ int copy_on_read;
int ret;
translation = BIOS_ATA_TRANSLATION_AUTO;
@@ -261,6 +262,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
snapshot = qemu_opt_get_bool(opts, "snapshot", 0);
ro = qemu_opt_get_bool(opts, "readonly", 0);
+ copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", 0);
file = qemu_opt_get(opts, "file");
serial = qemu_opt_get(opts, "serial");
@@ -521,6 +523,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
bdrv_flags |= (BDRV_O_SNAPSHOT|BDRV_O_CACHE_WB|BDRV_O_NO_FLUSH);
}
+ if (copy_on_read) {
+ bdrv_flags |= BDRV_O_COPY_ON_READ;
+ }
+
if (media == MEDIA_CDROM) {
/* CDROM is fine for any interface, don't check. */
ro = 1;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 6ad8806..f6cc724 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -863,9 +863,10 @@ ETEXI
.args_type = "pci_addr:s,opts:s",
.params = "[[<domain>:]<bus>:]<slot>\n"
"[file=file][,if=type][,bus=n]\n"
- "[,unit=m][,media=d][index=i]\n"
+ "[,unit=m][,media=d][,index=i]\n"
"[,cyls=c,heads=h,secs=s[,trans=t]]\n"
- "[snapshot=on|off][,cache=on|off]",
+ "[,snapshot=on|off][,cache=on|off]\n"
+ "[,readonly=on|off][,copy-on-read=on|off]",
.help = "add drive to PCI storage controller",
.mhandler.cmd = drive_hot_add,
},
diff --git a/qemu-config.c b/qemu-config.c
index 7558fa8..dafacb7 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -78,6 +78,9 @@ static QemuOptsList qemu_drive_opts = {
}, {
.name = "readonly",
.type = QEMU_OPT_BOOL,
+ }, {
+ .name = "copy-on-read",
+ .type = QEMU_OPT_BOOL,
},
{ /* end of list */ }
},
diff --git a/qemu-options.hx b/qemu-options.hx
index f2ef9a1..97abcd9 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -120,7 +120,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
" [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
" [,cache=writethrough|writeback|none|unsafe][,format=f]\n"
" [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
- " [,readonly=on|off]\n"
+ " [,readonly=on|off][,copy-on-read=on|off]\n"
" use 'file' as a drive image\n", QEMU_ARCH_ALL)
STEXI
@item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
@@ -160,6 +160,9 @@ an untrusted format header.
This option specifies the serial number to assign to the device.
@item addr=@var{addr}
Specify the controller's PCI address (if=virtio only).
+@item copy-on-read=@var{copy-on-read}
+@var{copy-on-read} is "on" or "off" and enables whether to copy read backing
+file sectors into the image file.
@end table
By default, writethrough caching is used for all block device. This means that
@@ -187,6 +190,10 @@ like your host losing power, the disk storage getting disconnected accidently,
etc. you're image will most probably be rendered unusable. When using
the @option{-snapshot} option, unsafe caching is always used.
+Copy-on-read avoids accessing the same backing file sectors repeatedly and is
+useful when the backing file is over a slow network. By default copy-on-read
+is off.
+
Instead of @option{-cdrom} you can use:
@example
qemu -drive file=file,index=2,media=cdrom
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 03/13] qed: replace is_write with flags field
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 01/13] qemu-config: },{ -> }, { to please checkpatch.pl Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 02/13] block: add -drive copy-on-read=on|off Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 04/13] qed: extract qed_start_allocating_write() Stefan Hajnoczi
` (11 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
Per-request attributes like read/write are currently implemented as bool
fields in the QEDAIOCB struct. This becomes unwieldy as the number of
attributes grows. For example, the qed_aio_setup() function would have
to take multiple bool arguments and at call sites it would be hard to
distinguish the meaning of each bool.
Instead use a flags field with bitmask constants. This will be used
when the copy-on-write and check for zeroes attributes are introduced.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block/qed.c | 15 ++++++++-------
block/qed.h | 6 +++++-
trace-events | 2 +-
3 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/block/qed.c b/block/qed.c
index 3970379..565bbc1 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1254,8 +1254,8 @@ static void qed_aio_next_io(void *opaque, int ret)
{
QEDAIOCB *acb = opaque;
BDRVQEDState *s = acb_to_s(acb);
- QEDFindClusterFunc *io_fn =
- acb->is_write ? qed_aio_write_data : qed_aio_read_data;
+ QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
+ qed_aio_write_data : qed_aio_read_data;
trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size);
@@ -1285,14 +1285,14 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
int64_t sector_num,
QEMUIOVector *qiov, int nb_sectors,
BlockDriverCompletionFunc *cb,
- void *opaque, bool is_write)
+ void *opaque, int flags)
{
QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors,
- opaque, is_write);
+ opaque, flags);
- acb->is_write = is_write;
+ acb->flags = flags;
acb->finished = NULL;
acb->qiov = qiov;
acb->qiov_offset = 0;
@@ -1312,7 +1312,7 @@ static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
BlockDriverCompletionFunc *cb,
void *opaque)
{
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, false);
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
}
static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
@@ -1321,7 +1321,8 @@ static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
BlockDriverCompletionFunc *cb,
void *opaque)
{
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, true);
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QED_AIOCB_WRITE);
}
static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs,
diff --git a/block/qed.h b/block/qed.h
index 388fdb3..dbc00be 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -123,12 +123,16 @@ typedef struct QEDRequest {
CachedL2Table *l2_table;
} QEDRequest;
+enum {
+ QED_AIOCB_WRITE = 0x0001, /* read or write? */
+};
+
typedef struct QEDAIOCB {
BlockDriverAIOCB common;
QEMUBH *bh;
int bh_ret; /* final return status for completion bh */
QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */
- bool is_write; /* false - read, true - write */
+ int flags; /* QED_AIOCB_* bits ORed together */
bool *finished; /* signal for cancel completion */
uint64_t end_pos; /* request end on block device, in bytes */
diff --git a/trace-events b/trace-events
index e0e9574..6e1a19f 100644
--- a/trace-events
+++ b/trace-events
@@ -233,7 +233,7 @@ disable qed_need_check_timer_cb(void *s) "s %p"
disable qed_start_need_check_timer(void *s) "s %p"
disable qed_cancel_need_check_timer(void *s) "s %p"
disable qed_aio_complete(void *s, void *acb, int ret) "s %p acb %p ret %d"
-disable qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int is_write) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p is_write %d"
+disable qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int flags) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p flags %#x"
disable qed_aio_next_io(void *s, void *acb, int ret, uint64_t cur_pos) "s %p acb %p ret %d cur_pos %"PRIu64""
disable qed_aio_read_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
disable qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 04/13] qed: extract qed_start_allocating_write()
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (2 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 03/13] qed: replace is_write with flags field Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 05/13] qed: make qed_aio_write_alloc() reusable Stefan Hajnoczi
` (10 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
Copy-on-read requests are a form of allocating write and will need to be
queued like other allocating writes. This patch extracts the request
queuing code for allocating writes so that it can be reused for
copy-on-read in a later patch.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block/qed.c | 32 ++++++++++++++++++++++++++------
1 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/block/qed.c b/block/qed.c
index 565bbc1..cc193ad 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1097,14 +1097,15 @@ static bool qed_should_set_need_check(BDRVQEDState *s)
}
/**
- * Write new data cluster
+ * Start an allocating write request or queue it
*
- * @acb: Write request
- * @len: Length in bytes
+ * @ret: true if request can proceed, false if queued
*
- * This path is taken when writing to previously unallocated clusters.
+ * If a request is queued this function returns false and the caller should
+ * return. When it becomes time for the request to proceed the qed_aio_next()
+ * function will be called.
*/
-static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+static bool qed_start_allocating_write(QEDAIOCB *acb)
{
BDRVQEDState *s = acb_to_s(acb);
@@ -1119,7 +1120,26 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
}
if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
s->allocating_write_reqs_plugged) {
- return; /* wait for existing request to finish */
+ return false;
+ }
+ return true;
+}
+
+/**
+ * Write new data cluster
+ *
+ * @acb: Write request
+ * @len: Length in bytes
+ *
+ * This path is taken when writing to previously unallocated clusters.
+ */
+static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverCompletionFunc *cb;
+
+ if (!qed_start_allocating_write(acb)) {
+ return;
}
acb->cur_nclusters = qed_bytes_to_clusters(s,
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 05/13] qed: make qed_aio_write_alloc() reusable
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (3 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 04/13] qed: extract qed_start_allocating_write() Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 06/13] qed: add support for copy-on-read Stefan Hajnoczi
` (9 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
Copy-on-read requests will share the allocating write code path. This
requires making qed_aio_write_alloc() reusable outside of a write
request. This patch ensures that iovec setup is performed in a common
place before qed_aio_write_alloc() is called.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block/qed.c | 53 +++++++++++++++--------------------------------------
1 files changed, 15 insertions(+), 38 deletions(-)
diff --git a/block/qed.c b/block/qed.c
index cc193ad..4f535aa 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1133,19 +1133,18 @@ static bool qed_start_allocating_write(QEDAIOCB *acb)
*
* This path is taken when writing to previously unallocated clusters.
*/
-static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
+static void qed_aio_write_alloc(QEDAIOCB *acb)
{
BDRVQEDState *s = acb_to_s(acb);
- BlockDriverCompletionFunc *cb;
if (!qed_start_allocating_write(acb)) {
- return;
+ qemu_iovec_reset(&acb->cur_qiov);
+ return; /* wait until current allocating write completes */
}
acb->cur_nclusters = qed_bytes_to_clusters(s,
- qed_offset_into_cluster(s, acb->cur_pos) + len);
+ qed_offset_into_cluster(s, acb->cur_pos) + acb->cur_qiov.size);
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
- qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
if (qed_should_set_need_check(s)) {
s->header.features |= QED_F_NEED_CHECK;
@@ -1156,25 +1155,6 @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
}
/**
- * Write data cluster in place
- *
- * @acb: Write request
- * @offset: Cluster offset in bytes
- * @len: Length in bytes
- *
- * This path is taken when writing to already allocated clusters.
- */
-static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len)
-{
- /* Calculate the I/O vector */
- acb->cur_cluster = offset;
- qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
-
- /* Do the actual write */
- qed_aio_write_main(acb, 0);
-}
-
-/**
* Write data cluster
*
* @opaque: Write request
@@ -1192,22 +1172,19 @@ static void qed_aio_write_data(void *opaque, int ret,
trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len);
- acb->find_cluster_ret = ret;
-
- switch (ret) {
- case QED_CLUSTER_FOUND:
- qed_aio_write_inplace(acb, offset, len);
- break;
+ if (ret < 0) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
- case QED_CLUSTER_L2:
- case QED_CLUSTER_L1:
- case QED_CLUSTER_ZERO:
- qed_aio_write_alloc(acb, len);
- break;
+ acb->find_cluster_ret = ret;
+ qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
- default:
- qed_aio_complete(acb, ret);
- break;
+ if (ret == QED_CLUSTER_FOUND) {
+ acb->cur_cluster = offset;
+ qed_aio_write_main(acb, 0);
+ } else {
+ qed_aio_write_alloc(acb);
}
}
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 06/13] qed: add support for copy-on-read
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (4 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 05/13] qed: make qed_aio_write_alloc() reusable Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 07/13] qed: avoid deadlock on emulated synchronous I/O Stefan Hajnoczi
` (8 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
From: Anthony Liguori <aliguori@us.ibm.com>
This patch implements copy-on-read in QED. Once a read request reaches
the copy-on-read state it adds itself to the allocating write queue in
order to avoid race conditions with write requests.
If an allocating write request manages to sneak in before the
copy-on-read request, then the copy-on-read will notice that the cluster
has been allocated when qed_find_cluster() is re-run. This works
because only one allocating request is active at any time and when the
next request is activated it will re-run qed_find_cluster().
[Originally by Anthony. Stefan added allocating write queuing and
factored out the QED_CF_COPY_ON_READ header flag.]
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block/qed.c | 35 +++++++++++++++++++++++++++++++++--
block/qed.h | 3 ++-
trace-events | 1 +
3 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/block/qed.c b/block/qed.c
index 4f535aa..6ca57f2 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1189,6 +1189,25 @@ static void qed_aio_write_data(void *opaque, int ret,
}
/**
+ * Copy on read callback
+ *
+ * Write data from backing file to QED that's been read if CoR is enabled.
+ */
+static void qed_copy_on_read_cb(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+
+ trace_qed_copy_on_read_cb(acb, ret);
+
+ if (ret < 0) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ qed_aio_write_alloc(acb);
+}
+
+/**
* Read data cluster
*
* @opaque: Read request
@@ -1216,6 +1235,7 @@ static void qed_aio_read_data(void *opaque, int ret,
goto err;
}
+ acb->find_cluster_ret = ret;
qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
/* Handle zero cluster and backing file reads */
@@ -1224,8 +1244,17 @@ static void qed_aio_read_data(void *opaque, int ret,
qed_aio_next_io(acb, 0);
return;
} else if (ret != QED_CLUSTER_FOUND) {
+ BlockDriverCompletionFunc *cb = qed_aio_next_io;
+
+ if (bs->backing_hd && (acb->flags & QED_AIOCB_COPY_ON_READ)) {
+ if (!qed_start_allocating_write(acb)) {
+ qemu_iovec_reset(&acb->cur_qiov);
+ return; /* wait for current allocating write to complete */
+ }
+ cb = qed_copy_on_read_cb;
+ }
qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
- qed_aio_next_io, acb);
+ cb, acb);
return;
}
@@ -1309,7 +1338,9 @@ static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
BlockDriverCompletionFunc *cb,
void *opaque)
{
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+ int flags = bs->copy_on_read ? QED_AIOCB_COPY_ON_READ : 0;
+
+ return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, flags);
}
static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
diff --git a/block/qed.h b/block/qed.h
index dbc00be..16f4bd9 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -124,7 +124,8 @@ typedef struct QEDRequest {
} QEDRequest;
enum {
- QED_AIOCB_WRITE = 0x0001, /* read or write? */
+ QED_AIOCB_WRITE = 0x0001, /* read or write? */
+ QED_AIOCB_COPY_ON_READ = 0x0002,
};
typedef struct QEDAIOCB {
diff --git a/trace-events b/trace-events
index 6e1a19f..10faa07 100644
--- a/trace-events
+++ b/trace-events
@@ -236,6 +236,7 @@ disable qed_aio_complete(void *s, void *acb, int ret) "s %p acb %p ret %d"
disable qed_aio_setup(void *s, void *acb, int64_t sector_num, int nb_sectors, void *opaque, int flags) "s %p acb %p sector_num %"PRId64" nb_sectors %d opaque %p flags %#x"
disable qed_aio_next_io(void *s, void *acb, int ret, uint64_t cur_pos) "s %p acb %p ret %d cur_pos %"PRIu64""
disable qed_aio_read_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
+disable qed_copy_on_read_cb(void *acb, int ret) "acb %p ret %d"
disable qed_aio_write_data(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
disable qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64""
disable qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64""
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 07/13] qed: avoid deadlock on emulated synchronous I/O
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (5 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 06/13] qed: add support for copy-on-read Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 08/13] qerror: add qerror_from_args() to create qerror objects Stefan Hajnoczi
` (7 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
The block layer emulates synchronous bdrv_read()/bdrv_write() for
drivers that only provide the asynchronous interfaces. The emulation
issues an asynchronous request inside a new "async context" and waits
for that request to complete. If currently outstanding requests
complete during this time, their completion functions are not invoked
until the async context is popped again.
This can lead to deadlock if an allocating write is being processed when
synchronous I/O emulation starts. The emulated synchronous write will
be queued because an existing request is being processed. But the
existing request cannot complete until the async context is popped.
The result is that qemu_aio_wait() sits in a deadlock.
Address this problem in two ways:
1. Add an assertion so that we instantly know if this corner case is
hit. This saves us time by giving a clear failure indication.
2. Ignore the copy-on-read hint for emulated synchronous reads. This
allows us to do emulated synchronous reads without hitting the
deadlock.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block/qed.c | 12 +++++++++++-
1 files changed, 11 insertions(+), 1 deletions(-)
diff --git a/block/qed.c b/block/qed.c
index 6ca57f2..ffdbc2d 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1120,6 +1120,14 @@ static bool qed_start_allocating_write(QEDAIOCB *acb)
}
if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) ||
s->allocating_write_reqs_plugged) {
+ /* Queuing an emulated synchronous write causes deadlock since
+ * currently outstanding requests are not in the current async context
+ * and their completion will never be invoked. Once the block layer
+ * moves to truly asynchronous semantics this failure case will be
+ * eliminated.
+ */
+ assert(get_async_context_id() == 0);
+
return false;
}
return true;
@@ -1246,7 +1254,9 @@ static void qed_aio_read_data(void *opaque, int ret,
} else if (ret != QED_CLUSTER_FOUND) {
BlockDriverCompletionFunc *cb = qed_aio_next_io;
- if (bs->backing_hd && (acb->flags & QED_AIOCB_COPY_ON_READ)) {
+ /* See qed_start_allocating_write() for get_async_context_id() hack */
+ if (bs->backing_hd && (acb->flags & QED_AIOCB_COPY_ON_READ) &&
+ get_async_context_id() == 0) {
if (!qed_start_allocating_write(acb)) {
qemu_iovec_reset(&acb->cur_qiov);
return; /* wait for current allocating write to complete */
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 08/13] qerror: add qerror_from_args() to create qerror objects
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (6 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 07/13] qed: avoid deadlock on emulated synchronous I/O Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 09/13] block: add bdrv_aio_copy_backing() Stefan Hajnoczi
` (6 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
There is no convenient way to create a QError object without reporting
it immediately. This patch adds the qerror_from_args() function to make
it easy to create an error in qerror_report() style but without
reporting it.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
qerror.c | 21 +++++++++++++++++++++
qerror.h | 6 ++++++
2 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/qerror.c b/qerror.c
index d7fcd93..64b41ca 100644
--- a/qerror.c
+++ b/qerror.c
@@ -443,6 +443,27 @@ void qerror_print(QError *qerror)
QDECREF(qstring);
}
+/**
+ * qerror_from_args_internal(): Create a new QError
+ *
+ * This is a convenience wrapper of the qerror_from_info() function for when a
+ * va_list is not available.
+ *
+ * Return a strong reference.
+ */
+QError *qerror_from_args_internal(const char *file, int linenr,
+ const char *func, const char *fmt, ...)
+{
+ va_list va;
+ QError *qerror;
+
+ va_start(va, fmt);
+ qerror = qerror_from_info(file, linenr, func, fmt, &va);
+ va_end(va);
+
+ return qerror;
+}
+
void qerror_report_internal(const char *file, int linenr, const char *func,
const char *fmt, ...)
{
diff --git a/qerror.h b/qerror.h
index 16c830d..173c84f 100644
--- a/qerror.h
+++ b/qerror.h
@@ -35,6 +35,12 @@ typedef struct QError {
QError *qerror_new(void);
QError *qerror_from_info(const char *file, int linenr, const char *func,
const char *fmt, va_list *va) GCC_FMT_ATTR(4, 0);
+QError *qerror_from_args_internal(const char *file, int linenr,
+ const char *func, const char *fmt,
+ ...) GCC_FMT_ATTR(4, 0);
+#define qerror_from_args(fmt, ...) \
+ qerror_from_args_internal(__FILE__, __LINE__, __func__, \
+ fmt, ## __VA_ARGS__)
QString *qerror_human(const QError *qerror);
void qerror_print(QError *qerror);
void qerror_report_internal(const char *file, int linenr, const char *func,
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 09/13] block: add bdrv_aio_copy_backing()
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (7 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 08/13] qerror: add qerror_from_args() to create qerror objects Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 10/13] qmp: add QMP support for stream commands Stefan Hajnoczi
` (5 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
From: Anthony Liguori <aliguori@us.ibm.com>
Add the bdrv_aio_copy_backing() function to the BlockDriver interface.
This function copies unallocated sectors from the backing file and can
be used to implement image streaming.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block.c | 37 +++++++++++++++++++++++++++++++++++++
block.h | 5 +++++
block_int.h | 2 ++
3 files changed, 44 insertions(+), 0 deletions(-)
diff --git a/block.c b/block.c
index 99068c9..4da35be 100644
--- a/block.c
+++ b/block.c
@@ -2221,6 +2221,43 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
return ret;
}
+/**
+ * Attempt to copy unallocated sectors from backing file.
+ *
+ * @sector_num - the first sector to start from
+ * @cb - completion callback
+ * @opaque - data to pass completion callback
+ *
+ * Returns NULL if the image format does not support the operation, the image is
+ * read-only, or no image is open.
+ *
+ * The intention of this function is for a user to execute it once with a
+ * sector_num of 0 and then upon receiving a completion callback, to remember
+ * the number of sectors copied, and then to call this function again with
+ * an offset adjusted by the number of sectors previously copied.
+ *
+ * This allows a user to progressively stream in an image at a pace that makes
+ * sense. In general, this function tries to do the smallest amount of I/O
+ * possible to do some useful work.
+ *
+ * This function only really makes sense in combination with a block format
+ * that supports copy on read and has it enabled. If copy on read is not
+ * enabled, a block format driver may return NULL.
+ *
+ * If an I/O error occurs the completion callback is invoked with -errno in the
+ * nb_sectors argument.
+ */
+BlockDriverAIOCB *bdrv_aio_copy_backing(BlockDriverState *bs,
+ int64_t sector_num,
+ BlockDriverCopyBackingCB *cb,
+ void *opaque)
+{
+ if (!bs->drv || bs->read_only || !bs->drv->bdrv_aio_copy_backing) {
+ return NULL;
+ }
+
+ return bs->drv->bdrv_aio_copy_backing(bs, sector_num, cb, opaque);
+}
typedef struct MultiwriteCB {
int error;
diff --git a/block.h b/block.h
index deb19ec..b5de366 100644
--- a/block.h
+++ b/block.h
@@ -112,6 +112,7 @@ typedef struct BlockDriverAIOCB BlockDriverAIOCB;
typedef void BlockDriverCompletionFunc(void *opaque, int ret);
typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector,
int sector_num);
+typedef void BlockDriverCopyBackingCB(void *opaque, int nb_sectors);
BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector *iov, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
@@ -120,6 +121,10 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
BlockDriverCompletionFunc *cb, void *opaque);
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_copy_backing(BlockDriverState *bs,
+ int64_t sector_num,
+ BlockDriverCopyBackingCB *cb,
+ void *opaque);
void bdrv_aio_cancel(BlockDriverAIOCB *acb);
typedef struct BlockRequest {
diff --git a/block_int.h b/block_int.h
index 135625a..327efb3 100644
--- a/block_int.h
+++ b/block_int.h
@@ -73,6 +73,8 @@ struct BlockDriver {
BlockDriverCompletionFunc *cb, void *opaque);
BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverAIOCB *(*bdrv_aio_copy_backing)(BlockDriverState *bs,
+ int64_t sector_num, BlockDriverCopyBackingCB *cb, void *opaque);
int (*bdrv_discard)(BlockDriverState *bs, int64_t sector_num,
int nb_sectors);
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 10/13] qmp: add QMP support for stream commands
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (8 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 09/13] block: add bdrv_aio_copy_backing() Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 11/13] block: add -drive stream=on|off Stefan Hajnoczi
` (4 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
From: Anthony Liguori <aliguori@us.ibm.com>
For leaf images with copy-on-read semantics, the stream commands allow the user
to populate local blocks by manually streaming them from the backing image.
Once all blocks have been streamed, the dependency on the original backing
image can be removed. Therefore, stream commands can be used to implement
post-copy live block migration and rapid deployment.
The block_stream command can be used to stream a single cluster, to
start streaming the entire device, and to cancel an active stream. It
is easiest to allow the block_stream command to manage streaming for the
entire device but a management tool could use single cluster mode to
throttle the I/O rate.
The command synopses are as follows:
block_stream
------------
Copy data from a backing file into a block device.
If the optional 'all' argument is true, this operation is performed in the
background until the entire backing file has been copied. The status of
ongoing block_stream operations can be checked with query-block-stream.
Arguments:
- all: copy entire device (json-bool, optional)
- stop: stop copying to device (json-bool, optional)
- device: device name (json-string)
Return:
- device: device name (json-string)
- len: size of the device, in bytes (json-int)
- offset: ending offset of the completed I/O, in bytes (json-int)
Examples:
-> { "execute": "block_stream", "arguments": { "device": "virtio0" } }
<- { "return": { "device": "virtio0", "len": 10737418240, "offset": 512 } }
-> { "execute": "block_stream", "arguments": { "all": true, "device": "virtio0" } }
<- { "return": {} }
-> { "execute": "block_stream", "arguments": { "stop": true, "device": "virtio0" } }
<- { "return": {} }
query-block-stream
------------------
Show progress of ongoing block_stream operations.
Return a json-array of all operations. If no operation is active then an empty
array will be returned. Each operation is a json-object with the following
data:
- device: device name (json-string)
- len: size of the device, in bytes (json-int)
- offset: ending offset of the completed I/O, in bytes (json-int)
Example:
-> { "execute": "query-block-stream" }
<- { "return":[
{ "device": "virtio0", "len": 10737418240, "offset": 709632}
]
}
The naming of these commands is slightly odd but consistent with
existing block and query commands. The block_stream command follows the
naming of block_resize, block_passwd, and others. The
query-block-stream command follows the hyphenated naming of QMP query
commands like query-block.
Signed-off-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
blockdev.c | 270 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
blockdev.h | 6 ++
hmp-commands.hx | 18 ++++
monitor.c | 23 +++++
monitor.h | 1 +
qerror.c | 9 ++
qerror.h | 6 ++
qmp-commands.hx | 68 ++++++++++++++
8 files changed, 401 insertions(+), 0 deletions(-)
diff --git a/blockdev.c b/blockdev.c
index 9dbd2fa..ffbc45e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -16,6 +16,7 @@
#include "sysemu.h"
#include "hw/qdev.h"
#include "block_int.h"
+#include "qjson.h"
static QTAILQ_HEAD(drivelist, DriveInfo) drives = QTAILQ_HEAD_INITIALIZER(drives);
@@ -50,6 +51,197 @@ static const int if_max_devs[IF_COUNT] = {
[IF_SCSI] = 7,
};
+typedef struct StreamState {
+ MonitorCompletion *cb;
+ void *cb_opaque;
+ MonitorCompletion *cancel_cb;
+ void *cancel_opaque;
+ int64_t offset; /* current position in block device */
+ bool running; /* prevent re-entrancy */
+ bool once; /* only stream for one iteration */
+ BlockDriverState *bs;
+ QEMUTimer *timer;
+ uint64_t stream_delay;
+ QLIST_ENTRY(StreamState) list;
+} StreamState;
+
+static QLIST_HEAD(, StreamState) block_streams =
+ QLIST_HEAD_INITIALIZER(block_streams);
+
+static QObject *stream_get_qobject(StreamState *s)
+{
+ const char *name = bdrv_get_device_name(s->bs);
+ int64_t len = bdrv_getlength(s->bs);
+
+ return qobject_from_jsonf("{ 'device': %s, 'offset': %" PRId64 ", "
+ "'len': %" PRId64 " }", name, s->offset, len);
+}
+
+static void stream_mon_event(StreamState *s, int ret)
+{
+ QObject *data = stream_get_qobject(s);
+
+ if (ret < 0) {
+ QError *qerror = qerror_from_args(QERR_STREAMING_ERROR,
+ strerror(-ret));
+
+ qdict_put(qobject_to_qdict(data), "error", qerror);
+ }
+
+ monitor_protocol_event(QEVENT_BLOCK_STREAM_COMPLETED, data);
+ qobject_decref(data);
+}
+
+static void stream_free(StreamState *s)
+{
+ QLIST_REMOVE(s, list);
+
+ if (s->cancel_cb) {
+ s->cancel_cb(s->cancel_opaque, NULL);
+ }
+
+ qemu_del_timer(s->timer);
+ qemu_free_timer(s->timer);
+ free(s);
+}
+
+static void stream_invoke_monitor_cb(StreamState *s)
+{
+ QObject *qobject = stream_get_qobject(s);
+
+ s->cb(s->cb_opaque, qobject);
+ qobject_decref(qobject);
+}
+
+static void stream_complete(StreamState *s, int ret)
+{
+ if (ret < 0) {
+ /* Error return if we have a callback, otherwise generate an event */
+ if (s->cb) {
+ qerror_report(QERR_STREAMING_ERROR, strerror(-ret));
+ } else {
+ stream_mon_event(s, ret);
+ }
+ } else {
+ /* Always generate event on successful completion */
+ stream_mon_event(s, ret);
+ }
+
+ if (s->cb) {
+ stream_invoke_monitor_cb(s);
+ }
+ stream_free(s);
+}
+
+static void stream_cb(void *opaque, int nb_sectors)
+{
+ StreamState *s = opaque;
+
+ if (nb_sectors < 0) {
+ stream_complete(s, nb_sectors);
+ return;
+ }
+
+ s->offset += nb_sectors * BDRV_SECTOR_SIZE;
+
+ if (s->offset == bdrv_getlength(s->bs)) {
+ bdrv_change_backing_file(s->bs, NULL, NULL);
+ stream_complete(s, 0);
+ } else if (s->once) {
+ assert(s->cb);
+ stream_invoke_monitor_cb(s);
+ s->running = false;
+ } else if (s->cancel_cb) {
+ stream_free(s);
+ } else {
+ qemu_mod_timer(s->timer, qemu_get_clock_ns(rt_clock) +
+ s->stream_delay);
+ }
+}
+
+/* We can't call bdrv_aio_stream() directly from the callback because that
+ * makes qemu_aio_flush() not complete until the streaming is completed.
+ * By delaying with a timer, we give qemu_aio_flush() a chance to complete.
+ */
+static void stream_next_iteration(void *opaque)
+{
+ StreamState *s = opaque;
+
+ bdrv_aio_copy_backing(s->bs, s->offset / BDRV_SECTOR_SIZE, stream_cb, s);
+}
+
+static StreamState *stream_find(const char *device)
+{
+ StreamState *s;
+
+ QLIST_FOREACH(s, &block_streams, list) {
+ if (strcmp(bdrv_get_device_name(s->bs), device) == 0) {
+ return s;
+ }
+ }
+ return NULL;
+}
+
+static StreamState *stream_start(const char *device, bool once,
+ MonitorCompletion cb, void *opaque)
+{
+ StreamState *s;
+ BlockDriverAIOCB *acb;
+
+ s = stream_find(device);
+ if (s && s->running) {
+ qerror_report(QERR_DEVICE_IN_USE, device);
+ return NULL;
+ }
+
+ /* Create a new stream, if necessary */
+ if (!s) {
+ BlockDriverState *bs = bdrv_find(device);
+ if (!bs) {
+ qerror_report(QERR_DEVICE_NOT_FOUND, device);
+ return NULL;
+ }
+
+ s = qemu_mallocz(sizeof(*s));
+ s->bs = bs;
+ s->timer = qemu_new_timer_ns(rt_clock, stream_next_iteration, s);
+ QLIST_INSERT_HEAD(&block_streams, s, list);
+ }
+
+ s->running = true;
+ s->once = once;
+ s->cb = once ? cb : NULL;
+ s->cb_opaque = opaque;
+ s->stream_delay = 0; /* FIXME make this configurable */
+
+ acb = bdrv_aio_copy_backing(s->bs, s->offset / BDRV_SECTOR_SIZE,
+ stream_cb, s);
+ if (acb == NULL) {
+ stream_free(s);
+ qerror_report(QERR_NOT_SUPPORTED);
+ return NULL;
+ }
+ return s;
+}
+
+static int stream_stop(const char *device, MonitorCompletion *cb, void *opaque)
+{
+ StreamState *s = stream_find(device);
+
+ if (!s || s->once) {
+ qerror_report(QERR_DEVICE_NOT_ACTIVE, device);
+ return -1;
+ }
+ if (s->cancel_cb) {
+ qerror_report(QERR_DEVICE_IN_USE, device);
+ return -1;
+ }
+
+ s->cancel_cb = cb;
+ s->cancel_opaque = opaque;
+ return 0;
+}
+
/*
* We automatically delete the drive when a device using it gets
* unplugged. Questionable feature, but we can't just drop it.
@@ -654,6 +846,84 @@ out:
return ret;
}
+void monitor_print_block_stream(Monitor *mon, const QObject *data)
+{
+ QDict *stream;
+
+ assert(data);
+ stream = qobject_to_qdict(data);
+
+ monitor_printf(mon, "Streaming device %s: Completed %" PRId64 " of %"
+ PRId64 " bytes\n", qdict_get_str(stream, "device"),
+ qdict_get_int(stream, "offset"),
+ qdict_get_int(stream, "len"));
+}
+
+void monitor_print_block_stream_info(Monitor *mon, const QObject *data)
+{
+ QList *streams;
+ QListEntry *entry;
+
+ assert(data);
+ streams = qobject_to_qlist(data);
+ assert(streams); /* we pass a list of stream objects to ourselves */
+
+ if (qlist_empty(streams)) {
+ monitor_printf(mon, "No active stream\n");
+ return;
+ }
+
+ QLIST_FOREACH_ENTRY(streams, entry) {
+ monitor_print_block_stream(mon, entry->value);
+ }
+}
+
+int do_block_stream_info(Monitor *mon, MonitorCompletion *cb, void *opaque)
+{
+ QList *streams = qlist_new();
+ StreamState *s;
+
+ QLIST_FOREACH(s, &block_streams, list) {
+ if (!s->once) {
+ qlist_append_obj(streams, stream_get_qobject(s));
+ }
+ }
+
+ cb(opaque, QOBJECT(streams));
+ QDECREF(streams);
+ return 0;
+}
+
+int do_block_stream(Monitor *mon, const QDict *params,
+ MonitorCompletion cb, void *opaque)
+{
+ int all = qdict_get_try_bool(params, "all", false);
+ int stop = qdict_get_try_bool(params, "stop", false);
+ const char *device = qdict_get_str(params, "device");
+ StreamState *s;
+
+ if (all && stop) {
+ qerror_report(QERR_INVALID_PARAMETER, "stop' not allowed with 'all");
+ return -1;
+ }
+
+ if (stop) {
+ return stream_stop(device, cb, opaque);
+ } else if (all) {
+ s = stream_start(device, false, NULL, NULL);
+ if (!s) {
+ return -1;
+ }
+ cb(opaque, NULL);
+ } else {
+ s = stream_start(device, true, cb, opaque);
+ if (!s) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
static int eject_device(Monitor *mon, BlockDriverState *bs, int force)
{
if (!force) {
diff --git a/blockdev.h b/blockdev.h
index 3587786..e246c81 100644
--- a/blockdev.h
+++ b/blockdev.h
@@ -12,6 +12,7 @@
#include "block.h"
#include "qemu-queue.h"
+#include "monitor.h"
void blockdev_mark_auto_del(BlockDriverState *bs);
void blockdev_auto_del(BlockDriverState *bs);
@@ -65,5 +66,10 @@ int do_change_block(Monitor *mon, const char *device,
int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data);
int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data);
int do_block_resize(Monitor *mon, const QDict *qdict, QObject **ret_data);
+void monitor_print_block_stream(Monitor *mon, const QObject *data);
+void monitor_print_block_stream_info(Monitor *mon, const QObject *data);
+int do_block_stream_info(Monitor *mon, MonitorCompletion *cb, void *opaque);
+int do_block_stream(Monitor *mon, const QDict *params,
+ MonitorCompletion cb, void *opaque);
#endif
diff --git a/hmp-commands.hx b/hmp-commands.hx
index f6cc724..e78a1f8 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -38,6 +38,22 @@ Commit changes to the disk images (if -snapshot is used) or backing files.
ETEXI
{
+ .name = "block_stream",
+ .args_type = "all:-a,stop:-s,device:B",
+ .params = "[-a] [-s] device",
+ .help = "Stream data to a block device",
+ .user_print = monitor_print_block_stream,
+ .mhandler.cmd_async = do_block_stream,
+ .flags = MONITOR_CMD_ASYNC,
+ },
+
+STEXI
+@item block_stream
+@findex block_stream
+Copy data from a backing file into a block device.
+ETEXI
+
+ {
.name = "q|quit",
.args_type = "",
.params = "",
@@ -1354,6 +1370,8 @@ show device tree
show qdev device model list
@item info roms
show roms
+@item info block-stream
+show progress of ongoing block_stream operations
@end table
ETEXI
diff --git a/monitor.c b/monitor.c
index 6af6a4d..f9ee743 100644
--- a/monitor.c
+++ b/monitor.c
@@ -468,6 +468,9 @@ void monitor_protocol_event(MonitorEvent event, QObject *data)
case QEVENT_SPICE_DISCONNECTED:
event_name = "SPICE_DISCONNECTED";
break;
+ case QEVENT_BLOCK_STREAM_COMPLETED:
+ event_name = "BLOCK_STREAM_COMPLETED";
+ break;
default:
abort();
break;
@@ -3105,6 +3108,16 @@ static const mon_cmd_t info_cmds[] = {
.mhandler.info = do_info_trace_events,
},
#endif
+ {
+ .name = "block-stream",
+ .args_type = "",
+ .params = "",
+ .help = "show block streaming status",
+ .user_print = monitor_print_block_stream_info,
+ .mhandler.info_async = do_block_stream_info,
+ .flags = MONITOR_CMD_ASYNC,
+
+ },
{
.name = NULL,
},
@@ -3247,6 +3260,16 @@ static const mon_cmd_t qmp_query_cmds[] = {
.mhandler.info_async = do_info_balloon,
.flags = MONITOR_CMD_ASYNC,
},
+ {
+ .name = "block-stream",
+ .args_type = "",
+ .params = "",
+ .help = "show block streaming status",
+ .user_print = monitor_print_block_stream_info,
+ .mhandler.info_async = do_block_stream_info,
+ .flags = MONITOR_CMD_ASYNC,
+
+ },
{ /* NULL */ },
};
diff --git a/monitor.h b/monitor.h
index 4f2d328..7b916c7 100644
--- a/monitor.h
+++ b/monitor.h
@@ -35,6 +35,7 @@ typedef enum MonitorEvent {
QEVENT_SPICE_CONNECTED,
QEVENT_SPICE_INITIALIZED,
QEVENT_SPICE_DISCONNECTED,
+ QEVENT_BLOCK_STREAM_COMPLETED,
QEVENT_MAX,
} MonitorEvent;
diff --git a/qerror.c b/qerror.c
index 64b41ca..83164c1 100644
--- a/qerror.c
+++ b/qerror.c
@@ -157,6 +157,10 @@ static const QErrorStringTable qerror_table[] = {
.desc = "No '%(bus)' bus found for device '%(device)'",
},
{
+ .error_fmt = QERR_NOT_SUPPORTED,
+ .desc = "Operation is not supported",
+ },
+ {
.error_fmt = QERR_OPEN_FILE_FAILED,
.desc = "Could not open '%(filename)'",
},
@@ -213,6 +217,11 @@ static const QErrorStringTable qerror_table[] = {
.error_fmt = QERR_VNC_SERVER_FAILED,
.desc = "Could not start VNC server on %(target)",
},
+ {
+ .error_fmt = QERR_STREAMING_ERROR,
+ .desc = "An error occurred during streaming: %(msg)",
+ },
+
{}
};
diff --git a/qerror.h b/qerror.h
index 173c84f..ad80982 100644
--- a/qerror.h
+++ b/qerror.h
@@ -142,6 +142,9 @@ QError *qobject_to_qerror(const QObject *obj);
#define QERR_NO_BUS_FOR_DEVICE \
"{ 'class': 'NoBusForDevice', 'data': { 'device': %s, 'bus': %s } }"
+#define QERR_NOT_SUPPORTED \
+ "{ 'class': 'NotSupported', 'data': {} }"
+
#define QERR_OPEN_FILE_FAILED \
"{ 'class': 'OpenFileFailed', 'data': { 'filename': %s } }"
@@ -187,4 +190,7 @@ QError *qobject_to_qerror(const QObject *obj);
#define QERR_FEATURE_DISABLED \
"{ 'class': 'FeatureDisabled', 'data': { 'name': %s } }"
+#define QERR_STREAMING_ERROR \
+ "{ 'class': 'StreamingError', 'data': { 'msg': %s } }"
+
#endif /* QERROR_H */
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 92c5c3a..d8966bf 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -885,6 +885,51 @@ Example:
EQMP
{
+ .name = "block_stream",
+ .args_type = "all:-a,stop:-s,device:B",
+ .params = "[-a] [-s] device",
+ .help = "Copy data from a backing file into a block device",
+ .user_print = monitor_print_block_stream,
+ .mhandler.cmd_async = do_block_stream,
+ .flags = MONITOR_CMD_ASYNC,
+ },
+
+SQMP
+block_stream
+------------
+
+Copy data from a backing file into a block device.
+
+If the optional 'all' argument is true, this operation is performed in the
+background until the entire backing file has been copied. The status of
+ongoing block_stream operations can be checked with query-block-stream.
+
+Arguments:
+
+- all: copy entire device (json-bool, optional)
+- stop: stop copying to device (json-bool, optional)
+- device: device name (json-string)
+
+Return:
+
+- device: device name (json-string)
+- len: size of the device, in bytes (json-int)
+- offset: ending offset of the completed I/O, in bytes (json-int)
+
+Examples:
+
+-> { "execute": "block_stream", "arguments": { "device": "virtio0" } }
+<- { "return": { "device": "virtio0", "len": 10737418240, "offset": 512 } }
+
+-> { "execute": "block_stream", "arguments": { "all": true, "device": "virtio0" } }
+<- { "return": {} }
+
+-> { "execute": "block_stream", "arguments": { "stop": true, "device": "virtio0" } }
+<- { "return": {} }
+
+EQMP
+
+ {
.name = "qmp_capabilities",
.args_type = "",
.params = "",
@@ -1805,3 +1850,26 @@ Example:
EQMP
+SQMP
+query-block-stream
+------------------
+
+Show progress of ongoing block_stream operations.
+
+Return a json-array of all operations. If no operation is active then an empty
+array will be returned. Each operation is a json-object with the following
+data:
+
+- device: device name (json-string)
+- len: size of the device, in bytes (json-int)
+- offset: ending offset of the completed I/O, in bytes (json-int)
+
+Example:
+
+-> { "execute": "query-block-stream" }
+<- { "return":[
+ { "device": "virtio0", "len": 10737418240, "offset": 709632}
+ ]
+ }
+
+EQMP
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 11/13] block: add -drive stream=on|off
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (9 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 10/13] qmp: add QMP support for stream commands Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 12/13] qed: intelligent streaming implementation Stefan Hajnoczi
` (3 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
This patch adds the -drive stream=on|off command-line option:
stream=on|off
stream is "on" or "off" and enables background copying of backing file
contents into the image file until the backing file is no longer
needed.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
blockdev.c | 12 +++++++++++-
hmp-commands.hx | 3 ++-
qemu-config.c | 3 +++
qemu-options.hx | 5 ++++-
4 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/blockdev.c b/blockdev.c
index ffbc45e..9b2fbb5 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -429,7 +429,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
const char *devaddr;
DriveInfo *dinfo;
int snapshot = 0;
- int copy_on_read;
+ int copy_on_read, stream;
int ret;
translation = BIOS_ATA_TRANSLATION_AUTO;
@@ -455,6 +455,7 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
snapshot = qemu_opt_get_bool(opts, "snapshot", 0);
ro = qemu_opt_get_bool(opts, "readonly", 0);
copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", 0);
+ stream = qemu_opt_get_bool(opts, "stream", 0);
file = qemu_opt_get(opts, "file");
serial = qemu_opt_get(opts, "serial");
@@ -738,6 +739,15 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
goto err;
}
+ if (stream) {
+ const char *device_name = bdrv_get_device_name(dinfo->bdrv);
+
+ if (!stream_start(device_name, false, NULL, NULL)) {
+ fprintf(stderr, "qemu: warning: stream_start failed for '%s'\n",
+ device_name);
+ }
+ }
+
if (bdrv_key_required(dinfo->bdrv))
autostart = 0;
return dinfo;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index e78a1f8..46e385a 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -882,7 +882,8 @@ ETEXI
"[,unit=m][,media=d][,index=i]\n"
"[,cyls=c,heads=h,secs=s[,trans=t]]\n"
"[,snapshot=on|off][,cache=on|off]\n"
- "[,readonly=on|off][,copy-on-read=on|off]",
+ "[,readonly=on|off][,copy-on-read=on|off]"
+ "[,stream=on|off]",
.help = "add drive to PCI storage controller",
.mhandler.cmd = drive_hot_add,
},
diff --git a/qemu-config.c b/qemu-config.c
index dafacb7..a14d3f0 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -81,6 +81,9 @@ static QemuOptsList qemu_drive_opts = {
}, {
.name = "copy-on-read",
.type = QEMU_OPT_BOOL,
+ }, {
+ .name = "stream",
+ .type = QEMU_OPT_BOOL,
},
{ /* end of list */ }
},
diff --git a/qemu-options.hx b/qemu-options.hx
index 97abcd9..b1d65d4 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -120,7 +120,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
" [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
" [,cache=writethrough|writeback|none|unsafe][,format=f]\n"
" [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
- " [,readonly=on|off][,copy-on-read=on|off]\n"
+ " [,readonly=on|off][,copy-on-read=on|off][,stream=on|off]\n"
" use 'file' as a drive image\n", QEMU_ARCH_ALL)
STEXI
@item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
@@ -163,6 +163,9 @@ Specify the controller's PCI address (if=virtio only).
@item copy-on-read=@var{copy-on-read}
@var{copy-on-read} is "on" or "off" and enables whether to copy read backing
file sectors into the image file.
+@item stream=@var{stream}
+@var{stream} is "on" or "off" and enables background copying of backing file
+contents into the image file until the backing file is no longer needed.
@end table
By default, writethrough caching is used for all block device. This means that
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 12/13] qed: intelligent streaming implementation
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (10 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 11/13] block: add -drive stream=on|off Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-14 18:18 ` [Qemu-devel] [PATCH 13/13] trace: trace bdrv_aio_readv/writev error paths Stefan Hajnoczi
` (2 subsequent siblings)
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Adam Litke
From: Anthony Liguori <aliguori@us.ibm.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
block/qed.c | 248 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----
block/qed.h | 3 +-
2 files changed, 234 insertions(+), 17 deletions(-)
diff --git a/block/qed.c b/block/qed.c
index ffdbc2d..f9f7c94 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -951,9 +951,8 @@ static void qed_aio_write_l1_update(void *opaque, int ret)
/**
* Update L2 table with new cluster offsets and write them out
*/
-static void qed_aio_write_l2_update(void *opaque, int ret)
+static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
{
- QEDAIOCB *acb = opaque;
BDRVQEDState *s = acb_to_s(acb);
bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
int index;
@@ -969,7 +968,7 @@ static void qed_aio_write_l2_update(void *opaque, int ret)
index = qed_l2_index(s, acb->cur_pos);
qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
- acb->cur_cluster);
+ offset);
if (need_alloc) {
/* Write out the whole new L2 table */
@@ -986,6 +985,51 @@ err:
qed_aio_complete(acb, ret);
}
+static void qed_aio_write_l2_update_cb(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+ qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
+}
+
+/**
+ * Determine if we have a zero write to a block of clusters
+ *
+ * We validate that the write is aligned to a cluster boundary, and that it's
+ * a multiple of cluster size with all zeros.
+ */
+static bool qed_is_zero_write(QEDAIOCB *acb)
+{
+ BDRVQEDState *s = acb_to_s(acb);
+ int i;
+
+ if (!qed_offset_is_cluster_aligned(s, acb->cur_pos)) {
+ return false;
+ }
+
+ if (!qed_offset_is_cluster_aligned(s, acb->cur_qiov.size)) {
+ return false;
+ }
+
+ for (i = 0; i < acb->cur_qiov.niov; i++) {
+ struct iovec *iov = &acb->cur_qiov.iov[i];
+ uint64_t *v;
+ int j;
+
+ if ((iov->iov_len & 0x07)) {
+ return false;
+ }
+
+ v = iov->iov_base;
+ for (j = 0; j < iov->iov_len; j += sizeof(v[0])) {
+ if (v[j >> 3]) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
/**
* Flush new data clusters before updating the L2 table
*
@@ -1000,7 +1044,7 @@ static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
QEDAIOCB *acb = opaque;
BDRVQEDState *s = acb_to_s(acb);
- if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update, opaque)) {
+ if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
qed_aio_complete(acb, -EIO);
}
}
@@ -1030,7 +1074,7 @@ static void qed_aio_write_main(void *opaque, int ret)
if (s->bs->backing_hd) {
next_fn = qed_aio_write_flush_before_l2_update;
} else {
- next_fn = qed_aio_write_l2_update;
+ next_fn = qed_aio_write_l2_update_cb;
}
}
@@ -1096,6 +1140,18 @@ static bool qed_should_set_need_check(BDRVQEDState *s)
return !(s->header.features & QED_F_NEED_CHECK);
}
+static void qed_aio_write_zero_cluster(void *opaque, int ret)
+{
+ QEDAIOCB *acb = opaque;
+
+ if (ret) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ qed_aio_write_l2_update(acb, 0, 1);
+}
+
/**
* Start an allocating write request or queue it
*
@@ -1144,6 +1200,7 @@ static bool qed_start_allocating_write(QEDAIOCB *acb)
static void qed_aio_write_alloc(QEDAIOCB *acb)
{
BDRVQEDState *s = acb_to_s(acb);
+ BlockDriverCompletionFunc *cb;
if (!qed_start_allocating_write(acb)) {
qemu_iovec_reset(&acb->cur_qiov);
@@ -1154,11 +1211,18 @@ static void qed_aio_write_alloc(QEDAIOCB *acb)
qed_offset_into_cluster(s, acb->cur_pos) + acb->cur_qiov.size);
acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
+ cb = qed_aio_write_prefill;
+
+ /* Zero write detection */
+ if ((acb->flags & QED_AIOCB_CHECK_ZERO_WRITE) && qed_is_zero_write(acb)) {
+ cb = qed_aio_write_zero_cluster;
+ }
+
if (qed_should_set_need_check(s)) {
s->header.features |= QED_F_NEED_CHECK;
- qed_write_header(s, qed_aio_write_prefill, acb);
+ qed_write_header(s, cb, acb);
} else {
- qed_aio_write_prefill(acb, 0);
+ cb(acb, 0);
}
}
@@ -1317,11 +1381,11 @@ static void qed_aio_next_io(void *opaque, int ret)
io_fn, acb);
}
-static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
- int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb,
- void *opaque, int flags)
+static QEDAIOCB *qed_aio_setup(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, int flags)
{
QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
@@ -1337,8 +1401,22 @@ static BlockDriverAIOCB *qed_aio_setup(BlockDriverState *bs,
acb->request.l2_table = NULL;
qemu_iovec_init(&acb->cur_qiov, qiov->niov);
+ return acb;
+}
+
+static BlockDriverAIOCB *bdrv_qed_aio_setup(BlockDriverState *bs,
+ int64_t sector_num,
+ QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb,
+ void *opaque, int flags)
+{
+ QEDAIOCB *acb;
+
+ acb = qed_aio_setup(bs, sector_num, qiov, nb_sectors,
+ cb, opaque, flags);
/* Start request */
qed_aio_next_io(acb, 0);
+
return &acb->common;
}
@@ -1348,9 +1426,15 @@ static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
BlockDriverCompletionFunc *cb,
void *opaque)
{
- int flags = bs->copy_on_read ? QED_AIOCB_COPY_ON_READ : 0;
+ /* Don't bloat image file in copy-on-read, use zero detection */
+ int flags = QED_AIOCB_CHECK_ZERO_WRITE;
+
+ if (bs->copy_on_read) {
+ flags |= QED_AIOCB_COPY_ON_READ;
+ }
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, flags);
+ return bdrv_qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, flags);
}
static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
@@ -1359,8 +1443,139 @@ static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
BlockDriverCompletionFunc *cb,
void *opaque)
{
- return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
- opaque, QED_AIOCB_WRITE);
+ return bdrv_qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb,
+ opaque, QED_AIOCB_WRITE);
+}
+
+typedef struct QEDCopyBackingData {
+ QEDAIOCB *acb;
+ uint64_t offset;
+ QEMUIOVector qiov;
+ void *buffer;
+ size_t len;
+ BlockDriverCompletionFunc *cb;
+ void *opaque;
+} QEDCopyBackingData;
+
+static void qed_aio_copy_backing_cb(void *opaque, int ret)
+{
+ QEDCopyBackingData *copy_backing_data = opaque;
+ QEDAIOCB *acb = copy_backing_data->acb;
+
+ if (ret) {
+ ret = -EIO;
+ } else {
+ ret = (acb->end_pos - copy_backing_data->offset) / BDRV_SECTOR_SIZE;
+ }
+
+ copy_backing_data->cb(copy_backing_data->opaque, ret);
+
+ qemu_iovec_destroy(&copy_backing_data->qiov);
+ qemu_vfree(copy_backing_data->buffer);
+ qemu_free(copy_backing_data);
+}
+
+static void qed_copy_backing_find_cluster_cb(void *opaque, int ret,
+ uint64_t offset, size_t len);
+
+/**
+ * Perform the next qed_find_cluster() from a BH
+ *
+ * This is necessary because we iterate over each cluster in turn.
+ * qed_find_cluster() may invoke its callback immediately without returning up
+ * the call stack, causing us to overflow the call stack. By starting each
+ * iteration from a BH we guarantee that a fresh stack is used each time.
+ */
+static void qed_copy_backing_next_cluster_bh(void *opaque)
+{
+ QEDCopyBackingData *copy_backing_data = opaque;
+ QEDAIOCB *acb = copy_backing_data->acb;
+ BDRVQEDState *s = acb_to_s(acb);
+
+ qemu_bh_delete(acb->bh);
+ acb->bh = NULL;
+
+ acb->cur_pos += s->header.cluster_size;
+ acb->end_pos += s->header.cluster_size;
+
+ qed_find_cluster(s, &acb->request, acb->cur_pos,
+ acb->end_pos - acb->cur_pos,
+ qed_copy_backing_find_cluster_cb, copy_backing_data);
+}
+
+/**
+ * Search for an unallocated cluster adjusting the current request until we
+ * can use it to read an unallocated cluster.
+ *
+ * Callback from qed_find_cluster().
+ */
+static void qed_copy_backing_find_cluster_cb(void *opaque, int ret,
+ uint64_t offset, size_t len)
+{
+ QEDCopyBackingData *copy_backing_data = opaque;
+ QEDAIOCB *acb = copy_backing_data->acb;
+ BDRVQEDState *s = acb_to_s(acb);
+
+ if (ret < 0) {
+ qed_aio_complete(acb, ret);
+ return;
+ }
+
+ if (ret == QED_CLUSTER_FOUND ||
+ ret == QED_CLUSTER_ZERO) {
+ /* proceed to next cluster */
+
+ if (acb->end_pos == s->header.image_size) {
+ qed_aio_complete(acb, 0);
+ return;
+ }
+
+ acb->bh = qemu_bh_new(qed_copy_backing_next_cluster_bh,
+ copy_backing_data);
+ qemu_bh_schedule(acb->bh);
+ } else {
+ /* found a hole, kick off request */
+ qed_aio_next_io(acb, 0);
+ }
+}
+
+static BlockDriverAIOCB *bdrv_qed_aio_copy_backing(BlockDriverState *bs,
+ int64_t sector_num, BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVQEDState *s = bs->opaque;
+ QEDCopyBackingData *copy_backing_data;
+ QEDAIOCB *acb;
+ uint32_t cluster_size = s->header.cluster_size;
+ uint64_t start_cluster;
+ QEMUIOVector *qiov;
+
+ copy_backing_data = qemu_mallocz(sizeof(*copy_backing_data));
+
+ copy_backing_data->cb = cb;
+ copy_backing_data->opaque = opaque;
+ copy_backing_data->len = cluster_size;
+ copy_backing_data->buffer = qemu_blockalign(s->bs, cluster_size);
+ copy_backing_data->offset = sector_num * BDRV_SECTOR_SIZE;
+
+ start_cluster = qed_start_of_cluster(s, copy_backing_data->offset);
+ sector_num = start_cluster / BDRV_SECTOR_SIZE;
+
+ qiov = &copy_backing_data->qiov;
+ qemu_iovec_init(qiov, 1);
+ qemu_iovec_add(qiov, copy_backing_data->buffer, cluster_size);
+
+ acb = qed_aio_setup(bs, sector_num, qiov,
+ cluster_size / BDRV_SECTOR_SIZE,
+ qed_aio_copy_backing_cb, copy_backing_data,
+ QED_AIOCB_CHECK_ZERO_WRITE |
+ QED_AIOCB_COPY_ON_READ);
+ copy_backing_data->acb = acb;
+
+ qed_find_cluster(s, &acb->request, acb->cur_pos,
+ acb->end_pos - acb->cur_pos,
+ qed_copy_backing_find_cluster_cb, copy_backing_data);
+
+ return &acb->common;
}
static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs,
@@ -1527,6 +1742,7 @@ static BlockDriver bdrv_qed = {
.bdrv_make_empty = bdrv_qed_make_empty,
.bdrv_aio_readv = bdrv_qed_aio_readv,
.bdrv_aio_writev = bdrv_qed_aio_writev,
+ .bdrv_aio_copy_backing = bdrv_qed_aio_copy_backing,
.bdrv_aio_flush = bdrv_qed_aio_flush,
.bdrv_truncate = bdrv_qed_truncate,
.bdrv_getlength = bdrv_qed_getlength,
diff --git a/block/qed.h b/block/qed.h
index 16f4bd9..48c65f7 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -124,8 +124,9 @@ typedef struct QEDRequest {
} QEDRequest;
enum {
- QED_AIOCB_WRITE = 0x0001, /* read or write? */
+ QED_AIOCB_WRITE = 0x0001, /* read or write? */
QED_AIOCB_COPY_ON_READ = 0x0002,
+ QED_AIOCB_CHECK_ZERO_WRITE = 0x0004, /* detect zeroes? */
};
typedef struct QEDAIOCB {
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* [Qemu-devel] [PATCH 13/13] trace: trace bdrv_aio_readv/writev error paths
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (11 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 12/13] qed: intelligent streaming implementation Stefan Hajnoczi
@ 2011-06-14 18:18 ` Stefan Hajnoczi
2011-06-15 10:46 ` [Qemu-devel] [PATCH 00/13] QED image streaming Philipp Hahn
2011-06-16 12:35 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Kevin Wolf
14 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-14 18:18 UTC (permalink / raw)
To: qemu-devel; +Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, Adam Litke
It is useful to understand why an I/O request was failed. Add trace
events for the error paths in bdrv_aio_readv() and bdrv_aio_writev().
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
block.c | 24 +++++++++++++++++++-----
trace-events | 7 +++++++
2 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/block.c b/block.c
index 4da35be..816edd6 100644
--- a/block.c
+++ b/block.c
@@ -2129,10 +2129,14 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
- if (!drv)
+ if (!drv) {
+ trace_bdrv_aio_readv_null_drv(bs, sector_num, nb_sectors, opaque);
return NULL;
- if (bdrv_check_request(bs, sector_num, nb_sectors))
+ }
+ if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+ trace_bdrv_aio_readv_out_of_range(bs, sector_num, nb_sectors, opaque);
return NULL;
+ }
ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
cb, opaque);
@@ -2141,6 +2145,8 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
/* Update stats even though technically transfer has not happened. */
bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
bs->rd_ops ++;
+ } else {
+ trace_bdrv_aio_readv_failed(bs, sector_num, nb_sectors, opaque);
}
return ret;
@@ -2192,12 +2198,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
- if (!drv)
+ if (!drv) {
+ trace_bdrv_aio_writev_null_drv(bs, sector_num, nb_sectors, opaque);
return NULL;
- if (bs->read_only)
+ }
+ if (bs->read_only) {
+ trace_bdrv_aio_writev_read_only(bs, sector_num, nb_sectors, opaque);
return NULL;
- if (bdrv_check_request(bs, sector_num, nb_sectors))
+ }
+ if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+ trace_bdrv_aio_writev_out_of_range(bs, sector_num, nb_sectors, opaque);
return NULL;
+ }
if (bs->dirty_bitmap) {
blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2216,6 +2228,8 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
bs->wr_highest_sector = sector_num + nb_sectors - 1;
}
+ } else {
+ trace_bdrv_aio_writev_failed(bs, sector_num, nb_sectors, opaque);
}
return ret;
diff --git a/trace-events b/trace-events
index 10faa07..3518254 100644
--- a/trace-events
+++ b/trace-events
@@ -53,7 +53,14 @@ disable bdrv_aio_multiwrite_earlyfail(void *mcb) "mcb %p"
disable bdrv_aio_multiwrite_latefail(void *mcb, int i) "mcb %p i %d"
disable bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
disable bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+disable bdrv_aio_readv_null_drv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+disable bdrv_aio_readv_out_of_range(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+disable bdrv_aio_readv_failed(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
disable bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+disable bdrv_aio_writev_null_drv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+disable bdrv_aio_writev_read_only(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+disable bdrv_aio_writev_out_of_range(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+disable bdrv_aio_writev_failed(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
disable bdrv_set_locked(void *bs, int locked) "bs %p locked %d"
# hw/virtio-blk.c
--
1.7.5.3
^ permalink raw reply related [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] [PATCH 00/13] QED image streaming
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (12 preceding siblings ...)
2011-06-14 18:18 ` [Qemu-devel] [PATCH 13/13] trace: trace bdrv_aio_readv/writev error paths Stefan Hajnoczi
@ 2011-06-15 10:46 ` Philipp Hahn
2011-06-15 12:18 ` Stefan Hajnoczi
2011-06-16 12:35 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Kevin Wolf
14 siblings, 1 reply; 43+ messages in thread
From: Philipp Hahn @ 2011-06-15 10:46 UTC (permalink / raw)
To: qemu-devel
[-- Attachment #1: Type: text/plain, Size: 1741 bytes --]
Hello,
hopefully just a quick question...
On Tuesday 14 June 2011 20:18:18 Stefan Hajnoczi wrote:
> This patch series adds image streaming support for QED image files. Other
> image formats can also be supported in the future.
>
> Image streaming populates the file in the background while the guest is
> running. This makes it possible to start the guest before its image file
> has been fully provisioned.
...
> When image streaming is enabled, the unallocated regions of the image file
> are populated with the data from the backing file. This occurs in the
> background and the guest can perform regular I/O in the meantime. Once the
> entire backing file has been streamed, the image no longer requires a
> backing file and will drop its reference.
Does this also work for intermediate images, that is
- master image on a NFS share,
- local copy on a local drive using CoR-streaming from master,
- local instance based on local copy using CoW.
We normally have mostly independent servers without shared storage, except a
global NFS share for the master images. Currently we have to copy the master
image to the local server first, before we then can create lots of instances
from them, which have few changes relative to the master, so we don't want
the copying to happen there.
I haven't looked at QED yet, so thanks in advance for your answer.
Sincerely
Philipp Hahn
--
Philipp Hahn Open Source Software Engineer hahn@univention.de
Univention GmbH Linux for Your Business fon: +49 421 22 232- 0
Mary-Somerville-Str.1 D-28359 Bremen fax: +49 421 22 232-99
http://www.univention.de/
[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 197 bytes --]
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] [PATCH 00/13] QED image streaming
2011-06-15 10:46 ` [Qemu-devel] [PATCH 00/13] QED image streaming Philipp Hahn
@ 2011-06-15 12:18 ` Stefan Hajnoczi
0 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-15 12:18 UTC (permalink / raw)
To: Philipp Hahn; +Cc: qemu-devel
On Wed, Jun 15, 2011 at 11:46 AM, Philipp Hahn <hahn@univention.de> wrote:
> On Tuesday 14 June 2011 20:18:18 Stefan Hajnoczi wrote:
>> This patch series adds image streaming support for QED image files. Other
>> image formats can also be supported in the future.
>>
>> Image streaming populates the file in the background while the guest is
>> running. This makes it possible to start the guest before its image file
>> has been fully provisioned.
> ...
>> When image streaming is enabled, the unallocated regions of the image file
>> are populated with the data from the backing file. This occurs in the
>> background and the guest can perform regular I/O in the meantime. Once the
>> entire backing file has been streamed, the image no longer requires a
>> backing file and will drop its reference.
>
> Does this also work for intermediate images, that is
> - master image on a NFS share,
> - local copy on a local drive using CoR-streaming from master,
> - local instance based on local copy using CoW.
>
> We normally have mostly independent servers without a shared storage, expect a
> global NFS share for the master images. Currently we have to copy the master
> image to the local server first, before we than can create lots of instances
> from then, which have few changed relative to the master, so we don't want
> the copying to happen there.
No, unfortunately adding another level of backing files prevents use
of streaming today. The reason is because backing files are opened
read-only. This ensures that multiple VMs running concurrently will
not change the backing file. Image formats are typically not safe for
concurrent updates from multiple applications just like regular file
systems cannot be accessed from multiple hosts simultaneously. You'd
need a cluster file system for that, and similarly we need multiple
writers support for image formats or some other solution.
We'd need a solution that makes it possible to update the local copy
of the master image while multiple VMs are running. For example
qemu-nbd serving the local copy of the master image and each VM image
using the backing file over NBD. That way all accesses to the local
copy of the master image go through qemu-nbd and streaming can be
performed safely. I haven't tested this approach, it might require
some improvements to qemu-nbd and I expect the performance would need
work too.
Thanks for raising this use case, I think it's an interesting one
because you are saving disk space by keeping a local master copy.
Stefan
^ permalink raw reply [flat|nested] 43+ messages in thread
* [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming)
2011-06-14 18:18 [Qemu-devel] [PATCH 00/13] QED image streaming Stefan Hajnoczi
` (13 preceding siblings ...)
2011-06-15 10:46 ` [Qemu-devel] [PATCH 00/13] QED image streaming Philipp Hahn
@ 2011-06-16 12:35 ` Kevin Wolf
2011-06-16 12:49 ` [Qemu-devel] Image streaming and live block copy Avi Kivity
` (2 more replies)
14 siblings, 3 replies; 43+ messages in thread
From: Kevin Wolf @ 2011-06-16 12:35 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: Anthony Liguori, Dor Laor, jes sorensen, Marcelo Tosatti,
qemu-devel, Avi Kivity, Adam Litke
Am 14.06.2011 20:18, schrieb Stefan Hajnoczi:
> Overview
> --------
>
> This patch series adds image streaming support for QED image files. Other
> image formats can also be supported in the future.
>
> Image streaming populates the file in the background while the guest is
> running. This makes it possible to start the guest before its image file has
> been fully provisioned.
>
> Example use cases include:
> * Providing small virtual appliances for download that can be launched
> immediately but provision themselves in the background.
> * Reducing guest provisioning time by creating local image files but backing
> them with shared master images which will be streamed.
>
> When image streaming is enabled, the unallocated regions of the image file are
> populated with the data from the backing file. This occurs in the background
> and the guest can perform regular I/O in the meantime. Once the entire backing
> file has been streamed, the image no longer requires a backing file and will
> drop its reference
Long CC list and Kevin wearing his upstream hat - this might be an
unpleasant email. :-)
So yesterday I had separate discussions with Stefan about image
streaming and Marcelo, Avi and some other folks about live block copy.
The conclusion was in both cases that, yes, both things are pretty
similar and, yes, the current implementation don't reflect that but
duplicate everything.
To summarise what both things are about:
* Image streaming is a normal image file plus copy-on-read plus a
background task that copies data from the source image
* Live block copy is a block-mirror of two normal image files plus a
background task that copies data from the source image
The right solution is probably to implement COR and the background task
in generic block layer code (no reason to restrict it to QED) and use it
for both image streaming and live block copy. (This is a bit more
complicated than it may sound here because guest writes must always take
precedence over a copy - but doing complicated things is an even better
reason to do it in a common place instead of duplicating)
People seem to agree on this, and the reason that I've heard why we
should merge the existing code instead is downstream time pressure. That
may be a valid reason for downstreams to add such code, but is taking
such code really the best option for upstream (and therefore long-term
for downstreams)?
If we take these patches as they are, I doubt that we'll ever get a
rewrite to implement the code as it should have been done in the first
place.
So I'm tempted to reject the current versions of both the image
streaming and live block copy series and leave it to downstreams to use
these as temporary solutions if the time pressure is too high. I know
that maintaining things downstream is painful, but that's the whole
point: I want to see the real implementation one day, and I'm afraid
this might be the only way to get it.
Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 12:35 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Kevin Wolf
@ 2011-06-16 12:49 ` Avi Kivity
2011-06-16 13:08 ` Kevin Wolf
2011-06-16 13:10 ` Anthony Liguori
2011-06-16 14:38 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Marcelo Tosatti
2 siblings, 1 reply; 43+ messages in thread
From: Avi Kivity @ 2011-06-16 12:49 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Dor Laor, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Adam Litke
On 06/16/2011 03:35 PM, Kevin Wolf wrote:
> * Image streaming is a normal image file plus copy-on-read plus a
> background task that copies data from the source image
Or a block-mirror started in degraded mode.
> * Live block copy is a block-mirror of two normal image files plus a
> background task that copies data from the source image
= block-mirror started in degraded mode
> The right solution is probably to implement COR and the background task
> in generic block layer code (no reason to restrict it to QED) and use it
> for both image streaming and live block copy. (This is a bit more
> complicated than it may sound here because guest writes must always take
> precedence over a copy - but doing complicated things is an even better
> reason to do it in a common place instead of duplicating)
Or in a block-mirror block format driver - generic code need not be
involved.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 12:49 ` [Qemu-devel] Image streaming and live block copy Avi Kivity
@ 2011-06-16 13:08 ` Kevin Wolf
2011-06-16 13:38 ` Avi Kivity
2011-06-16 14:52 ` Marcelo Tosatti
0 siblings, 2 replies; 43+ messages in thread
From: Kevin Wolf @ 2011-06-16 13:08 UTC (permalink / raw)
To: Avi Kivity
Cc: Anthony Liguori, Dor Laor, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Adam Litke
Am 16.06.2011 14:49, schrieb Avi Kivity:
> On 06/16/2011 03:35 PM, Kevin Wolf wrote:
>> * Image streaming is a normal image file plus copy-on-read plus a
>> background task that copies data from the source image
>
> Or a block-mirror started in degraded mode.
At least not in the same configuration as with live block copy: You
don't want to write to the source, you only want to read from it when
the destination doesn't have the data yet.
>> * Live block copy is a block-mirror of two normal image files plus a
>> background task that copies data from the source image
>
> = block-mirror started in degraded mode
>> The right solution is probably to implement COR and the background task
>> in generic block layer code (no reason to restrict it to QED) and use it
>> for both image streaming and live block copy. (This is a bit more
>> complicated than it may sound here because guest writes must always take
>> precedence over a copy - but doing complicated things is an even better
>> reason to do it in a common place instead of duplicating)
>
> Or in a block-mirror block format driver - generic code need not be
> involved.
Might be an option. In this case generic code is only involved with the
stacking of BlockDriverStates, which is already implemented (but
requires -blockdev for a sane way to configure things).
Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 13:08 ` Kevin Wolf
@ 2011-06-16 13:38 ` Avi Kivity
2011-06-16 14:52 ` Marcelo Tosatti
1 sibling, 0 replies; 43+ messages in thread
From: Avi Kivity @ 2011-06-16 13:38 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Dor Laor, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Adam Litke
On 06/16/2011 04:08 PM, Kevin Wolf wrote:
> Am 16.06.2011 14:49, schrieb Avi Kivity:
> > On 06/16/2011 03:35 PM, Kevin Wolf wrote:
> >> * Image streaming is a normal image file plus copy-on-read plus a
> >> background task that copies data from the source image
> >
> > Or a block-mirror started in degraded mode.
>
> At least not in the same configuration as with live block copy: You
> don't want to write to the source, you only want to read from it when
> the destination doesn't have the data yet.
>
You're right, it's not exactly the same. But I think it can use much of
the same code.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 13:08 ` Kevin Wolf
2011-06-16 13:38 ` Avi Kivity
@ 2011-06-16 14:52 ` Marcelo Tosatti
2011-06-16 15:30 ` Stefan Hajnoczi
2011-06-17 8:36 ` Kevin Wolf
1 sibling, 2 replies; 43+ messages in thread
From: Marcelo Tosatti @ 2011-06-16 14:52 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Stefan Hajnoczi, jes sorensen, Dor Laor,
qemu-devel, Avi Kivity, Adam Litke
On Thu, Jun 16, 2011 at 03:08:30PM +0200, Kevin Wolf wrote:
> Am 16.06.2011 14:49, schrieb Avi Kivity:
> > On 06/16/2011 03:35 PM, Kevin Wolf wrote:
> >> * Image streaming is a normal image file plus copy-on-read plus a
> >> background task that copies data from the source image
> >
> > Or a block-mirror started in degraded mode.
>
> At least not in the same configuration as with live block copy: You
> don't want to write to the source, you only want to read from it when
> the destination doesn't have the data yet.
>
> >> * Live block copy is a block-mirror of two normal image files plus a
> >> background task that copies data from the source image
> >
> > = block-mirror started in degraded mode
>
> >> The right solution is probably to implement COR and the background task
> >> in generic block layer code (no reason to restrict it to QED) and use it
> >> for both image streaming and live block copy. (This is a bit more
> >> complicated than it may sound here because guest writes must always take
> >> precedence over a copy - but doing complicated things is an even better
> >> reason to do it in a common place instead of duplicating)
> >
> > Or in a block-mirror block format driver - generic code need not be
> > involved.
>
> Might be an option. In this case generic code is only involved with the
> stacking of BlockDriverStates, which is already implemented (but
> requires -blockdev for a sane way to configure things).
>
> Kevin
What are the disadvantages of such an approach for image streaming,
versus the current QED approach?
blkstream block driver:
- Maintain in memory whether given block is allocated in local image,
if not, read from remote, write to local. Set block as local.
Local and remote simply two block drivers from image streaming driver
POV.
- Once all blocks are local, notify mgmt so it can switch to local
copy.
- Writes are mirrored to source and destination, minding guest writes
over copy writes.
Over this scheme, you'd have:
1) Block copy.
Reopen image to be copied with
blkstream:/path/to/current-image:/path/to/destination-image,
background read sectors 0...N.
2) Image stream:
blkstream:remote-image:/path/to/local-image,
background read sectors 0...N.
Where remote-image is remote accessible image such as NBD.
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 14:52 ` Marcelo Tosatti
@ 2011-06-16 15:30 ` Stefan Hajnoczi
2011-06-17 12:31 ` Marcelo Tosatti
2011-06-17 13:54 ` Marcelo Tosatti
2011-06-17 8:36 ` Kevin Wolf
1 sibling, 2 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-16 15:30 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Kevin Wolf, Anthony Liguori, jes sorensen, Dor Laor, qemu-devel,
Avi Kivity, Adam Litke
On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
This approach does not use the backing file feature?
> blkstream block driver:
>
> - Maintain in memory whether given block is allocated in local image,
> if not, read from remote, write to local. Set block as local.
> Local and remote simply two block drivers from image streaming driver
> POV.
> - Once all blocks are local, notify mgmt so it can switch to local
> copy.
> - Writes are mirrored to source and destination, minding guest writes
> over copy writes.
We open the remote file read-only for image streaming and do not want to
mirror writes.
If QEMU crashes or there is a power failure we need to restart the
streaming process carefully - local blocks must not be overwritten.
Perhaps this is the tricky part.
> Over this scheme, you'd have:
>
> 1) Block copy.
> Reopen image to be copied with
> blkstream:/path/to/current-image:/path/to/destination-image,
> background read sectors 0...N.
>
> 2) Image stream:
> blkstream:remote-image:/path/to/local-image,
> background read sectors 0...N.
Stefan
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 15:30 ` Stefan Hajnoczi
@ 2011-06-17 12:31 ` Marcelo Tosatti
2011-06-18 9:15 ` Stefan Hajnoczi
2011-06-17 13:54 ` Marcelo Tosatti
1 sibling, 1 reply; 43+ messages in thread
From: Marcelo Tosatti @ 2011-06-17 12:31 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: Kevin Wolf, Anthony Liguori, jes sorensen, Dor Laor, qemu-devel,
Avi Kivity, Adam Litke
On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
> This approach does not use the backing file feature?
>
> > blkstream block driver:
> >
> > - Maintain in memory whether given block is allocated in local image,
> > if not, read from remote, write to local. Set block as local.
> > Local and remote simply two block drivers from image streaming driver
> > POV.
> > - Once all blocks are local, notify mgmt so it can switch to local
> > copy.
> > - Writes are mirrored to source and destination, minding guest writes
> > over copy writes.
>
> We open the remote file read-only for image streaming and do not want to
> mirror writes.
Why not? Is there any disadvantage of mirroring writes?
> If QEMU crashes or there is a power failure we need to restart the
> streaming process carefully - local blocks must not be overwritten.
> Perhaps this is the tricky part.
Under the proposed scheme, if QEMU crashes you'd restart streaming from
zero. In that case, the remote image is consistent due to mirrored
writes.
That is one disadvantage of keeping the local/remote status of blocks
in memory: in case of a crash you'd have to restart from zero. But this
should be an uncommon case (and there is not much of an option for
generic-format image streaming without keeping metadata).
Do you see a problem with that?
> > Over this scheme, you'd have:
> >
> > 1) Block copy.
> > Reopen image to be copied with
> > blkstream:/path/to/current-image:/path/to/destination-image,
> > background read sectors 0...N.
> >
> > 2) Image stream:
> > blkstream:remote-image:/path/to/local-image,
> > background read sectors 0...N.
>
> Stefan
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-17 12:31 ` Marcelo Tosatti
@ 2011-06-18 9:15 ` Stefan Hajnoczi
2011-06-18 9:17 ` Stefan Hajnoczi
0 siblings, 1 reply; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-18 9:15 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, jes sorensen,
Dor Laor, qemu-devel, Avi Kivity, Adam Litke
On Fri, Jun 17, 2011 at 1:31 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
>> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
>> This approach does not use the backing file feature?
>>
>> > blkstream block driver:
>> >
>> > - Maintain in memory whether given block is allocated in local image,
>> > if not, read from remote, write to local. Set block as local.
>> > Local and remote simply two block drivers from image streaming driver
>> > POV.
>> > - Once all blocks are local, notify mgmt so it can switch to local
>> > copy.
>> > - Writes are mirrored to source and destination, minding guest writes
>> > over copy writes.
>>
>> We open the remote file read-only for image streaming and do not want to
>> mirror writes.
>
> Why not? Is there any disadvantage of mirroring writes?
Think of the use case with a Fedora master image over NFS. You want a
local clone of that master image and use the stream command to copy
the data from the master image into the local clone.
You cannot modify that master image because other VMs are using it too
and/or you want to be able to clone new VMs from it in the future.
Stefan
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-18 9:15 ` Stefan Hajnoczi
@ 2011-06-18 9:17 ` Stefan Hajnoczi
2011-06-19 16:02 ` Dor Laor
0 siblings, 1 reply; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-18 9:17 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, jes sorensen,
Dor Laor, qemu-devel, Avi Kivity, Adam Litke
On Sat, Jun 18, 2011 at 10:15 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Jun 17, 2011 at 1:31 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
>>> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
>>> This approach does not use the backing file feature?
>>>
>>> > blkstream block driver:
>>> >
>>> > - Maintain in memory whether given block is allocated in local image,
>>> > if not, read from remote, write to local. Set block as local.
>>> > Local and remote simply two block drivers from image streaming driver
>>> > POV.
>>> > - Once all blocks are local, notify mgmt so it can switch to local
>>> > copy.
>>> > - Writes are mirrored to source and destination, minding guest writes
>>> > over copy writes.
>>>
>>> We open the remote file read-only for image streaming and do not want to
>>> mirror writes.
>>
>> Why not? Is there any disadvantage of mirroring writes?
>
> Think of the use case with a Fedora master image over NFS. You want a
> local clone of that master image and use the stream command to copy
> the data from the master image into the local clone.
>
> You cannot modify that master image because other VMs are using it too
> and/or you want to be able to clone new VMs from it in the future.
BTW the workaround is to create two local images:
1. Local clone with master image as a backing file. This is the live
block copy source image.
2. Local image without a backing file. This is the live block copy
destination image.
But this is not very elegant. Writes get mirrored so that crash recovery works.
Compare that to image streaming, which works across crash and doesn't
duplicate I/O.
Stefan
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-18 9:17 ` Stefan Hajnoczi
@ 2011-06-19 16:02 ` Dor Laor
2011-06-24 9:28 ` Stefan Hajnoczi
0 siblings, 1 reply; 43+ messages in thread
From: Dor Laor @ 2011-06-19 16:02 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Avi Kivity, Adam Litke
On 06/18/2011 12:17 PM, Stefan Hajnoczi wrote:
> On Sat, Jun 18, 2011 at 10:15 AM, Stefan Hajnoczi<stefanha@gmail.com> wrote:
>> On Fri, Jun 17, 2011 at 1:31 PM, Marcelo Tosatti<mtosatti@redhat.com> wrote:
>>> On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
>>>> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
>>>> This approach does not use the backing file feature?
>>>>
>>>>> blkstream block driver:
>>>>>
>>>>> - Maintain in memory whether given block is allocated in local image,
>>>>> if not, read from remote, write to local. Set block as local.
>>>>> Local and remote simply two block drivers from image streaming driver
>>>>> POV.
>>>>> - Once all blocks are local, notify mgmt so it can switch to local
>>>>> copy.
>>>>> - Writes are mirrored to source and destination, minding guest writes
>>>>> over copy writes.
>>>>
>>>> We open the remote file read-only for image streaming and do not want to
>>>> mirror writes.
>>>
>>> Why not? Is there any disadvantage of mirroring writes?
>>
>> Think of the use case with a Fedora master image over NFS. You want a
>> local clone of that master image and use the stream command to copy
>> the data from the master image into the local clone.
>>
>> You cannot modify that master image because other VMs are using it too
>> and/or you want to be able to clone new VMs from it in the future.
>
> BTW the workaround is to create two local images:
> 1. Local clone with master image as a backing file. This is the live
> block copy source image.
> 2. Local image without a backing file. This is the live block copy
> destination image.
>
> But this is not very elegant. Writes get mirrored so that crash recovery works.
There is an easier work around for image streaming using live block copy
(mirror approach):
- Create the dst VM as an empty new COW image of the src (even over
the non shared storage, use some protocol tag for the src location
like nbd://original_path/src_file_name)
- Run the usual live block copy of src image (master read only OS
template) to the destination.
- Use a -src-read-only flag that will make the copy skip the src
writing.
Voila - no duplicate writes, crash recovery works since we reference the
original image and we share the code.
>
> Compare that to image streaming, which works across crash and doesn't
> duplicate I/O.
>
> Stefan
>
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-19 16:02 ` Dor Laor
@ 2011-06-24 9:28 ` Stefan Hajnoczi
2011-06-26 12:50 ` Dor Laor
0 siblings, 1 reply; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-24 9:28 UTC (permalink / raw)
To: dlaor
Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Avi Kivity, Adam Litke
On Sun, Jun 19, 2011 at 5:02 PM, Dor Laor <dlaor@redhat.com> wrote:
> On 06/18/2011 12:17 PM, Stefan Hajnoczi wrote:
>>
>> On Sat, Jun 18, 2011 at 10:15 AM, Stefan Hajnoczi<stefanha@gmail.com>
>> wrote:
>>>
>>> On Fri, Jun 17, 2011 at 1:31 PM, Marcelo Tosatti<mtosatti@redhat.com>
>>> wrote:
>>>>
>>>> On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
>>>>>
>>>>> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
>>>>> This approach does not use the backing file feature?
>>>>>
>>>>>> blkstream block driver:
>>>>>>
>>>>>> - Maintain in memory whether given block is allocated in local image,
>>>>>> if not, read from remote, write to local. Set block as local.
>>>>>> Local and remote simply two block drivers from image streaming driver
>>>>>> POV.
>>>>>> - Once all blocks are local, notify mgmt so it can switch to local
>>>>>> copy.
>>>>>> - Writes are mirrored to source and destination, minding guest writes
>>>>>> over copy writes.
>>>>>
>>>>> We open the remote file read-only for image streaming and do not want
>>>>> to
>>>>> mirror writes.
>>>>
>>>> Why not? Is there any disadvantage of mirroring writes?
>>>
>>> Think of the use case with a Fedora master image over NFS. You want a
>>> local clone of that master image and use the stream command to copy
>>> the data from the master image into the local clone.
>>>
>>> You cannot modify that master image because other VMs are using it too
>>> and/or you want to be able to clone new VMs from it in the future.
>>
>> BTW the workaround is to create two local images:
>> 1. Local clone with master image as a backing file. This is the live
>> block copy source image.
>> 2. Local image without a backing file. This is the live block copy
>> destination image.
>>
>> But this is not very elegant. Writes get mirrored so that crash recovery
>> works.
>
> There is an easier work around for image streaming using live block copy
> (mirror approach):
> - Create the dst VM as an empty new COW image of the src (even over
> the non shared storage, use some protocol tag for the src location
> like nbd://original_path/src_file_name)
Migration and non-shared storage has come up a few times in this
discussion. But both live block copy and image streaming need access
to source and destination - they do not have explicit non-shared
storage support. I think non-shared and using nbd:// is orthogonal to
the discussion. Just want to check that you agree and I haven't
missed something?
> - Run the usual live block copy of src image (master read only OS
> template) to the destination.
> - Use a -src-read-only flag that will make the copy skip the src
> writing.
>
> Voila - no duplicate writes, crash recovery works since we reference the
> original image and we share the code.
So the running guest is using the destination image since the source
is read-only?
This approach makes sense to me.
Stefan
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-24 9:28 ` Stefan Hajnoczi
@ 2011-06-26 12:50 ` Dor Laor
2011-06-27 7:48 ` Kevin Wolf
0 siblings, 1 reply; 43+ messages in thread
From: Dor Laor @ 2011-06-26 12:50 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: Kevin Wolf, Anthony Liguori, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Avi Kivity, Adam Litke
On 06/24/2011 12:28 PM, Stefan Hajnoczi wrote:
> On Sun, Jun 19, 2011 at 5:02 PM, Dor Laor<dlaor@redhat.com> wrote:
>> On 06/18/2011 12:17 PM, Stefan Hajnoczi wrote:
>>>
>>> On Sat, Jun 18, 2011 at 10:15 AM, Stefan Hajnoczi<stefanha@gmail.com>
>>> wrote:
>>>>
>>>> On Fri, Jun 17, 2011 at 1:31 PM, Marcelo Tosatti<mtosatti@redhat.com>
>>>> wrote:
>>>>>
>>>>> On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
>>>>>>
>>>>>> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
>>>>>> This approach does not use the backing file feature?
>>>>>>
>>>>>>> blkstream block driver:
>>>>>>>
>>>>>>> - Maintain in memory whether given block is allocated in local image,
>>>>>>> if not, read from remote, write to local. Set block as local.
>>>>>>> Local and remote simply two block drivers from image streaming driver
>>>>>>> POV.
>>>>>>> - Once all blocks are local, notify mgmt so it can switch to local
>>>>>>> copy.
>>>>>>> - Writes are mirrored to source and destination, minding guest writes
>>>>>>> over copy writes.
>>>>>>
>>>>>> We open the remote file read-only for image streaming and do not want
>>>>>> to
>>>>>> mirror writes.
>>>>>
>>>>> Why not? Is there any disadvantage of mirroring writes?
>>>>
>>>> Think of the use case with a Fedora master image over NFS. You want a
>>>> local clone of that master image and use the stream command to copy
>>>> the data from the master image into the local clone.
>>>>
>>>> You cannot modify that master image because other VMs are using it too
>>>> and/or you want to be able to clone new VMs from it in the future.
>>>
>>> BTW the workaround is to create two local images:
>>> 1. Local clone with master image as a backing file. This is the live
>>> block copy source image.
>>> 2. Local image without a backing file. This is the live block copy
>>> destination image.
>>>
>>> But this is not very elegant. Writes get mirrored so that crash recovery
>>> works.
>>
>> There is an easier work around for image streaming using live block copy
>> (mirror approach):
>> - Create the dst VM as an empty new COW image of the src (even over
>> the non shared storage, use some protocol tag for the src location
>> like nbd://original_path/src_file_name)
>
> Migration and non-shared storage has come up a few times in this
> discussion. But both live block copy and image streaming need access
> to source and destination - they do not have explicit non-shared
> storage support. I think non-shared and using nbd:// is orthogonal to
> the discussion. Just want to check that you agree and I haven't
> missed something?
You're right, I was mainly trying to be as general as possible.
>
>> - Run the usual live block copy of src image (master read only OS
>> template) to the destination.
>> - Use a -src-read-only flag that will make the copy skip the src
>> writing.
>>
>> Voila - no duplicate writes, crash recovery works since we reference the
>> original image and we share the code.
>
> So the running guest is using the destination image since the source
> is read-only?
Yes. The guest will run on the destination image not merely because the
source is in a read-only state, but because that is the behavior we want.
>
> This approach makes sense to me.
>
> Stefan
>
>
>
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-26 12:50 ` Dor Laor
@ 2011-06-27 7:48 ` Kevin Wolf
2011-06-27 9:13 ` Dor Laor
0 siblings, 1 reply; 43+ messages in thread
From: Kevin Wolf @ 2011-06-27 7:48 UTC (permalink / raw)
To: dlaor
Cc: Anthony Liguori, Stefan Hajnoczi, Stefan Hajnoczi,
Marcelo Tosatti, qemu-devel, Adam Litke, Avi Kivity, jes sorensen
Am 26.06.2011 14:50, schrieb Dor Laor:
> On 06/24/2011 12:28 PM, Stefan Hajnoczi wrote:
>> On Sun, Jun 19, 2011 at 5:02 PM, Dor Laor<dlaor@redhat.com> wrote:
>>> On 06/18/2011 12:17 PM, Stefan Hajnoczi wrote:
>>>>
>>>> On Sat, Jun 18, 2011 at 10:15 AM, Stefan Hajnoczi<stefanha@gmail.com>
>>>> wrote:
>>>>>
>>>>> On Fri, Jun 17, 2011 at 1:31 PM, Marcelo Tosatti<mtosatti@redhat.com>
>>>>> wrote:
>>>>>>
>>>>>> On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
>>>>>>>
>>>>>>> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
>>>>>>> This approach does not use the backing file feature?
>>>>>>>
>>>>>>>> blkstream block driver:
>>>>>>>>
>>>>>>>> - Maintain in memory whether given block is allocated in local image,
>>>>>>>> if not, read from remote, write to local. Set block as local.
>>>>>>>> Local and remote simply two block drivers from image streaming driver
>>>>>>>> POV.
>>>>>>>> - Once all blocks are local, notify mgmt so it can switch to local
>>>>>>>> copy.
>>>>>>>> - Writes are mirrored to source and destination, minding guest writes
>>>>>>>> over copy writes.
>>>>>>>
>>>>>>> We open the remote file read-only for image streaming and do not want
>>>>>>> to
>>>>>>> mirror writes.
>>>>>>
>>>>>> Why not? Is there any disadvantage of mirroring writes?
>>>>>
>>>>> Think of the use case with a Fedora master image over NFS. You want a
>>>>> local clone of that master image and use the stream command to copy
>>>>> the data from the master image into the local clone.
>>>>>
>>>>> You cannot modify that master image because other VMs are using it too
>>>>> and/or you want to be able to clone new VMs from it in the future.
>>>>
>>>> BTW the workaround is to create two local images:
>>>> 1. Local clone with master image as a backing file. This is the live
>>>> block copy source image.
>>>> 2. Local image without a backing file. This is the live block copy
>>>> destination image.
>>>>
>>>> But this is not very elegant. Writes get mirrored so that crash recovery
>>>> works.
>>>
>>> There is an easier work around for image streaming using live block copy
>>> (mirror approach):
>>> - Create the dst VM as an empty new COW image of the src (even over
>>> the non shared storage, use some protocol tag for the src location
>>> like nbd://original_path/src_file_name)
>>
>> Migration and non-shared storage has come up a few times in this
>> discussion. But both live block copy and image streaming need access
>> to source and destination - they do not have explicit non-shared
>> storage support. I think non-shared and using nbd:// is orthogonal to
>> the discussion. Just want to check that you agree and I haven't
>> missed something?
>
> You're right, I was mainly trying to be as general as possible.
I think there is one important point to consider for using NBD: You
always see a single image on the NBD client, which could in fact have a
backing file chain on the source. So bdrv_is_allocated() doesn't work
over NBD, which becomes interesting when you want to share a backing
file with the new copy.
Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-27 7:48 ` Kevin Wolf
@ 2011-06-27 9:13 ` Dor Laor
0 siblings, 0 replies; 43+ messages in thread
From: Dor Laor @ 2011-06-27 9:13 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Stefan Hajnoczi, jes sorensen, Stefan Hajnoczi,
Marcelo Tosatti, qemu-devel, Orit Wasserman, Avi Kivity,
Adam Litke
On 06/27/2011 10:48 AM, Kevin Wolf wrote:
> Am 26.06.2011 14:50, schrieb Dor Laor:
>> On 06/24/2011 12:28 PM, Stefan Hajnoczi wrote:
>>> On Sun, Jun 19, 2011 at 5:02 PM, Dor Laor<dlaor@redhat.com> wrote:
>>>> On 06/18/2011 12:17 PM, Stefan Hajnoczi wrote:
>>>>>
>>>>> On Sat, Jun 18, 2011 at 10:15 AM, Stefan Hajnoczi<stefanha@gmail.com>
>>>>> wrote:
>>>>>>
>>>>>> On Fri, Jun 17, 2011 at 1:31 PM, Marcelo Tosatti<mtosatti@redhat.com>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
>>>>>>>>
>>>>>>>> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
>>>>>>>> This approach does not use the backing file feature?
>>>>>>>>
>>>>>>>>> blkstream block driver:
>>>>>>>>>
>>>>>>>>> - Maintain in memory whether given block is allocated in local image,
>>>>>>>>> if not, read from remote, write to local. Set block as local.
>>>>>>>>> Local and remote simply two block drivers from image streaming driver
>>>>>>>>> POV.
>>>>>>>>> - Once all blocks are local, notify mgmt so it can switch to local
>>>>>>>>> copy.
>>>>>>>>> - Writes are mirrored to source and destination, minding guest writes
>>>>>>>>> over copy writes.
>>>>>>>>
>>>>>>>> We open the remote file read-only for image streaming and do not want
>>>>>>>> to
>>>>>>>> mirror writes.
>>>>>>>
>>>>>>> Why not? Is there any disadvantage of mirroring writes?
>>>>>>
>>>>>> Think of the use case with a Fedora master image over NFS. You want a
>>>>>> local clone of that master image and use the stream command to copy
>>>>>> the data from the master image into the local clone.
>>>>>>
>>>>>> You cannot modify that master image because other VMs are using it too
>>>>>> and/or you want to be able to clone new VMs from it in the future.
>>>>>
>>>>> BTW the workaround is to create two local images:
>>>>> 1. Local clone with master image as a backing file. This is the live
>>>>> block copy source image.
>>>>> 2. Local image without a backing file. This is the live block copy
>>>>> destination image.
>>>>>
>>>>> But this is not very elegant. Writes get mirrored so that crash recovery
>>>>> works.
>>>>
>>>> There is an easier work around for image streaming using live block copy
>>>> (mirror approach):
>>>> - Create the dst VM as an empty new COW image of the src (even over
>>>> the non shared storage, use some protocol tag for the src location
>>>> like nbd://original_path/src_file_name)
>>>
>>> Migration and non-shared storage has come up a few times in this
>>> discussion. But both live block copy and image streaming need access
>>> to source and destination - they do not have explicit non-shared
>>> storage support. I think non-shared and using nbd:// is orthogonal to
>>> the discussion. Just want to check that you agree and I haven't
>>> missed something?
>>
>> You're right, I was mainly trying to be as general as possible.
>
> I think there is one important point to consider for using NBD: You
> always see a single image on the NBD client, which could in fact have a
> backing file chain on the source. So bdrv_is_allocated() doesn't work
> over NBD, which becomes interesting when you want to share a backing
> file with the new copy.
What if we use iSCSI? Will the client have a matching iSCSI verb to
detect whether a certain block is in fact unallocated, and will we be
able to use this to our benefit? Of course it will make the non shared
storage case more complex.
>
> Kevin
>
>
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 15:30 ` Stefan Hajnoczi
2011-06-17 12:31 ` Marcelo Tosatti
@ 2011-06-17 13:54 ` Marcelo Tosatti
1 sibling, 0 replies; 43+ messages in thread
From: Marcelo Tosatti @ 2011-06-17 13:54 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: Kevin Wolf, Anthony Liguori, jes sorensen, Dor Laor, qemu-devel,
Avi Kivity, Adam Litke
On Thu, Jun 16, 2011 at 04:30:18PM +0100, Stefan Hajnoczi wrote:
> On Thu, Jun 16, 2011 at 11:52:43AM -0300, Marcelo Tosatti wrote:
> This approach does not use the backing file feature?
Not directly. For block copy, the destination (or source) images might
share the same backing file (this is a requirement for live snapshot
merge).
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 14:52 ` Marcelo Tosatti
2011-06-16 15:30 ` Stefan Hajnoczi
@ 2011-06-17 8:36 ` Kevin Wolf
2011-06-17 8:57 ` Stefan Hajnoczi
` (3 more replies)
1 sibling, 4 replies; 43+ messages in thread
From: Kevin Wolf @ 2011-06-17 8:36 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Anthony Liguori, Stefan Hajnoczi, jes sorensen, Dor Laor,
qemu-devel, Avi Kivity, Adam Litke
Am 16.06.2011 16:52, schrieb Marcelo Tosatti:
> On Thu, Jun 16, 2011 at 03:08:30PM +0200, Kevin Wolf wrote:
>> Am 16.06.2011 14:49, schrieb Avi Kivity:
>>> On 06/16/2011 03:35 PM, Kevin Wolf wrote:
>>>> * Image streaming is a normal image file plus copy-on-read plus a
>>>> background task that copies data from the source image
>>>
>>> Or a block-mirror started in degraded mode.
>>
>> At least not in the same configuration as with live block copy: You
>> don't want to write to the source, you only want to read from it when
>> the destination doesn't have the data yet.
>>
>>>> * Live block copy is a block-mirror of two normal image files plus a
>>>> background task that copies data from the source image
>>>
>>> = block-mirror started in degraded mode
>>
>>>> The right solution is probably to implement COR and the background task
>>>> in generic block layer code (no reason to restrict it to QED) and use it
>>>> for both image streaming and live block copy. (This is a bit more
>>>> complicated than it may sound here because guest writes must always take
>>>> precedence over a copy - but doing complicated things is an even better
>>>> reason to do it in a common place instead of duplicating)
>>>
>>> Or in a block-mirror block format driver - generic code need not be
>>> involved.
>>
>> Might be an option. In this case generic code is only involved with the
>> stacking of BlockDriverStates, which is already implemented (but
>> requires -blockdev for a sane way to configure things).
>>
>> Kevin
>
> What are the disadvantages of such an approach for image streaming,
> versus the current QED approach?
>
> blkstream block driver:
>
> - Maintain in memory whether given block is allocated in local image,
> if not, read from remote, write to local. Set block as local.
> Local and remote simply two block drivers from image streaming driver
> POV.
Why maintain it in memory? We already have mechanisms to track this in
COW image formats, so that you can even continue after a crash.
We can still add a raw-cow driver that maintains the COW data in memory
for allowing raw copies, if this is needed.
> - Once all blocks are local, notify mgmt so it can switch to local
> copy.
> - Writes are mirrored to source and destination, minding guest writes
> over copy writes.
Image streaming shouldn't write to the source. But adding a flag for
this isn't a major problem.
> Over this scheme, you'd have:
>
> 1) Block copy.
> Reopen image to be copied with
> blkstream:/path/to/current-image:/path/to/destination-image,
> background read sectors 0...N.
>
> 2) Image stream:
> blkstream:remote-image:/path/to/local-image,
> background read sectors 0...N.
>
> Where remote-image is remote accessible image such as NBD.
I think that should work.
By the way, we'll get problems with the colon syntax. Without -blockdev
we'll have to invent a new syntax, maybe with brackets:
blkstream:[nbd:localhost]:out.qcow2
Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-17 8:36 ` Kevin Wolf
@ 2011-06-17 8:57 ` Stefan Hajnoczi
2011-06-17 9:22 ` Kevin Wolf
2011-06-17 12:21 ` Anthony Liguori
` (2 subsequent siblings)
3 siblings, 1 reply; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-17 8:57 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Dor Laor, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Avi Kivity, Adam Litke
On Fri, Jun 17, 2011 at 9:36 AM, Kevin Wolf <kwolf@redhat.com> wrote:
> By the way, we'll get problems with the colon syntax. Without -blockdev
> we'll have to invent a new syntax, maybe with brackets:
>
> blkstream:[nbd:localhost]:out.qcow2
Embedding block driver options in filenames is getting worse as time
goes on. I recently tried to refactor and eliminate
QEMUOptionParameter so that we only use QemuOpts instead of two
different option APIs. Part of that involves keeping separate
per-block driver (i.e. -blockdev) options lists, which would allow us
to pass proper options to block drivers instead of embedding them in
the filename.
I got stuck because today the protocol and format QEMUOptionParameters
get concatenated in some cases. Concatenation is not really supported
by QemuOpts :).
Anyway, here's the current state if anyone is interested:
http://repo.or.cz/w/qemu/stefanha.git/commitdiff/b49babb2c8b476a36357cfd7276ca45a11039ca5
Stefan
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-17 8:57 ` Stefan Hajnoczi
@ 2011-06-17 9:22 ` Kevin Wolf
2011-06-17 10:11 ` Stefan Hajnoczi
0 siblings, 1 reply; 43+ messages in thread
From: Kevin Wolf @ 2011-06-17 9:22 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: Anthony Liguori, Dor Laor, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Avi Kivity, Adam Litke
Am 17.06.2011 10:57, schrieb Stefan Hajnoczi:
> On Fri, Jun 17, 2011 at 9:36 AM, Kevin Wolf <kwolf@redhat.com> wrote:
>> By the way, we'll get problems with the colon syntax. Without -blockdev
>> we'll have to invent a new syntax, maybe with brackets:
>>
>> blkstream:[nbd:localhost]:out.qcow2
>
> Embedding block driver options in filenames is getting worse as time
> goes on.
Well, yes. We need -blockdev for a sane way to express complex relations
between BlockDriverStates. But then, we'll also want to have convenient
shortcuts for manual use, and that may be something like the existing
colon syntax. I really don't feel like typing three full -blockdev
parameters for qcow2 on blockdbg on raw.
> I recently tried to refactor and eliminate
> QEMUOptionParameter so that we only use QemuOpts instead of two
> different option APIs. Part of that involves keeping separate
> per-block driver (i.e. -blockdev) options lists, which would allow us
> to pass proper options to block drivers instead of embedding them in
> the filename.
Aren't these completely independent things? QEMUOptionParameter is used
for image creation, whereas filenames are used for opening images. I
think you can change one without changing the other.
Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-17 9:22 ` Kevin Wolf
@ 2011-06-17 10:11 ` Stefan Hajnoczi
0 siblings, 0 replies; 43+ messages in thread
From: Stefan Hajnoczi @ 2011-06-17 10:11 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Dor Laor, Stefan Hajnoczi, jes sorensen,
Marcelo Tosatti, qemu-devel, Avi Kivity, Adam Litke
On Fri, Jun 17, 2011 at 10:22 AM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 17.06.2011 10:57, schrieb Stefan Hajnoczi:
>> On Fri, Jun 17, 2011 at 9:36 AM, Kevin Wolf <kwolf@redhat.com> wrote:
>>> By the way, we'll get problems with the colon syntax. Without -blockdev
>>> we'll have to invent a new syntax, maybe with brackets:
>>>
>>> blkstream:[nbd:localhost]:out.qcow2
>>
>> Embedding block driver options in filenames is getting worse as time
>> goes on.
>
> Well, yes. We need -blockdev for a sane way to express complex relations
> between BlockDriverStates. But then, we'll also want to have convenient
> shortcuts for manual use, and that may be something like the existing
> colon syntax. I really don't feel like typing three full -blockdev
> parameters for qcow2 on blockdbg on raw.
>
>> I recently tried to refactor and eliminate
>> QEMUOptionParameter so that we only use QemuOpts instead of two
>> different option APIs. Part of that involves keeping separate
>> per-block driver (i.e. -blockdev) options lists, which would allow us
>> to pass proper options to block drivers instead of embedding them in
>> the filename.
>
> Aren't these completely independent things? QEMUOptionParameter is used
> for image creation, whereas filenames are used for opening images. I
> think you can change one without changing the other.
Yeah but I'd rather not spread two different APIs to do the same
thing. Most of QEMU uses QemuOpts but image creation uses
QEMUOptionParameter. The longer we leave that the more quirks like
the concatenation behavior will fragment the two.
Stefan
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-17 8:36 ` Kevin Wolf
2011-06-17 8:57 ` Stefan Hajnoczi
@ 2011-06-17 12:21 ` Anthony Liguori
2011-06-17 13:04 ` Marcelo Tosatti
2011-06-17 13:50 ` Marcelo Tosatti
3 siblings, 0 replies; 43+ messages in thread
From: Anthony Liguori @ 2011-06-17 12:21 UTC (permalink / raw)
To: Kevin Wolf
Cc: Marcelo Tosatti, agl, Stefan Hajnoczi, jes sorensen, Dor Laor,
qemu-devel, Avi Kivity
On 06/17/2011 03:36 AM, Kevin Wolf wrote:
> Am 16.06.2011 16:52, schrieb Marcelo Tosatti:
>> On Thu, Jun 16, 2011 at 03:08:30PM +0200, Kevin Wolf wrote:
>> Over this scheme, you'd have:
>>
>> 1) Block copy.
>> Reopen image to be copied with
>> blkstream:/path/to/current-image:/path/to/destination-image,
>> background read sectors 0...N.
>>
>> 2) Image stream:
>> blkstream:remote-image:/path/to/local-image,
>> background read sectors 0...N.
>>
>> Where remote-image is remote accessible image such as NBD.
>
> I think that should work.
>
> By the way, we'll get problems with the colon syntax. Without -blockdev
> we'll have to invent a new syntax, maybe with brackets:
>
> blkstream:[nbd:localhost]:out.qcow2
So what's the main issue with -blockdev today? Just need someone to
spend some time implementing it?
Also, it would be a useful exercise to try and capture some of this in
the wiki. That makes it a bit easier to reference as we move forward.
Regards,
Anthony Liguori
> Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-17 8:36 ` Kevin Wolf
2011-06-17 8:57 ` Stefan Hajnoczi
2011-06-17 12:21 ` Anthony Liguori
@ 2011-06-17 13:04 ` Marcelo Tosatti
2011-06-17 13:50 ` Marcelo Tosatti
3 siblings, 0 replies; 43+ messages in thread
From: Marcelo Tosatti @ 2011-06-17 13:04 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Stefan Hajnoczi, jes sorensen, Dor Laor,
qemu-devel, Avi Kivity, Adam Litke
On Fri, Jun 17, 2011 at 10:36:21AM +0200, Kevin Wolf wrote:
> Am 16.06.2011 16:52, schrieb Marcelo Tosatti:
> > On Thu, Jun 16, 2011 at 03:08:30PM +0200, Kevin Wolf wrote:
> >> Am 16.06.2011 14:49, schrieb Avi Kivity:
> >>> On 06/16/2011 03:35 PM, Kevin Wolf wrote:
> >>>> * Image streaming is a normal image file plus copy-on-read plus a
> >>>> background task that copies data from the source image
> >>>
> >>> Or a block-mirror started in degraded mode.
> >>
> >> At least not in the same configuration as with live block copy: You
> >> don't want to write to the source, you only want to read from it when
> >> the destination doesn't have the data yet.
> >>
> >>>> * Live block copy is a block-mirror of two normal image files plus a
> >>>> background task that copies data from the source image
> >>>
> >>> = block-mirror started in degraded mode
> >>
> >>>> The right solution is probably to implement COR and the background task
> >>>> in generic block layer code (no reason to restrict it to QED) and use it
> >>>> for both image streaming and live block copy. (This is a bit more
> >>>> complicated than it may sound here because guest writes must always take
> >>>> precedence over a copy - but doing complicated things is an even better
> >>>> reason to do it in a common place instead of duplicating)
> >>>
> >>> Or in a block-mirror block format driver - generic code need not be
> >>> involved.
> >>
> >> Might be an option. In this case generic code is only involved with the
> >> stacking of BlockDriverStates, which is already implemented (but
> >> requires -blockdev for a sane way to configure things).
> >>
> >> Kevin
> >
> > What are the disadvantages of such an approach for image streaming,
> > versus the current QED approach?
> >
> > blkstream block driver:
> >
> > - Maintain in memory whether given block is allocated in local image,
> > if not, read from remote, write to local. Set block as local.
> > Local and remote simply two block drivers from image streaming driver
> > POV.
>
> Why maintain it in memory? We already have mechanisms to track this in
> COW image formats, so that you can even continue after a crash.
>
> We can still add a raw-cow driver that maintains the COW data in memory
> for allowing raw copies, if this is needed.
Well, then image streaming is not for generic formats anymore. OK, the
up-to-date information can live on disk if supported by the lower level
format.
> > - Once all blocks are local, notify mgmt so it can switch to local
> > copy.
> > - Writes are mirrored to source and destination, minding guest writes
> > over copy writes.
>
> Image streaming shouldn't write to the source. But adding a flag for
> this isn't a major problem.
OK, block copy does write to the source.
> > Over this scheme, you'd have:
> >
> > 1) Block copy.
> > Reopen image to be copied with
> > blkstream:/path/to/current-image:/path/to/destination-image,
> > background read sectors 0...N.
> >
> > 2) Image stream:
> > blkstream:remote-image:/path/to/local-image,
> > background read sectors 0...N.
> >
> > Where remote-image is remote accessible image such as NBD.
>
> I think that should work.
>
> By the way, we'll get problems with the colon syntax. Without -blockdev
> we'll have to invent a new syntax, maybe with brackets:
>
> blkstream:[nbd:localhost]:out.qcow2
>
> Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-17 8:36 ` Kevin Wolf
` (2 preceding siblings ...)
2011-06-17 13:04 ` Marcelo Tosatti
@ 2011-06-17 13:50 ` Marcelo Tosatti
3 siblings, 0 replies; 43+ messages in thread
From: Marcelo Tosatti @ 2011-06-17 13:50 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Stefan Hajnoczi, jes sorensen, Dor Laor,
qemu-devel, Avi Kivity, Adam Litke
On Fri, Jun 17, 2011 at 10:36:21AM +0200, Kevin Wolf wrote:
> Am 16.06.2011 16:52, schrieb Marcelo Tosatti:
> > On Thu, Jun 16, 2011 at 03:08:30PM +0200, Kevin Wolf wrote:
> >> Am 16.06.2011 14:49, schrieb Avi Kivity:
> >>> On 06/16/2011 03:35 PM, Kevin Wolf wrote:
> >>>> * Image streaming is a normal image file plus copy-on-read plus a
> >>>> background task that copies data from the source image
> >>>
> >>> Or a block-mirror started in degraded mode.
> >>
> >> At least not in the same configuration as with live block copy: You
> >> don't want to write to the source, you only want to read from it when
> >> the destination doesn't have the data yet.
> >>
> >>>> * Live block copy is a block-mirror of two normal image files plus a
> >>>> background task that copies data from the source image
> >>>
> >>> = block-mirror started in degraded mode
> >>
> >>>> The right solution is probably to implement COR and the background task
> >>>> in generic block layer code (no reason to restrict it to QED) and use it
> >>>> for both image streaming and live block copy. (This is a bit more
> >>>> complicated than it may sound here because guest writes must always take
> >>>> precedence over a copy - but doing complicated things is an even better
> >>>> reason to do it in a common place instead of duplicating)
> >>>
> >>> Or in a block-mirror block format driver - generic code need not be
> >>> involved.
> >>
> >> Might be an option. In this case generic code is only involved with the
> >> stacking of BlockDriverStates, which is already implemented (but
> >> requires -blockdev for a sane way to configure things).
> >>
> >> Kevin
> >
> > What are the disadvantages of such an approach for image streaming,
> > versus the current QED approach?
> >
> > blkstream block driver:
> >
> > - Maintain in memory whether given block is allocated in local image,
> > if not, read from remote, write to local. Set block as local.
> > Local and remote simply two block drivers from image streaming driver
> > POV.
>
> Why maintain it in memory? We already have mechanisms to track this in
> COW image formats, so that you can even continue after a crash.
Which mechanism is that? You'd need separate metadata for image
streaming purposes, since you might want the destination-image to
use a backing image itself.
> We can still add a raw-cow driver that maintains the COW data in memory
> for allowing raw copies, if this is needed.
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 12:35 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Kevin Wolf
2011-06-16 12:49 ` [Qemu-devel] Image streaming and live block copy Avi Kivity
@ 2011-06-16 13:10 ` Anthony Liguori
2011-06-16 13:50 ` Kevin Wolf
2011-06-16 14:38 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Marcelo Tosatti
2 siblings, 1 reply; 43+ messages in thread
From: Anthony Liguori @ 2011-06-16 13:10 UTC (permalink / raw)
To: Kevin Wolf
Cc: Dor Laor, agl, Stefan Hajnoczi, jes sorensen, Marcelo Tosatti,
qemu-devel, Avi Kivity
On 06/16/2011 07:35 AM, Kevin Wolf wrote:
> Am 14.06.2011 20:18, schrieb Stefan Hajnoczi:
>> Overview
>> --------
>>
>> This patch series adds image streaming support for QED image files. Other
>> image formats can also be supported in the future.
>>
>> Image streaming populates the file in the background while the guest is
>> running. This makes it possible to start the guest before its image file has
>> been fully provisioned.
>>
>> Example use cases include:
>> * Providing small virtual appliances for download that can be launched
>> immediately but provision themselves in the background.
>> * Reducing guest provisioning time by creating local image files but backing
>> them with shared master images which will be streamed.
>>
>> When image streaming is enabled, the unallocated regions of the image file are
>> populated with the data from the backing file. This occurs in the background
>> and the guest can perform regular I/O in the meantime. Once the entire backing
>> file has been streamed, the image no longer requires a backing file and will
>> drop its reference
>
> Long CC list and Kevin wearing his upstream hat - this might be an
> unpleasant email. :-)
>
> So yesterday I had separate discussions with Stefan about image
> streaming and Marcelo, Avi and some other folks about live block copy.
> The conclusion was in both cases that, yes, both things are pretty
> similar and, yes, the current implementations don't reflect that but
> duplicate everything.
>
> To summarise what both things are about:
>
> * Image streaming is a normal image file plus copy-on-read plus a
> background task that copies data from the source image
> * Live block copy is a block-mirror of two normal image files plus a
> background task that copies data from the source image
Is this correct in practice?
Image streaming has the following semantics for A -> B where B is the
backing file of A.
1) All writes go to A.
> 2) If a read can be satisfied by A, read from A, else read from B, copy
to A, then return
Block copy has the following semantics where A is the source and B is
the destination.
1) All reads and writes go to A
2) Copy data from B to A in the background
3) When B matches the content of A, switch over to B
Other than at a hand wave, they both do copies, I'm not sure I see the
overlap in implementations.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 13:10 ` Anthony Liguori
@ 2011-06-16 13:50 ` Kevin Wolf
0 siblings, 0 replies; 43+ messages in thread
From: Kevin Wolf @ 2011-06-16 13:50 UTC (permalink / raw)
To: Anthony Liguori
Cc: Dor Laor, agl, Stefan Hajnoczi, jes sorensen, Marcelo Tosatti,
qemu-devel, Avi Kivity
Am 16.06.2011 15:10, schrieb Anthony Liguori:
> On 06/16/2011 07:35 AM, Kevin Wolf wrote:
>> To summarise what both things are about:
>>
>> * Image streaming is a normal image file plus copy-on-read plus a
>> background task that copies data from the source image
>> * Live block copy is a block-mirror of two normal image files plus a
>> background task that copies data from the source image
>
> Is this correct in practice?
>
> Image streaming has the following semantics for A -> B where B is the
> backing file of A.
That B is a backing file of A is an implementation detail and not a
requirement.
> 1) All writes go to A.
> 2) If a read can be satisified by A, read from A, else read from B, copy
> to A, then return
>
> Block copy has the following semantics where A is the source and B is
> the destination.
>
> 1) All reads and writes go to A
> 2) Copy data from B to A in the background
> 3) When B matches the content of A, switch over to B
3) is optional, it would be like adding an item for image streaming that
it drops the backing file as soon as everything has been copied.
> Other than at a hand wave, they both do copies, I'm not sure I see the
> overlap in implementations.
One thing is handling concurrent requests. If there's a concurrent guest
write request, it must always have precedence over the background copy/COR.
And even if we couldn't use a common implementation for live block copy
and image streaming, I think it's something that shouldn't be duplicated
for copy on read in each image format driver. I think it's possible to
have a generic COR implementation that would not only work for QED, but
also for qcow2, VMDK and any other format implementing backing files
without adding code to each driver.
Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming)
2011-06-16 12:35 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Kevin Wolf
2011-06-16 12:49 ` [Qemu-devel] Image streaming and live block copy Avi Kivity
2011-06-16 13:10 ` Anthony Liguori
@ 2011-06-16 14:38 ` Marcelo Tosatti
2011-06-16 14:55 ` Marcelo Tosatti
2011-06-17 8:21 ` [Qemu-devel] Image streaming and live block copy Kevin Wolf
2 siblings, 2 replies; 43+ messages in thread
From: Marcelo Tosatti @ 2011-06-16 14:38 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Stefan Hajnoczi, jes sorensen, Dor Laor,
qemu-devel, Avi Kivity, Adam Litke
On Thu, Jun 16, 2011 at 02:35:37PM +0200, Kevin Wolf wrote:
> Am 14.06.2011 20:18, schrieb Stefan Hajnoczi:
> > Overview
> > --------
> >
> > This patch series adds image streaming support for QED image files. Other
> > image formats can also be supported in the future.
> >
> > Image streaming populates the file in the background while the guest is
> > running. This makes it possible to start the guest before its image file has
> > been fully provisioned.
> >
> > Example use cases include:
> > * Providing small virtual appliances for download that can be launched
> > immediately but provision themselves in the background.
> > * Reducing guest provisioning time by creating local image files but backing
> > them with shared master images which will be streamed.
> >
> > When image streaming is enabled, the unallocated regions of the image file are
> > populated with the data from the backing file. This occurs in the background
> > and the guest can perform regular I/O in the meantime. Once the entire backing
> > file has been streamed, the image no longer requires a backing file and will
> > drop its reference
>
> Long CC list and Kevin wearing his upstream hat - this might be an
> unpleasant email. :-)
>
> So yesterday I had separate discussions with Stefan about image
> streaming and Marcelo, Avi and some other folks about live block copy.
> The conclusion was in both cases that, yes, both things are pretty
> similar and, yes, the current implementations don't reflect that but
> duplicate everything.
>
> To summarise what both things are about:
>
> * Image streaming is a normal image file plus copy-on-read plus a
> background task that copies data from the source image
> * Live block copy is a block-mirror of two normal image files plus a
> background task that copies data from the source image
>
> The right solution is probably to implement COR and the background task
> in generic block layer code (no reason to restrict it to QED) and use it
> for both image streaming and live block copy. (This is a bit more
> complicated than it may sound here because guest writes must always take
> precedence over a copy - but doing complicated things is an even better
> reason to do it in a common place instead of duplicating)
>
> People seem to agree on this, and the reason that I've heard why we
> should merge the existing code instead is downstream time pressure. That
> may be a valid reason for downstreams to add such code, but is taking
> such code really the best option for upstream (and therefore long-term
> for downstreams)?
>
> If we take these patches as they are, I doubt that we'll ever get a
> rewrite to implement the code as it should have been done in the first
> place.
That (a newer, unified mechanism) is just a matter of allocating
resources to the implementation.
At least in block copy's case the interface can be reused, so it can be
seen as an incremental approach (read: advocating in favour of merging
live block copy patchset).
> So I'm tempted to reject the current versions of both the image
> streaming and live block copy series and leave it to downstreams to use
> these as temporary solutions if the time pressure is too high. I know
> that maintaining things downstream is painful, but that's the whole
> point: I want to see the real implementation one day, and I'm afraid
> this might be the only way to get it.
>
> Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming)
2011-06-16 14:38 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Marcelo Tosatti
@ 2011-06-16 14:55 ` Marcelo Tosatti
2011-06-17 8:21 ` [Qemu-devel] Image streaming and live block copy Kevin Wolf
1 sibling, 0 replies; 43+ messages in thread
From: Marcelo Tosatti @ 2011-06-16 14:55 UTC (permalink / raw)
To: Kevin Wolf
Cc: Anthony Liguori, Stefan Hajnoczi, jes sorensen, Dor Laor,
qemu-devel, Avi Kivity, Adam Litke
On Thu, Jun 16, 2011 at 11:38:48AM -0300, Marcelo Tosatti wrote:
> > People seem to agree on this, and the reason that I've heard why we
> > should merge the existing code instead is downstream time pressure. That
> > may be a valid reason for downstreams to add such code, but is taking
> > such code really the best option for upstream (and therefore long-term
> > for downstreams)?
> >
> > If we take these patches as they are, I doubt that we'll ever get a
> > rewrite to implement the code as it should have been done in the first
> > place.
>
> That (a newer, unified mechanism) is just a matter of allocating
> resources to the implementation.
>
> At least in block copy's case the interface can be reused, so it can be
> seen as an incremental approach (read: advocating in favour of merging
> live block copy patchset).
>
> > So I'm tempted to reject the current versions of both the image
> > streaming and live block copy series and leave it to downstreams to use
> > these as temporary solutions if the time pressure is too high. I know
> > that maintaining things downstream is painful, but that's the whole
> > point: I want to see the real implementation one day, and I'm afraid
> > this might be the only way to get it.
Again: there is no need to reject current live block copy
implementation, as image streaming implemented for generic image formats
is a strong motivator behind the unified implementation.
^ permalink raw reply [flat|nested] 43+ messages in thread
* Re: [Qemu-devel] Image streaming and live block copy
2011-06-16 14:38 ` [Qemu-devel] Image streaming and live block copy (was: [PATCH 00/13] QED image streaming) Marcelo Tosatti
2011-06-16 14:55 ` Marcelo Tosatti
@ 2011-06-17 8:21 ` Kevin Wolf
1 sibling, 0 replies; 43+ messages in thread
From: Kevin Wolf @ 2011-06-17 8:21 UTC (permalink / raw)
To: Marcelo Tosatti
Cc: Anthony Liguori, Stefan Hajnoczi, jes sorensen, Dor Laor,
qemu-devel, Avi Kivity, Adam Litke
Am 16.06.2011 16:38, schrieb Marcelo Tosatti:
> On Thu, Jun 16, 2011 at 02:35:37PM +0200, Kevin Wolf wrote:
>> Am 14.06.2011 20:18, schrieb Stefan Hajnoczi:
>>> Overview
>>> --------
>>>
>>> This patch series adds image streaming support for QED image files. Other
>>> image formats can also be supported in the future.
>>>
>>> Image streaming populates the file in the background while the guest is
>>> running. This makes it possible to start the guest before its image file has
>>> been fully provisioned.
>>>
>>> Example use cases include:
>>> * Providing small virtual appliances for download that can be launched
>>> immediately but provision themselves in the background.
>>> * Reducing guest provisioning time by creating local image files but backing
>>> them with shared master images which will be streamed.
>>>
>>> When image streaming is enabled, the unallocated regions of the image file are
>>> populated with the data from the backing file. This occurs in the background
>>> and the guest can perform regular I/O in the meantime. Once the entire backing
>>> file has been streamed, the image no longer requires a backing file and will
>>> drop its reference
>>
>> Long CC list and Kevin wearing his upstream hat - this might be an
>> unpleasant email. :-)
>>
>> So yesterday I had separate discussions with Stefan about image
>> streaming and Marcelo, Avi and some other folks about live block copy.
>> The conclusion was in both cases that, yes, both things are pretty
>> similar and, yes, the current implementations don't reflect that but
>> duplicate everything.
>>
>> To summarise what both things are about:
>>
>> * Image streaming is a normal image file plus copy-on-read plus a
>> background task that copies data from the source image
>> * Live block copy is a block-mirror of two normal image files plus a
>> background task that copies data from the source image
>>
>> The right solution is probably to implement COR and the background task
>> in generic block layer code (no reason to restrict it to QED) and use it
>> for both image streaming and live block copy. (This is a bit more
>> complicated than it may sound here because guest writes must always take
>> precedence over a copy - but doing complicated things is an even better
>> reason to do it in a common place instead of duplicating)
>>
>> People seem to agree on this, and the reason that I've heard why we
>> should merge the existing code instead is downstream time pressure. That
>> may be a valid reason for downstreams to add such code, but is taking
>> such code really the best option for upstream (and therefore long-term
>> for downstreams)?
>>
>> If we take these patches as they are, I doubt that we'll ever get a
>> rewrite to implement the code as it should have been done in the first
>> place.
>
> That (a newer, unified mechanism) is just a matter of allocating
> resources to the implementation.
>
> At least in block copy's case the interface can be reused, so it can be
> seen as an incremental approach (read: advocating in favour of merging
> live block copy patchset).
Right. However, my point was that I'm afraid that this resource
allocation and therefore the incremental improvement won't happen
because for most people (who don't care about the code) the result will
be "good enough" and the problem will only be visible for block layer
people who must work with the code.
Kevin
^ permalink raw reply [flat|nested] 43+ messages in thread