From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from [140.186.70.92] (port=59299 helo=eggs.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1Q56Nd-0007xg-7M for qemu-devel@nongnu.org; Wed, 30 Mar 2011 21:08:58 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1Q56Nb-00018I-UL for qemu-devel@nongnu.org; Wed, 30 Mar 2011 21:08:57 -0400 Received: from e8.ny.us.ibm.com ([32.97.182.138]:50273) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Q56Nb-00016e-Rc for qemu-devel@nongnu.org; Wed, 30 Mar 2011 21:08:55 -0400 Received: from d01dlp01.pok.ibm.com (d01dlp01.pok.ibm.com [9.56.224.56]) by e8.ny.us.ibm.com (8.14.4/8.13.1) with ESMTP id p2V0hfmT011857 for ; Wed, 30 Mar 2011 20:43:41 -0400 Received: from d01relay06.pok.ibm.com (d01relay06.pok.ibm.com [9.56.227.116]) by d01dlp01.pok.ibm.com (Postfix) with ESMTP id 59A7F38C8038 for ; Wed, 30 Mar 2011 21:08:24 -0400 (EDT) Received: from d01av02.pok.ibm.com (d01av02.pok.ibm.com [9.56.224.216]) by d01relay06.pok.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id p2V18UVI2478152 for ; Wed, 30 Mar 2011 21:08:30 -0400 Received: from d01av02.pok.ibm.com (loopback [127.0.0.1]) by d01av02.pok.ibm.com (8.14.4/8.13.1/NCO v10.0 AVout) with ESMTP id p2V18Tlc022392 for ; Wed, 30 Mar 2011 22:08:29 -0300 From: Anthony Liguori Date: Wed, 30 Mar 2011 20:08:34 -0500 Message-Id: <1301533714-28997-1-git-send-email-aliguori@us.ibm.com> Subject: [Qemu-devel] [RFC PATCH] qed: add support for Copy-on-Read List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: Kevin Wolf , Anthony Liguori , Stefan Hajnoczi When creating an image using qemu-img, just pass '-o copy_on_read' and then whenever QED reads from a backing file, it will write the block to the QED file after the read completes ensuring that you only fetch from the backing device once. This is very useful for streaming images over a slow connection. 
This isn't ready for merge yet as it's not playing nice with synchronous I/O. I think it's fairly easy to do the same thing in qcow2 by just adding some logic after bdrv_aio_write() to call back into qcow2 with a synchronous I/O write in the backing file case. Thoughts on whether that would actually work? Signed-off-by: Anthony Liguori --- block/qed.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- block/qed.h | 4 +++- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/block/qed.c b/block/qed.c index 75ae244..292833b 100644 --- a/block/qed.c +++ b/block/qed.c @@ -448,7 +448,8 @@ static int bdrv_qed_flush(BlockDriverState *bs) static int qed_create(const char *filename, uint32_t cluster_size, uint64_t image_size, uint32_t table_size, - const char *backing_file, const char *backing_fmt) + const char *backing_file, const char *backing_fmt, + bool copy_on_read) { QEDHeader header = { .magic = QED_MAGIC, @@ -490,6 +491,9 @@ static int qed_create(const char *filename, uint32_t cluster_size, if (qed_fmt_is_raw(backing_fmt)) { header.features |= QED_F_BACKING_FORMAT_NO_PROBE; } + if (copy_on_read) { + header.compat_features |= QED_CF_COPY_ON_READ; + } } qed_header_cpu_to_le(&header, &le_header); @@ -523,6 +527,7 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) uint32_t table_size = QED_DEFAULT_TABLE_SIZE; const char *backing_file = NULL; const char *backing_fmt = NULL; + bool copy_on_read = false; while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { @@ -539,6 +544,10 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) if (options->value.n) { table_size = options->value.n; } + } else if (!strcmp(options->name, "copy_on_read")) { + if (options->value.n) { + copy_on_read = true; + } } options++; } @@ -559,9 +568,14 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) qed_max_image_size(cluster_size, table_size)); return 
-EINVAL; } + if (copy_on_read && !backing_file) { + fprintf(stderr, + "QED only supports Copy-on-Read with a backing file\n"); + return -EINVAL; + } return qed_create(filename, cluster_size, image_size, table_size, - backing_file, backing_fmt); + backing_file, backing_fmt, copy_on_read); } typedef struct { @@ -1085,6 +1099,27 @@ static void qed_aio_write_data(void *opaque, int ret, } /** + * Copy on read callback + * + * Write data from backing file to QED that's been read if CoR is enabled. + */ +static void qed_copy_on_read_cb(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + BlockDriverAIOCB *cor_acb; + + cor_acb = bdrv_aio_writev(s->bs, + acb->cur_pos / BDRV_SECTOR_SIZE, + &acb->cur_qiov, + acb->cur_qiov.size / BDRV_SECTOR_SIZE, + qed_aio_next_io, acb); + if (!cor_acb) { + qed_aio_complete(acb, -EIO); + } +} + +/** * Read data cluster * * @opaque: Read request @@ -1102,6 +1137,7 @@ static void qed_aio_read_data(void *opaque, int ret, BDRVQEDState *s = acb_to_s(acb); BlockDriverState *bs = acb->common.bs; BlockDriverAIOCB *file_acb; + BlockDriverCompletionFunc *cb; /* Adjust offset into cluster */ offset += qed_offset_into_cluster(s, acb->cur_pos); @@ -1114,10 +1150,15 @@ static void qed_aio_read_data(void *opaque, int ret, qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + cb = qed_aio_next_io; + /* Handle backing file and unallocated sparse hole reads */ if (ret != QED_CLUSTER_FOUND) { + if ((s->header.compat_features & QED_CF_COPY_ON_READ)) { + cb = qed_copy_on_read_cb; + } qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, - qed_aio_next_io, acb); + cb, acb); return; } @@ -1125,7 +1166,7 @@ static void qed_aio_read_data(void *opaque, int ret, file_acb = bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, - qed_aio_next_io, acb); + cb, acb); if (!file_acb) { ret = -EIO; goto err; @@ -1338,6 +1379,10 @@ static QEMUOptionParameter 
qed_create_options[] = { .name = BLOCK_OPT_TABLE_SIZE, .type = OPT_SIZE, .help = "L1/L2 table size (in clusters)" + }, { + .name = "copy_on_read", + .type = OPT_FLAG, + .help = "Copy blocks from base image on read" }, { /* end of list */ } }; diff --git a/block/qed.h b/block/qed.h index 2925e37..ec958af 100644 --- a/block/qed.h +++ b/block/qed.h @@ -53,6 +53,8 @@ enum { /* The image needs a consistency check before use */ QED_F_NEED_CHECK = 0x02, + QED_CF_COPY_ON_READ = 0x01, + /* The backing file format must not be probed, treat as raw image */ QED_F_BACKING_FORMAT_NO_PROBE = 0x04, @@ -60,7 +62,7 @@ enum { QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ QED_F_NEED_CHECK | QED_F_BACKING_FORMAT_NO_PROBE, - QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */ + QED_COMPAT_FEATURE_MASK = QED_CF_COPY_ON_READ, /* supported compat feature bits */ QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */ /* Data is stored in groups of sectors called clusters. Cluster size must -- 1.7.0.4