From: Christoph Hellwig
To: qemu-devel@nongnu.org
Subject: [Qemu-devel] [PATCH 3/5] push down vector linearization to posix-aio-compat.c
Date: Sun, 29 Mar 2009 21:54:52 +0200
Message-ID: <20090329195452.GC1215@lst.de>
In-Reply-To: <20090329195346.GA625@lst.de>
References: <20090329195346.GA625@lst.de>
List-Id: qemu-devel.nongnu.org

Make all AIO requests vectored and defer linearization until the actual
I/O thread.  This prepares for using native preadv/pwritev, and also
enables asynchronous direct I/O by handling the unaligned-buffer case in
the I/O thread.

Win32 support is untested and has not been compiled.  Then again, I
can't even find a knob to turn on Win32 AIO support.

Qcow and qcow2 probably want to be adapted to deal with multi-segment
requests directly, but that can be implemented later.
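For reference, this is how a caller drives the new vectored interface when it
only has a flat buffer -- a minimal sketch following the bdrv_read_em()
emulation in the diff below, and assuming the usual bs, buf, sector_num,
nb_sectors, cb and opaque variables are already in scope:

    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *acb;

    /* wrap the flat buffer in a single-segment I/O vector */
    iov.iov_base = buf;
    iov.iov_len = nb_sectors * 512;
    qemu_iovec_init_external(&qiov, &iov, 1);

    /* submit it through the vectored entry point */
    acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors, cb, opaque);
    if (acb == NULL)
        return -1;

The native path this patch prepares for would replace the bounce-buffer copy
in handle_aiocb_rw() with preadv/pwritev on the I/O vector itself.  A rough
sketch, not part of this patch, using the qemu_paiocb field names introduced
in posix-aio-compat.h below:

    static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
    {
        ssize_t len;

        do {
            if (aiocb->aio_type == QEMU_PAIO_WRITE)
                len = pwritev(aiocb->aio_fildes, aiocb->aio_iov,
                              aiocb->aio_niov, aiocb->aio_offset);
            else
                len = preadv(aiocb->aio_fildes, aiocb->aio_iov,
                             aiocb->aio_niov, aiocb->aio_offset);
        } while (len == -1 && errno == EINTR);

        if (len == -1)
            return -errno;
        return len;
    }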
Signed-off-by: Christoph Hellwig Index: qemu/block.c =================================================================== --- qemu.orig/block.c 2009-03-29 21:04:13.854131563 +0200 +++ qemu/block.c 2009-03-29 21:05:04.488130743 +0200 @@ -47,25 +47,21 @@ #define SECTOR_BITS 9 #define SECTOR_SIZE (1 << SECTOR_BITS) -static AIOPool vectored_aio_pool; - typedef struct BlockDriverAIOCBSync { BlockDriverAIOCB common; QEMUBH *bh; int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; } BlockDriverAIOCBSync; -static BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_aio_read_em(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque); -static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb); static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num, @@ -144,10 +140,10 @@ void path_combine(char *dest, int dest_s static void bdrv_register(BlockDriver *bdrv) { - if (!bdrv->bdrv_aio_read) { + if (!bdrv->bdrv_aio_readv) { /* add AIO emulation layer */ - bdrv->bdrv_aio_read = bdrv_aio_read_em; - bdrv->bdrv_aio_write = bdrv_aio_write_em; + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; bdrv->bdrv_aio_cancel = bdrv_aio_cancel_em; bdrv->aiocb_size = sizeof(BlockDriverAIOCBSync); } else if (!bdrv->bdrv_read) { @@ -1275,91 +1271,10 @@ char *bdrv_snapshot_dump(char *buf, int /**************************************************************/ /* async I/Os */ -typedef struct VectorTranslationAIOCB { - BlockDriverAIOCB common; - QEMUIOVector *iov; - uint8_t *bounce; - int is_write; - BlockDriverAIOCB *aiocb; -} VectorTranslationAIOCB; - -static void bdrv_aio_cancel_vector(BlockDriverAIOCB *_acb) -{ - VectorTranslationAIOCB *acb - = container_of(_acb, VectorTranslationAIOCB, common); - - bdrv_aio_cancel(acb->aiocb); -} - -static void bdrv_aio_rw_vector_cb(void *opaque, int ret) -{ - VectorTranslationAIOCB *s = (VectorTranslationAIOCB *)opaque; - - if (!s->is_write) { - qemu_iovec_from_buffer(s->iov, s->bounce, s->iov->size); - } - qemu_vfree(s->bounce); - s->common.cb(s->common.opaque, ret); - qemu_aio_release(s); -} - -static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *iov, - int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque, - int is_write) - -{ - VectorTranslationAIOCB *s = qemu_aio_get_pool(&vectored_aio_pool, bs, - cb, opaque); - - s->iov = iov; - s->bounce = qemu_memalign(512, nb_sectors * 512); - s->is_write = is_write; - if (is_write) { - qemu_iovec_to_buffer(s->iov, s->bounce); - s->aiocb = bdrv_aio_write(bs, sector_num, s->bounce, nb_sectors, - bdrv_aio_rw_vector_cb, s); - } else { - s->aiocb = bdrv_aio_read(bs, sector_num, s->bounce, nb_sectors, - bdrv_aio_rw_vector_cb, s); - } - if 
(!s->aiocb) { - qemu_vfree(s->bounce); - qemu_aio_release(s); - return NULL; - } - return &s->common; -} - BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, + QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return NULL; - - return bdrv_aio_rw_vector(bs, sector_num, iov, nb_sectors, - cb, opaque, 0); -} - -BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return NULL; - - return bdrv_aio_rw_vector(bs, sector_num, iov, nb_sectors, - cb, opaque, 1); -} - -static BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ BlockDriver *drv = bs->drv; BlockDriverAIOCB *ret; @@ -1368,7 +1283,8 @@ static BlockDriverAIOCB *bdrv_aio_read(B if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; - ret = drv->bdrv_aio_read(bs, sector_num, buf, nb_sectors, cb, opaque); + ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors, + cb, opaque); if (ret) { /* Update stats even though technically transfer has not happened. */ @@ -1379,9 +1295,9 @@ static BlockDriverAIOCB *bdrv_aio_read(B return ret; } -static BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) { BlockDriver *drv = bs->drv; BlockDriverAIOCB *ret; @@ -1393,7 +1309,8 @@ static BlockDriverAIOCB *bdrv_aio_write( if (bdrv_check_request(bs, sector_num, nb_sectors)) return NULL; - ret = drv->bdrv_aio_write(bs, sector_num, buf, nb_sectors, cb, opaque); + ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors, + cb, opaque); if (ret) { /* Update stats even though technically transfer has not happened. 
*/ @@ -1416,42 +1333,62 @@ void bdrv_aio_cancel(BlockDriverAIOCB *a static void bdrv_aio_bh_cb(void *opaque) { BlockDriverAIOCBSync *acb = opaque; + + qemu_vfree(acb->bounce); + + if (!acb->is_write) + qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size); acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_release(acb); } -static BlockDriverAIOCB *bdrv_aio_read_em(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + int is_write) + { BlockDriverAIOCBSync *acb; - int ret; acb = qemu_aio_get(bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_memalign(512, qiov->size); + if (!acb->bh) acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - acb->ret = ret; + + if (is_write) { + qemu_iovec_to_buffer(acb->qiov, acb->bounce); + acb->ret = bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + qemu_bh_schedule(acb->bh); + return &acb->common; } -static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - BlockDriverAIOCBSync *acb; - int ret; + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} - acb = qemu_aio_get(bs, cb, opaque); - if (!acb->bh) - acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb); - ret = bdrv_write(bs, sector_num, buf, nb_sectors); - acb->ret = ret; - qemu_bh_schedule(acb->bh); - return &acb->common; +static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); } + static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb) { BlockDriverAIOCBSync *acb = (BlockDriverAIOCBSync *)blockacb; @@ -1474,10 +1411,15 @@ static int bdrv_read_em(BlockDriverState { int async_ret; BlockDriverAIOCB *acb; + struct iovec iov; + QEMUIOVector qiov; async_ret = NOT_DONE; - acb = bdrv_aio_read(bs, sector_num, buf, nb_sectors, - bdrv_rw_em_cb, &async_ret); + iov.iov_base = buf; + iov.iov_len = nb_sectors * 512; + qemu_iovec_init_external(&qiov, &iov, 1); + acb = bdrv_aio_readv(bs, sector_num, &qiov, nb_sectors, + bdrv_rw_em_cb, &async_ret); if (acb == NULL) return -1; @@ -1493,10 +1435,15 @@ static int bdrv_write_em(BlockDriverStat { int async_ret; BlockDriverAIOCB *acb; + struct iovec iov; + QEMUIOVector qiov; async_ret = NOT_DONE; - acb = bdrv_aio_write(bs, sector_num, buf, nb_sectors, - bdrv_rw_em_cb, &async_ret); + iov.iov_base = (void *)buf; + iov.iov_len = nb_sectors * 512; + qemu_iovec_init_external(&qiov, &iov, 1); + acb = bdrv_aio_writev(bs, sector_num, &qiov, nb_sectors, + bdrv_rw_em_cb, &async_ret); if (acb == NULL) return -1; while (async_ret == NOT_DONE) { @@ -1507,9 +1454,6 @@ static int bdrv_write_em(BlockDriverStat void bdrv_init(void) { - aio_pool_init(&vectored_aio_pool, sizeof(VectorTranslationAIOCB), - bdrv_aio_cancel_vector); - bdrv_register(&bdrv_raw); bdrv_register(&bdrv_host_device); #ifndef _WIN32 Index: qemu/block-qcow.c 
=================================================================== --- qemu.orig/block-qcow.c 2009-03-29 21:04:13.861130779 +0200 +++ qemu/block-qcow.c 2009-03-29 21:17:24.549074800 +0200 @@ -525,7 +525,9 @@ static int qcow_write(BlockDriverState * typedef struct QCowAIOCB { BlockDriverAIOCB common; int64_t sector_num; + QEMUIOVector *qiov; uint8_t *buf; + void *orig_buf; int nb_sectors; int n; uint64_t cluster_offset; @@ -543,12 +545,8 @@ static void qcow_aio_read_cb(void *opaqu int index_in_cluster; acb->hd_aiocb = NULL; - if (ret < 0) { - fail: - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - return; - } + if (ret < 0) + goto done; redo: /* post process the read buffer */ @@ -570,9 +568,8 @@ static void qcow_aio_read_cb(void *opaqu if (acb->nb_sectors == 0) { /* request completed */ - acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); - return; + ret = 0; + goto done; } /* prepare next AIO request */ @@ -592,7 +589,7 @@ static void qcow_aio_read_cb(void *opaqu acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; } else { /* Note: in this case, no need to wait */ memset(acb->buf, 0, 512 * acb->n); @@ -601,14 +598,14 @@ static void qcow_aio_read_cb(void *opaqu } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ if (decompress_cluster(s, acb->cluster_offset) < 0) - goto fail; + goto done; memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, 512 * acb->n); goto redo; } else { if ((acb->cluster_offset & 511) != 0) { ret = -EIO; - goto fail; + goto done; } acb->hd_iov.iov_base = acb->buf; acb->hd_iov.iov_len = acb->n * 512; @@ -617,12 +614,22 @@ static void qcow_aio_read_cb(void *opaqu (acb->cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; + } + + return; + +done: + if (acb->qiov->niov > 1) { + qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); + qemu_vfree(acb->orig_buf); } + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); } -static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { QCowAIOCB *acb; @@ -632,7 +639,11 @@ static BlockDriverAIOCB *qcow_aio_read(B return NULL; acb->hd_aiocb = NULL; acb->sector_num = sector_num; - acb->buf = buf; + acb->qiov = qiov; + if (qiov->niov > 1) + acb->buf = acb->orig_buf = qemu_memalign(512, qiov->size); + else + acb->buf = qiov->iov->iov_base; acb->nb_sectors = nb_sectors; acb->n = 0; acb->cluster_offset = 0; @@ -652,12 +663,8 @@ static void qcow_aio_write_cb(void *opaq acb->hd_aiocb = NULL; - if (ret < 0) { - fail: - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - return; - } + if (ret < 0) + goto done; acb->nb_sectors -= acb->n; acb->sector_num += acb->n; @@ -665,9 +672,8 @@ static void qcow_aio_write_cb(void *opaq if (acb->nb_sectors == 0) { /* request completed */ - acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); - return; + ret = 0; + goto done; } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); @@ -679,14 +685,14 @@ static void qcow_aio_write_cb(void *opaq index_in_cluster + acb->n); if (!cluster_offset || (cluster_offset & 511) != 0) { ret = -EIO; - goto 
fail; + goto done; } if (s->crypt_method) { if (!acb->cluster_data) { acb->cluster_data = qemu_mallocz(s->cluster_size); if (!acb->cluster_data) { ret = -ENOMEM; - goto fail; + goto done; } } encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf, @@ -704,11 +710,18 @@ static void qcow_aio_write_cb(void *opaq &acb->hd_qiov, acb->n, qcow_aio_write_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; + return; + +done: + if (acb->qiov->niov > 1) + qemu_vfree(acb->orig_buf); + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); } -static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { BDRVQcowState *s = bs->opaque; @@ -721,7 +734,12 @@ static BlockDriverAIOCB *qcow_aio_write( return NULL; acb->hd_aiocb = NULL; acb->sector_num = sector_num; - acb->buf = (uint8_t *)buf; + acb->qiov = qiov; + if (qiov->niov > 1) { + acb->buf = acb->orig_buf = qemu_memalign(512, qiov->size); + qemu_iovec_to_buffer(qiov, acb->buf); + } else + acb->buf = qiov->iov->iov_base; acb->nb_sectors = nb_sectors; acb->n = 0; @@ -909,8 +927,8 @@ BlockDriver bdrv_qcow = { .bdrv_is_allocated = qcow_is_allocated, .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, - .bdrv_aio_read = qcow_aio_read, - .bdrv_aio_write = qcow_aio_write, + .bdrv_aio_readv = qcow_aio_readv, + .bdrv_aio_writev = qcow_aio_writev, .bdrv_aio_cancel = qcow_aio_cancel, .aiocb_size = sizeof(QCowAIOCB), .bdrv_write_compressed = qcow_write_compressed, Index: qemu/block-qcow2.c =================================================================== --- qemu.orig/block-qcow2.c 2009-03-29 21:04:13.866142521 +0200 +++ qemu/block-qcow2.c 2009-03-29 21:16:35.706130187 +0200 @@ -1260,7 +1260,9 @@ static int qcow_write(BlockDriverState * typedef struct QCowAIOCB { BlockDriverAIOCB common; int64_t sector_num; + QEMUIOVector *qiov; uint8_t *buf; + void *orig_buf; int nb_sectors; int n; uint64_t cluster_offset; @@ -1303,12 +1305,8 @@ static void qcow_aio_read_cb(void *opaqu int index_in_cluster, n1; acb->hd_aiocb = NULL; - if (ret < 0) { -fail: - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - return; - } + if (ret < 0) + goto done; /* post process the read buffer */ if (!acb->cluster_offset) { @@ -1329,9 +1327,8 @@ fail: if (acb->nb_sectors == 0) { /* request completed */ - acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); - return; + ret = 0; + goto done; } /* prepare next AIO request */ @@ -1352,32 +1349,32 @@ fail: &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; } else { ret = qcow_schedule_bh(qcow_aio_read_bh, acb); if (ret < 0) - goto fail; + goto done; } } else { /* Note: in this case, no need to wait */ memset(acb->buf, 0, 512 * acb->n); ret = qcow_schedule_bh(qcow_aio_read_bh, acb); if (ret < 0) - goto fail; + goto done; } } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ if (decompress_cluster(s, acb->cluster_offset) < 0) - goto fail; + goto done; memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, 512 * acb->n); ret = qcow_schedule_bh(qcow_aio_read_bh, acb); if (ret < 0) - goto fail; + goto done; } else { if ((acb->cluster_offset & 511) != 0) { ret = -EIO; - goto fail; + goto done; } acb->hd_iov.iov_base = acb->buf; @@ -1387,13 +1384,22 @@ fail: (acb->cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; + } + + return; +done: + if (acb->qiov->niov > 1) { + qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); + qemu_vfree(acb->orig_buf); } + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); } static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque, int is_write) { QCowAIOCB *acb; @@ -1402,7 +1408,13 @@ static QCowAIOCB *qcow_aio_setup(BlockDr return NULL; acb->hd_aiocb = NULL; acb->sector_num = sector_num; - acb->buf = buf; + acb->qiov = qiov; + if (qiov->niov > 1) { + acb->buf = acb->orig_buf = qemu_memalign(512, qiov->size); + if (is_write) + qemu_iovec_to_buffer(qiov, acb->buf); + } else + acb->buf = qiov->iov->iov_base; acb->nb_sectors = nb_sectors; acb->n = 0; acb->cluster_offset = 0; @@ -1410,13 +1422,13 @@ static QCowAIOCB *qcow_aio_setup(BlockDr return acb; } -static BlockDriverAIOCB *qcow_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { QCowAIOCB *acb; - acb = qcow_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); + acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); if (!acb) return NULL; @@ -1435,16 +1447,12 @@ static void qcow_aio_write_cb(void *opaq acb->hd_aiocb = NULL; - if (ret < 0) { - fail: - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - return; - } + if (ret < 0) + goto done; if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) { free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters); - goto fail; + goto done; } acb->nb_sectors -= acb->n; @@ -1453,9 +1461,8 @@ static void qcow_aio_write_cb(void *opaq if (acb->nb_sectors == 0) { /* request completed */ - acb->common.cb(acb->common.opaque, 0); - qemu_aio_release(acb); - return; + ret = 0; + goto done; } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); @@ -1469,7 +1476,7 @@ static void qcow_aio_write_cb(void *opaq n_end, &acb->n, &acb->l2meta); if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) { ret = -EIO; - goto fail; + goto done; } if (s->crypt_method) { if (!acb->cluster_data) { @@ -1490,11 +1497,19 @@ static void qcow_aio_write_cb(void *opaq &acb->hd_qiov, acb->n, qcow_aio_write_cb, acb); if (acb->hd_aiocb == NULL) - goto fail; + goto done; + + return; + +done: + if (acb->qiov->niov > 1) + qemu_vfree(acb->orig_buf); + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); } -static BlockDriverAIOCB *qcow_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void 
*opaque) { BDRVQcowState *s = bs->opaque; @@ -1502,7 +1517,7 @@ static BlockDriverAIOCB *qcow_aio_write( s->cluster_cache_offset = -1; /* disable compressed cache */ - acb = qcow_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); + acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); if (!acb) return NULL; @@ -2736,8 +2751,8 @@ BlockDriver bdrv_qcow2 = { .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, - .bdrv_aio_read = qcow_aio_read, - .bdrv_aio_write = qcow_aio_write, + .bdrv_aio_readv = qcow_aio_readv, + .bdrv_aio_writev = qcow_aio_writev, .bdrv_aio_cancel = qcow_aio_cancel, .aiocb_size = sizeof(QCowAIOCB), .bdrv_write_compressed = qcow_write_compressed, Index: qemu/block-raw-posix.c =================================================================== --- qemu.orig/block-raw-posix.c 2009-03-29 21:04:13.944130816 +0200 +++ qemu/block-raw-posix.c 2009-03-29 21:05:04.498129504 +0200 @@ -566,8 +566,8 @@ static int posix_aio_init(void) return 0; } -static RawAIOCB *raw_aio_setup(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; @@ -581,24 +581,25 @@ static RawAIOCB *raw_aio_setup(BlockDriv return NULL; acb->aiocb.aio_fildes = s->fd; acb->aiocb.ev_signo = SIGUSR2; - acb->aiocb.aio_buf = buf; - if (nb_sectors < 0) - acb->aiocb.aio_nbytes = -nb_sectors; - else - acb->aiocb.aio_nbytes = nb_sectors * 512; + acb->aiocb.aio_iov = qiov->iov; + acb->aiocb.aio_niov = qiov->niov; + acb->aiocb.aio_nbytes = nb_sectors * 512; acb->aiocb.aio_offset = sector_num * 512; + acb->aiocb.aio_flags = 0; + + /* + * If O_DIRECT is used the buffer needs to be aligned on a sector + * boundary. Tell the low level code to ensure that in case it's + * not done yet. + */ + if (s->aligned_buf) + acb->aiocb.aio_flags |= QEMU_AIO_SECTOR_ALIGNED; + acb->next = posix_aio_state->first_aio; posix_aio_state->first_aio = acb; return acb; } -static void raw_aio_em_cb(void* opaque) -{ - RawAIOCB *acb = opaque; - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_release(acb); -} - static void raw_aio_remove(RawAIOCB *acb) { RawAIOCB **pacb; @@ -618,28 +619,13 @@ static void raw_aio_remove(RawAIOCB *acb } } -static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { RawAIOCB *acb; - /* - * If O_DIRECT is used and the buffer is not aligned fall back - * to synchronous IO. 
- */ - BDRVRawState *s = bs->opaque; - - if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { - QEMUBH *bh; - acb = qemu_aio_get(bs, cb, opaque); - acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors); - bh = qemu_bh_new(raw_aio_em_cb, acb); - qemu_bh_schedule(bh); - return &acb->common; - } - - acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); + acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); if (!acb) return NULL; if (qemu_paio_read(&acb->aiocb) < 0) { @@ -649,28 +635,13 @@ static BlockDriverAIOCB *raw_aio_read(Bl return &acb->common; } -static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { RawAIOCB *acb; - /* - * If O_DIRECT is used and the buffer is not aligned fall back - * to synchronous IO. - */ - BDRVRawState *s = bs->opaque; - - if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { - QEMUBH *bh; - acb = qemu_aio_get(bs, cb, opaque); - acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors); - bh = qemu_bh_new(raw_aio_em_cb, acb); - qemu_bh_schedule(bh); - return &acb->common; - } - - acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); + acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); if (!acb) return NULL; if (qemu_paio_write(&acb->aiocb) < 0) { @@ -851,8 +822,8 @@ BlockDriver bdrv_raw = { .bdrv_flush = raw_flush, #ifdef CONFIG_AIO - .bdrv_aio_read = raw_aio_read, - .bdrv_aio_write = raw_aio_write, + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB), #endif @@ -1178,12 +1149,24 @@ static BlockDriverAIOCB *raw_aio_ioctl(B unsigned long int req, void *buf, BlockDriverCompletionFunc *cb, void *opaque) { + BDRVRawState *s = bs->opaque; RawAIOCB *acb; - acb = raw_aio_setup(bs, 0, buf, 0, cb, opaque); + if (fd_open(bs) < 0) + return NULL; + + acb = qemu_aio_get(bs, cb, opaque); if (!acb) return NULL; + acb->aiocb.aio_fildes = s->fd; + acb->aiocb.ev_signo = SIGUSR2; + acb->aiocb.aio_offset = 0; + acb->aiocb.aio_flags = 0; + + acb->next = posix_aio_state->first_aio; + posix_aio_state->first_aio = acb; + acb->aiocb.aio_ioctl_buf = buf; acb->aiocb.aio_ioctl_cmd = req; if (qemu_paio_ioctl(&acb->aiocb) < 0) { raw_aio_remove(acb); @@ -1350,8 +1333,8 @@ BlockDriver bdrv_host_device = { .bdrv_flush = raw_flush, #ifdef CONFIG_AIO - .bdrv_aio_read = raw_aio_read, - .bdrv_aio_write = raw_aio_write, + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB), #endif Index: qemu/block-raw-win32.c =================================================================== --- qemu.orig/block-raw-win32.c 2009-03-29 21:04:13.949130755 +0200 +++ qemu/block-raw-win32.c 2009-03-29 21:05:04.501203149 +0200 @@ -44,6 +44,10 @@ typedef struct RawAIOCB { BlockDriverAIOCB common; HANDLE hEvent; OVERLAPPED ov; + struct iovec *iov; + int nr_iov; + char *buf; + int is_write; int count; } RawAIOCB; @@ -188,6 +192,13 @@ static void raw_aio_cb(void *opaque) int ret; ret = GetOverlappedResult(s->hfile, &acb->ov, &ret_count, TRUE); + + if (acb->nr_iov > 1) { + if (is_write) + iovec_from_buffer(acb->iov, acb->nr_iov, acb->buf, acb->count); + qemu_vfree(acb->buf); + } + if (!ret || ret_count != acb->count) { 
acb->common.cb(acb->common.opaque, -EIO); } else { @@ -196,7 +207,7 @@ static void raw_aio_cb(void *opaque) } static RawAIOCB *raw_aio_setup(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, + int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { RawAIOCB *acb; @@ -220,44 +231,63 @@ static RawAIOCB *raw_aio_setup(BlockDriv return acb; } -static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, + int64_t sector_num, struct iovec *iov, int nr_iov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; RawAIOCB *acb; int ret; - acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); + acb = raw_aio_setup(bs, sector_num, nb_sectors, cb, opaque); if (!acb) return NULL; - ret = ReadFile(s->hfile, buf, acb->count, NULL, &acb->ov); + + acb->is_write = 0; + acb->iov = iov; + acb->nr_iov = nr_iov; + if (nr_iov > 1) + acb->buf = qemu_memalign(512, acb->count); + else + acb->buf = iov->iov_base; + + ret = ReadFile(s->hfile, acb->buf, acb->count, NULL, &acb->ov); if (!ret) { qemu_aio_release(acb); return NULL; } qemu_aio_release(acb); - return (BlockDriverAIOCB *)acb; + return &acb->common; } -static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, +static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, + int64_t sector_num, struct iovec *iov, int nr_iov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; RawAIOCB *acb; int ret; - acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); + acb = raw_aio_setup(bs, sector_num, nb_sectors, cb, opaque); if (!acb) return NULL; + + acb->is_write = 1; + acb->iov = iov; + acb->nr_iov = nr_iov; + if (nr_iov > 1) { + acb->buf = qemu_memalign(512, acb->count); + iovec_to_buffer(iov, nr_iov, acb->buf); + } else + acb->buf = iov->iov_base; + ret = WriteFile(s->hfile, buf, acb->count, NULL, &acb->ov); if (!ret) { qemu_aio_release(acb); return NULL; } qemu_aio_release(acb); - return (BlockDriverAIOCB *)acb; + return &acb->common; } static void raw_aio_cancel(BlockDriverAIOCB *blockacb) @@ -359,8 +389,8 @@ BlockDriver bdrv_raw = { .bdrv_flush = raw_flush, #ifdef WIN32_AIO - .bdrv_aio_read = raw_aio_read, - .bdrv_aio_write = raw_aio_write, + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB); #endif @@ -508,8 +538,8 @@ BlockDriver bdrv_host_device = { .bdrv_flush = raw_flush, #ifdef WIN32_AIO - .bdrv_aio_read = raw_aio_read, - .bdrv_aio_write = raw_aio_write, + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_cancel = raw_aio_cancel, .aiocb_size = sizeof(RawAIOCB); #endif Index: qemu/block_int.h =================================================================== --- qemu.orig/block_int.h 2009-03-29 21:04:13.953131055 +0200 +++ qemu/block_int.h 2009-03-29 21:05:04.504129989 +0200 @@ -54,11 +54,11 @@ struct BlockDriver { int (*bdrv_set_key)(BlockDriverState *bs, const char *key); int (*bdrv_make_empty)(BlockDriverState *bs); /* aio */ - BlockDriverAIOCB *(*bdrv_aio_read)(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors, + BlockDriverAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); - BlockDriverAIOCB 
*(*bdrv_aio_write)(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors, + BlockDriverAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque); void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb); int aiocb_size; Index: qemu/posix-aio-compat.c =================================================================== --- qemu.orig/posix-aio-compat.c 2009-03-29 21:04:13.957130657 +0200 +++ qemu/posix-aio-compat.c 2009-03-29 21:05:04.505179284 +0200 @@ -20,6 +20,7 @@ #include #include #include "osdep.h" +#include "qemu-common.h" #include "posix-aio-compat.h" @@ -76,45 +77,110 @@ static void thread_create(pthread_t *thr if (ret) die2(ret, "pthread_create"); } -static size_t handle_aiocb_readwrite(struct qemu_paiocb *aiocb) +static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb) +{ + int ret; + + ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); + if (ret == -1) + return -errno; + return ret; +} + +/* + * Check if we need to copy the data in the aiocb into a new + * properly aligned buffer. + */ +static int aiocb_needs_copy(struct qemu_paiocb *aiocb) +{ + if (aiocb->aio_flags & QEMU_AIO_SECTOR_ALIGNED) { + int i; + + for (i = 0; i < aiocb->aio_niov; i++) + if ((uintptr_t) aiocb->aio_iov[i].iov_base % 512) + return 1; + } + + return 0; +} + +static size_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf) { size_t offset = 0; - ssize_t len; + size_t len; while (offset < aiocb->aio_nbytes) { - if (aiocb->aio_type == QEMU_PAIO_WRITE) - len = pwrite(aiocb->aio_fildes, - (const char *)aiocb->aio_buf + offset, + if (aiocb->aio_type == QEMU_PAIO_WRITE) + len = pwrite(aiocb->aio_fildes, + (const char *)buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + else + len = pread(aiocb->aio_fildes, + buf + offset, aiocb->aio_nbytes - offset, aiocb->aio_offset + offset); - else - len = pread(aiocb->aio_fildes, - (char *)aiocb->aio_buf + offset, - aiocb->aio_nbytes - offset, - aiocb->aio_offset + offset); - - if (len == -1 && errno == EINTR) - continue; - else if (len == -1) { - offset = -errno; - break; - } else if (len == 0) - break; - offset += len; + if (len == -1 && errno == EINTR) + continue; + else if (len == -1) { + offset = -errno; + break; + } else if (len == 0) + break; + + offset += len; } return offset; } -static size_t handle_aiocb_ioctl(struct qemu_paiocb *aiocb) +static size_t handle_aiocb_rw(struct qemu_paiocb *aiocb) { - int ret; + size_t nbytes; + char *buf; - ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_buf); - if (ret == -1) - return -errno; - return ret; + if (!aiocb_needs_copy(aiocb) && aiocb->aio_niov == 1) { + /* + * If there is just a single buffer, and it is properly aligned + * we can just use plain pread/pwrite without any problems. + */ + return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); + } + + /* + * Ok, we have to do it the hard way, copy all segments into + * a single aligned buffer. 
+ */ + buf = qemu_memalign(512, aiocb->aio_nbytes); + if (aiocb->aio_type == QEMU_PAIO_WRITE) { + char *p = buf; + int i; + + for (i = 0; i < aiocb->aio_niov; ++i) { + memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); + p += aiocb->aio_iov[i].iov_len; + } + } + + nbytes = handle_aiocb_rw_linear(aiocb, buf); + if (aiocb->aio_type != QEMU_PAIO_WRITE) { + char *p = buf; + size_t count = aiocb->aio_nbytes, copy; + int i; + + for (i = 0; i < aiocb->aio_niov && count; ++i) { + copy = count; + if (copy > aiocb->aio_iov[i].iov_len) + copy = aiocb->aio_iov[i].iov_len; + memcpy(aiocb->aio_iov[i].iov_base, p, copy); + p += copy; + count -= copy; + } + } + qemu_vfree(buf); + + return nbytes; } static void *aio_thread(void *unused) @@ -157,7 +223,7 @@ static void *aio_thread(void *unused) switch (aiocb->aio_type) { case QEMU_PAIO_READ: case QEMU_PAIO_WRITE: - ret = handle_aiocb_readwrite(aiocb); + ret = handle_aiocb_rw(aiocb); break; case QEMU_PAIO_IOCTL: ret = handle_aiocb_ioctl(aiocb); Index: qemu/posix-aio-compat.h =================================================================== --- qemu.orig/posix-aio-compat.h 2009-03-29 21:04:13.963130724 +0200 +++ qemu/posix-aio-compat.h 2009-03-29 21:05:04.506178574 +0200 @@ -27,11 +27,18 @@ struct qemu_paiocb { int aio_fildes; - void *aio_buf; + union { + struct iovec *aio_iov; + void *aio_ioctl_buf; + }; + int aio_niov; size_t aio_nbytes; #define aio_ioctl_cmd aio_nbytes /* for QEMU_PAIO_IOCTL */ int ev_signo; off_t aio_offset; + unsigned aio_flags; +/* 512 byte alignment required for buffer, offset and length */ +#define QEMU_AIO_SECTOR_ALIGNED 0x01 /* private */ TAILQ_ENTRY(qemu_paiocb) node;