From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:41593) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YJ1vv-00032p-6v for qemu-devel@nongnu.org; Wed, 04 Feb 2015 10:32:19 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1YJ1vi-0008IV-Gg for qemu-devel@nongnu.org; Wed, 04 Feb 2015 10:32:03 -0500 Received: from mx1.redhat.com ([209.132.183.28]:41095) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1YJ1vh-0008IA-SJ for qemu-devel@nongnu.org; Wed, 04 Feb 2015 10:31:50 -0500 Received: from int-mx10.intmail.prod.int.phx2.redhat.com (int-mx10.intmail.prod.int.phx2.redhat.com [10.5.11.23]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id t14FVlRc022002 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=FAIL) for ; Wed, 4 Feb 2015 10:31:48 -0500 From: Stefan Hajnoczi Date: Wed, 4 Feb 2015 15:31:42 +0000 Message-Id: <1423063902-4926-1-git-send-email-stefanha@redhat.com> Subject: [Qemu-devel] [PATCH] block: move I/O request processing to block/io.c List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: Kevin Wolf , Stefan Hajnoczi The block.c file has grown to over 6000 lines. It is time to split this file so there are fewer conflicts and the code is easier to maintain. Extract I/O request processing code: * Read * Write * Flush * Discard * ioctl * Tracked requests and queuing * Throttling and copy-on-read The patch simply moves code from block.c into block/io.c. No code changes are made except adding the following block_int.h functions so they can be called across block.c and block/io.c: bdrv_drain_one(), bdrv_set_dirty(), bdrv_reset_dirty(). I/O request processing needs to set up BlockDriver coroutine and AIO emulation function pointers, so add bdrv_setup_io_funcs(bdrv) interface that block.c calls. Signed-off-by: Stefan Hajnoczi --- block.c | 3296 +++++++++------------------------------------ block/Makefile.objs | 1 + block/io.c | 1997 +++++++++++++++++++++++++++ include/block/block_int.h | 14 + 4 files changed, 2673 insertions(+), 2635 deletions(-) create mode 100644 block/io.c diff --git a/block.c b/block.c index d45e4dd..939a04a 100644 --- a/block.c +++ b/block.c @@ -58,36 +58,6 @@ struct BdrvDirtyBitmap { #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); -static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); - static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -97,10 +67,6 @@ static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); -static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors); -static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors); /* If non-zero, use only whitelisted block drivers */ static int use_bdrv_whitelist; @@ -124,104 +90,6 @@ int is_windows_drive(const char *filename) } #endif -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) -{ - int i; - - throttle_config(&bs->throttle_state, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; - - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; - } - } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - - bdrv_start_throttled_reqs(bs); - - throttle_destroy(&bs->throttle_state); -} - -static void bdrv_throttle_read_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[0]); -} - -static void bdrv_throttle_write_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - qemu_co_enter_next(&bs->throttled_reqs[1]); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ -void bdrv_io_limits_enable(BlockDriverState *bs) -{ - assert(!bs->io_limits_enabled); - throttle_init(&bs->throttle_state, - bdrv_get_aio_context(bs), - QEMU_CLOCK_VIRTUAL, - bdrv_throttle_read_timer_cb, - bdrv_throttle_write_timer_cb, - bs); - bs->io_limits_enabled = true; -} - -/* This function makes an IO wait if needed - * - * @nb_sectors: the number of sectors of the IO - * @is_write: is the IO a write - */ -static void bdrv_io_limits_intercept(BlockDriverState *bs, - unsigned int bytes, - bool is_write) -{ - /* does this io must wait */ - bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); - - /* if must wait or any request of this type throttled queue the IO */ - if (must_wait || - !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); - } - - /* the IO will be executed, do the accounting */ - throttle_account(&bs->throttle_state, is_write, bytes); - - - /* if the next request must wait -> do nothing */ - if (throttle_schedule_timer(&bs->throttle_state, is_write)) { - return; - } - - /* else queue next request for execution */ - qemu_co_queue_next(&bs->throttled_reqs[is_write]); -} - size_t bdrv_opt_mem_align(BlockDriverState *bs) { if (!bs || !bs->drv) { @@ -335,20 +203,7 @@ void bdrv_get_full_backing_filename(BlockDriverState *bs, char *dest, size_t sz, void bdrv_register(BlockDriver *bdrv) { - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. - */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } + bdrv_setup_io_funcs(bdrv); QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list); } @@ -805,22 +660,6 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) return 0; } -/** - * The copy-on-read flag is actually a reference count so multiple users may - * use the feature without worrying about clobbering its previous state. - * Copy-on-read stays enabled until all users have called to disable it. - */ -void bdrv_enable_copy_on_read(BlockDriverState *bs) -{ - bs->copy_on_read++; -} - -void bdrv_disable_copy_on_read(BlockDriverState *bs) -{ - assert(bs->copy_on_read > 0); - bs->copy_on_read--; -} - /* * Returns the flags that a temporary snapshot should get, based on the * originally requested flags (the originally requested image will have flags @@ -1926,55 +1765,6 @@ void bdrv_close_all(void) } } -/* Check if any requests are in-flight (including throttled requests) */ -static bool bdrv_requests_pending(BlockDriverState *bs) -{ - if (!QLIST_EMPTY(&bs->tracked_requests)) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } - if (bs->file && bdrv_requests_pending(bs->file)) { - return true; - } - if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { - return true; - } - return false; -} - -static bool bdrv_drain_one(BlockDriverState *bs) -{ - bool bs_busy; - - bdrv_flush_io_queue(bs); - bdrv_start_throttled_reqs(bs); - bs_busy = bdrv_requests_pending(bs); - bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); - return bs_busy; -} - -/* - * Wait for pending requests to complete on a single BlockDriverState subtree - * - * See the warning in bdrv_drain_all(). This function can only be called if - * you are sure nothing can generate I/O because you have op blockers - * installed. - * - * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState - * AioContext. - */ -void bdrv_drain(BlockDriverState *bs) -{ - while (bdrv_drain_one(bs)) { - /* Keep iterating */ - } -} - /* * Wait for pending requests to complete across all BlockDriverStates * @@ -2327,152 +2117,6 @@ int bdrv_commit_all(void) return 0; } -/** - * Remove an active request from the tracked requests list - * - * This function should be called when a tracked request is completing. - */ -static void tracked_request_end(BdrvTrackedRequest *req) -{ - if (req->serialising) { - req->bs->serialising_in_flight--; - } - - QLIST_REMOVE(req, list); - qemu_co_queue_restart_all(&req->wait_queue); -} - -/** - * Add an active request to the tracked requests list - */ -static void tracked_request_begin(BdrvTrackedRequest *req, - BlockDriverState *bs, - int64_t offset, - unsigned int bytes, bool is_write) -{ - *req = (BdrvTrackedRequest){ - .bs = bs, - .offset = offset, - .bytes = bytes, - .is_write = is_write, - .co = qemu_coroutine_self(), - .serialising = false, - .overlap_offset = offset, - .overlap_bytes = bytes, - }; - - qemu_co_queue_init(&req->wait_queue); - - QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); -} - -static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) -{ - int64_t overlap_offset = req->offset & ~(align - 1); - unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - - overlap_offset; - - if (!req->serialising) { - req->bs->serialising_in_flight++; - req->serialising = true; - } - - req->overlap_offset = MIN(req->overlap_offset, overlap_offset); - req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); -} - -/** - * Round a region to cluster boundaries - */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) -{ - BlockDriverInfo bdi; - - if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { - *cluster_sector_num = sector_num; - *cluster_nb_sectors = nb_sectors; - } else { - int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; - *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); - *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + - nb_sectors, c); - } -} - -static int bdrv_get_cluster_size(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - int ret; - - ret = bdrv_get_info(bs, &bdi); - if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; - } else { - return bdi.cluster_size; - } -} - -static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, unsigned int bytes) -{ - /* aaaa bbbb */ - if (offset >= req->overlap_offset + req->overlap_bytes) { - return false; - } - /* bbbb aaaa */ - if (req->overlap_offset >= offset + bytes) { - return false; - } - return true; -} - -static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) -{ - BlockDriverState *bs = self->bs; - BdrvTrackedRequest *req; - bool retry; - bool waited = false; - - if (!bs->serialising_in_flight) { - return false; - } - - do { - retry = false; - QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (req == self || (!req->serialising && !self->serialising)) { - continue; - } - if (tracked_request_overlaps(req, self->overlap_offset, - self->overlap_bytes)) - { - /* Hitting this means there was a reentrant request, for - * example, a block driver issuing nested requests. This must - * never happen since it means deadlock. - */ - assert(qemu_coroutine_self() != req->co); - - /* If the request is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). */ - if (!req->waiting_for) { - self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue); - self->waiting_for = NULL; - retry = true; - waited = true; - break; - } - } - } - } while (retry); - - return waited; -} - /* * Return values: * 0 - success @@ -2641,172 +2285,6 @@ exit: return ret; } - -static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, - size_t size) -{ - int64_t len; - - if (size > INT_MAX) { - return -EIO; - } - - if (!bdrv_is_inserted(bs)) - return -ENOMEDIUM; - - if (bs->growable) - return 0; - - len = bdrv_getlength(bs); - - if (offset < 0) - return -EIO; - - if ((offset > len) || (len - offset < size)) - return -EIO; - - return 0; -} - -static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { - return -EIO; - } - - return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE); -} - -typedef struct RwCo { - BlockDriverState *bs; - int64_t offset; - QEMUIOVector *qiov; - bool is_write; - int ret; - BdrvRequestFlags flags; -} RwCo; - -static void coroutine_fn bdrv_rw_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } -} - -/* - * Process a vectored synchronous request using coroutines - */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .offset = offset, - .qiov = qiov, - .is_write = is_write, - .ret = NOT_DONE, - .flags = flags, - }; - - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function has to be disabled here - * if it has been enabled. - */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_rw_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - return rwco.ret; -} - -/* - * Process a synchronous request using coroutines - */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors, bool is_write, BdrvRequestFlags flags) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - - if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, - &qiov, is_write, flags); -} - -/* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); -} - -/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ -int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - bool enabled; - int ret; - - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; - return ret; -} - -/* Return < 0 if error. Important errors are: - -EIO generic I/O error (may happen for all errors) - -ENOMEDIUM No media inserted. - -EINVAL Invalid sector number or nb_sectors - -EACCES Trying to write a read-only device -*/ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); -} - -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) -{ - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); -} - /* * Completely zero out a block device with the help of bdrv_write_zeroes. * The operation is sped up by checking the block status and only writing @@ -2853,2162 +2331,909 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +/** + * Truncate file to 'offset' bytes (needed only for file protocols) + */ +int bdrv_truncate(BlockDriverState *bs, int64_t offset) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = bytes, - }; + BlockDriver *drv = bs->drv; int ret; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_truncate) + return -ENOTSUP; + if (bs->read_only) + return -EACCES; - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; + ret = drv->bdrv_truncate(bs, offset); + if (ret == 0) { + ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); + if (bs->blk) { + blk_dev_resize_cb(bs->blk); + } } - - return bytes; + return ret; } -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) -{ - int ret; - - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); - if (ret < 0) { - return ret; - } - - return qiov->size; -} - -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) +/** + * Length of a allocated file in bytes. Sparse files are counted by actual + * allocated space. Return < 0 if error or unknown. + */ +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) { - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = bytes, - }; - - if (bytes < 0) { - return -EINVAL; + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); + if (drv->bdrv_get_allocated_file_size) { + return drv->bdrv_get_allocated_file_size(bs); + } + if (bs->file) { + return bdrv_get_allocated_file_size(bs->file); + } + return -ENOTSUP; } -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. +/** + * Return number of sectors on success, -errno on error. */ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) +int64_t bdrv_nb_sectors(BlockDriverState *bs) { - int ret; + BlockDriver *drv = bs->drv; - ret = bdrv_pwrite(bs, offset, buf, count); - if (ret < 0) { - return ret; - } + if (!drv) + return -ENOMEDIUM; - /* No flush needed for cache modes that already do it */ - if (bs->enable_write_cache) { - bdrv_flush(bs); + if (drv->has_variable_length) { + int ret = refresh_total_sectors(bs, bs->total_sectors); + if (ret < 0) { + return ret; + } } - - return 0; + return bs->total_sectors; } -static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +/** + * Return length in bytes on success, -errno on error. + * The length is always a multiple of BDRV_SECTOR_SIZE. + */ +int64_t bdrv_getlength(BlockDriverState *bs) { - /* Perform I/O through a temporary buffer so that users who scribble over - * their read buffer while the operation is in progress do not end up - * modifying the image file. This is critical for zero-copy guest I/O - * where anything might happen inside guest memory. - */ - void *bounce_buffer; - - BlockDriver *drv = bs->drv; - struct iovec iov; - QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; - size_t skip_bytes; - int ret; + int64_t ret = bdrv_nb_sectors(bs); - /* Cover entire cluster so no additional backing file I/O is required when - * allocating cluster in the image file. - */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE; +} - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); +/* return 0 as number of sectors if no device present or error */ +void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) +{ + int64_t nb_sectors = bdrv_nb_sectors(bs); - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); - if (bounce_buffer == NULL) { - ret = -ENOMEM; - goto err; - } + *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors; +} - qemu_iovec_init_external(&bounce_qiov, &iov, 1); +void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, + BlockdevOnError on_write_error) +{ + bs->on_read_error = on_read_error; + bs->on_write_error = on_write_error; +} - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - if (ret < 0) { - goto err; - } +BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) +{ + return is_read ? bs->on_read_error : bs->on_write_error; +} - if (drv->bdrv_co_write_zeroes && - buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); - } else { - /* This does not change the data on the disk, it is not necessary - * to flush even in cache=writethrough mode. - */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - } +BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) +{ + BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; - if (ret < 0) { - /* It might be okay to ignore write errors for guest requests. If this - * is a deliberate copy-on-read then we don't want to ignore the error. - * Simply report it in all cases. - */ - goto err; + switch (on_err) { + case BLOCKDEV_ON_ERROR_ENOSPC: + return (error == ENOSPC) ? + BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_STOP: + return BLOCK_ERROR_ACTION_STOP; + case BLOCKDEV_ON_ERROR_REPORT: + return BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_IGNORE: + return BLOCK_ERROR_ACTION_IGNORE; + default: + abort(); } +} - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); +static void send_qmp_error_event(BlockDriverState *bs, + BlockErrorAction action, + bool is_read, int error) +{ + IoOperationType optype; -err: - qemu_vfree(bounce_buffer); - return ret; + optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; + qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, + bdrv_iostatus_is_enabled(bs), + error == ENOSPC, strerror(error), + &error_abort); } -/* - * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. +/* This is done by device models because, while the block layer knows + * about the error, it does not know whether an operation comes from + * the device or the block layer (from a job, for example). */ -static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - int64_t align, QEMUIOVector *qiov, int flags) +void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, + bool is_read, int error) { - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + assert(error >= 0); - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); + if (action == BLOCK_ERROR_ACTION_STOP) { + /* First set the iostatus, so that "info block" returns an iostatus + * that matches the events raised so far (an additional error iostatus + * is fine, but not a lost one). + */ + bdrv_iostatus_set_err(bs, error); - /* Handle Copy on Read and associated serialisation */ - if (flags & BDRV_REQ_COPY_ON_READ) { - /* If we touch the same cluster it counts as an overlap. This - * guarantees that allocating writes will be serialized and not race - * with each other for the same cluster. For example, in copy-on-read - * it ensures that the CoR read and write operations are atomic and - * guest writes cannot interleave between them. */ - mark_request_serialising(req, bdrv_get_cluster_size(bs)); + /* Then raise the request to stop the VM and the event. + * qemu_system_vmstop_request_prepare has two effects. First, + * it ensures that the STOP event always comes after the + * BLOCK_IO_ERROR event. Second, it ensures that even if management + * can observe the STOP event and do a "cont" before the STOP + * event is issued, the VM will not stop. In this case, vm_start() + * also ensures that the STOP/RESUME pair of events is emitted. + */ + qemu_system_vmstop_request_prepare(); + send_qmp_error_event(bs, action, is_read, error); + qemu_system_vmstop_request(RUN_STATE_IO_ERROR); + } else { + send_qmp_error_event(bs, action, is_read, error); } +} - wait_serialising_requests(req); +int bdrv_is_read_only(BlockDriverState *bs) +{ + return bs->read_only; +} - if (flags & BDRV_REQ_COPY_ON_READ) { - int pnum; +int bdrv_is_sg(BlockDriverState *bs) +{ + return bs->sg; +} - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); - if (ret < 0) { - goto out; - } +int bdrv_enable_write_cache(BlockDriverState *bs) +{ + return bs->enable_write_cache; +} - if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); - goto out; - } - } +void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) +{ + bs->enable_write_cache = wce; - /* Forward the request to the BlockDriver */ - if (!(bs->zero_beyond_eof && bs->growable)) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + /* so a reopen() will preserve wce */ + if (wce) { + bs->open_flags |= BDRV_O_CACHE_WB; } else { - /* Read zeros after EOF of growable BDSes */ - int64_t total_sectors, max_nb_sectors; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } - - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if (max_nb_sectors > 0) { - QEMUIOVector local_qiov; - - qemu_iovec_init(&local_qiov, qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); - - qemu_iovec_destroy(&local_qiov); - } else { - ret = 0; - } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num + nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); - } + bs->open_flags &= ~BDRV_O_CACHE_WB; } - -out: - return ret; } -/* - * Handle a read request in coroutine context - */ -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) +int bdrv_is_encrypted(BlockDriverState *bs) { - BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; - - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; + if (bs->backing_hd && bs->backing_hd->encrypted) + return 1; + return bs->encrypted; +} - if (!drv) { - return -ENOMEDIUM; - } - if (bdrv_check_byte_request(bs, offset, bytes)) { - return -EIO; - } +int bdrv_key_required(BlockDriverState *bs) +{ + BlockDriverState *backing_hd = bs->backing_hd; - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } + if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) + return 1; + return (bs->encrypted && !bs->valid_key); +} - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, false); +int bdrv_set_key(BlockDriverState *bs, const char *key) +{ + int ret; + if (bs->backing_hd && bs->backing_hd->encrypted) { + ret = bdrv_set_key(bs->backing_hd, key); + if (ret < 0) + return ret; + if (!bs->encrypted) + return 0; } - - /* Align read if necessary by padding qiov */ - if (offset & (align - 1)) { - head_buf = qemu_blockalign(bs, align); - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); + if (!bs->encrypted) { + return -EINVAL; + } else if (!bs->drv || !bs->drv->bdrv_set_key) { + return -ENOMEDIUM; } - - if ((offset + bytes) & (align - 1)) { - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; + ret = bs->drv->bdrv_set_key(bs, key); + if (ret < 0) { + bs->valid_key = 0; + } else if (!bs->valid_key) { + bs->valid_key = 1; + if (bs->blk) { + /* call the change callback now, we skipped it on open */ + blk_dev_change_media_cb(bs->blk, true); } - tail_buf = qemu_blockalign(bs, align); - qemu_iovec_add(&local_qiov, tail_buf, - align - ((offset + bytes) & (align - 1))); - - bytes = ROUND_UP(bytes, align); - } - - tracked_request_begin(&req, bs, offset, bytes, false); - ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, - use_local_qiov ? &local_qiov : qiov, - flags); - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - qemu_vfree(head_buf); - qemu_vfree(tail_buf); } - return ret; } -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) +const char *bdrv_get_format_name(BlockDriverState *bs) { - if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { - return -EINVAL; - } - - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); + return bs->drv ? bs->drv->format_name : NULL; } -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +static int qsort_strcmp(const void *a, const void *b) { - trace_bdrv_co_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); + return strcmp(a, b); } -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +void bdrv_iterate_format(void (*it)(void *opaque, const char *name), + void *opaque) { - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -/* if no limit is specified in the BlockLimits use a default - * of 32768 512-byte sectors (16 MiB) per request. - */ -#define MAX_WRITE_ZEROES_DEFAULT 32768 + BlockDriver *drv; + int count = 0; + int i; + const char **formats = NULL; -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - BlockDriver *drv = bs->drv; - QEMUIOVector qiov; - struct iovec iov = {0}; - int ret = 0; - - int max_write_zeroes = bs->bl.max_write_zeroes ? - bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; - - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; - - /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. - */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } - } - - /* limit request size */ - if (num > max_write_zeroes) { - num = max_write_zeroes; - } - - ret = -ENOTSUP; - /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); - } - - if (ret == -ENOTSUP) { - /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, - MAX_WRITE_ZEROES_DEFAULT); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; - if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); - if (iov.iov_base == NULL) { - ret = -ENOMEM; - goto fail; - } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); - } - qemu_iovec_init_external(&qiov, &iov, 1); - - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); - - /* Keep bounce buffer around if it is big enough for all - * all future requests. - */ - if (num < max_xfer_len) { - qemu_vfree(iov.iov_base); - iov.iov_base = NULL; - } - } - - sector_num += num; - nb_sectors -= num; - } - -fail: - qemu_vfree(iov.iov_base); - return ret; -} - -/* - * Forwards an already correctly aligned write request to the BlockDriver. - */ -static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) -{ - BlockDriver *drv = bs->drv; - bool waited; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - - waited = wait_serialising_requests(req); - assert(!waited || !req->serialising); - assert(req->overlap_offset <= offset); - assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); - - if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && - qemu_iovec_is_zero(qiov)) { - flags |= BDRV_REQ_ZERO_WRITE; - if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { - flags |= BDRV_REQ_MAY_UNMAP; - } - } - - if (ret < 0) { - /* Do nothing, write notifier decided to fail this request */ - } else if (flags & BDRV_REQ_ZERO_WRITE) { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); - - if (ret == 0 && !bs->enable_write_cache) { - ret = bdrv_co_flush(bs); - } - - bdrv_set_dirty(bs, sector_num, nb_sectors); - - block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); - - if (bs->growable && ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); - } - - return ret; -} - -/* - * Handle a write request in coroutine context - */ -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BdrvTrackedRequest req; - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EACCES; - } - if (bdrv_check_byte_request(bs, offset, bytes)) { - return -EIO; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, bytes, true); - } - - /* - * Align write if necessary by performing a read-modify-write cycle. - * Pad qiov with the read parts and be sure to have a tracked request not - * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. - */ - tracked_request_begin(&req, bs, offset, bytes, true); - - if (offset & (align - 1)) { - QEMUIOVector head_qiov; - struct iovec head_iov; - - mark_request_serialising(&req, align); - wait_serialising_requests(&req); - - head_buf = qemu_blockalign(bs, align); - head_iov = (struct iovec) { - .iov_base = head_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&head_qiov, &head_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); - ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, - align, &head_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); - - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - QEMUIOVector tail_qiov; - struct iovec tail_iov; - size_t tail_bytes; - bool waited; - - mark_request_serialising(&req, align); - waited = wait_serialising_requests(&req); - assert(!waited || !use_local_qiov); - - tail_buf = qemu_blockalign(bs, align); - tail_iov = (struct iovec) { - .iov_base = tail_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); - - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); - ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, - align, &tail_qiov, 0); - if (ret < 0) { - goto fail; - } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); - - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - - tail_bytes = (offset + bytes) & (align - 1); - qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); - - bytes = ROUND_UP(bytes, align); - } - - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, - use_local_qiov ? &local_qiov : qiov, - flags); - -fail: - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - } - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - - return ret; -} - -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { - return -EINVAL; - } - - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_writev(bs, sector_num, nb_sectors); - - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) -{ - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); - - if (!(bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - - return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); -} - -/** - * Truncate file to 'offset' bytes (needed only for file protocols) - */ -int bdrv_truncate(BlockDriverState *bs, int64_t offset) -{ - BlockDriver *drv = bs->drv; - int ret; - if (!drv) - return -ENOMEDIUM; - if (!drv->bdrv_truncate) - return -ENOTSUP; - if (bs->read_only) - return -EACCES; - - ret = drv->bdrv_truncate(bs, offset); - if (ret == 0) { - ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS); - if (bs->blk) { - blk_dev_resize_cb(bs->blk); - } - } - return ret; -} - -/** - * Length of a allocated file in bytes. Sparse files are counted by actual - * allocated space. Return < 0 if error or unknown. - */ -int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_get_allocated_file_size) { - return drv->bdrv_get_allocated_file_size(bs); - } - if (bs->file) { - return bdrv_get_allocated_file_size(bs->file); - } - return -ENOTSUP; -} - -/** - * Return number of sectors on success, -errno on error. - */ -int64_t bdrv_nb_sectors(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - - if (!drv) - return -ENOMEDIUM; - - if (drv->has_variable_length) { - int ret = refresh_total_sectors(bs, bs->total_sectors); - if (ret < 0) { - return ret; - } - } - return bs->total_sectors; -} - -/** - * Return length in bytes on success, -errno on error. - * The length is always a multiple of BDRV_SECTOR_SIZE. - */ -int64_t bdrv_getlength(BlockDriverState *bs) -{ - int64_t ret = bdrv_nb_sectors(bs); - - return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE; -} - -/* return 0 as number of sectors if no device present or error */ -void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) -{ - int64_t nb_sectors = bdrv_nb_sectors(bs); - - *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors; -} - -void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, - BlockdevOnError on_write_error) -{ - bs->on_read_error = on_read_error; - bs->on_write_error = on_write_error; -} - -BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read) -{ - return is_read ? bs->on_read_error : bs->on_write_error; -} - -BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error) -{ - BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error; - - switch (on_err) { - case BLOCKDEV_ON_ERROR_ENOSPC: - return (error == ENOSPC) ? - BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; - case BLOCKDEV_ON_ERROR_STOP: - return BLOCK_ERROR_ACTION_STOP; - case BLOCKDEV_ON_ERROR_REPORT: - return BLOCK_ERROR_ACTION_REPORT; - case BLOCKDEV_ON_ERROR_IGNORE: - return BLOCK_ERROR_ACTION_IGNORE; - default: - abort(); - } -} - -static void send_qmp_error_event(BlockDriverState *bs, - BlockErrorAction action, - bool is_read, int error) -{ - IoOperationType optype; - - optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; - qapi_event_send_block_io_error(bdrv_get_device_name(bs), optype, action, - bdrv_iostatus_is_enabled(bs), - error == ENOSPC, strerror(error), - &error_abort); -} - -/* This is done by device models because, while the block layer knows - * about the error, it does not know whether an operation comes from - * the device or the block layer (from a job, for example). - */ -void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action, - bool is_read, int error) -{ - assert(error >= 0); - - if (action == BLOCK_ERROR_ACTION_STOP) { - /* First set the iostatus, so that "info block" returns an iostatus - * that matches the events raised so far (an additional error iostatus - * is fine, but not a lost one). - */ - bdrv_iostatus_set_err(bs, error); - - /* Then raise the request to stop the VM and the event. - * qemu_system_vmstop_request_prepare has two effects. First, - * it ensures that the STOP event always comes after the - * BLOCK_IO_ERROR event. Second, it ensures that even if management - * can observe the STOP event and do a "cont" before the STOP - * event is issued, the VM will not stop. In this case, vm_start() - * also ensures that the STOP/RESUME pair of events is emitted. - */ - qemu_system_vmstop_request_prepare(); - send_qmp_error_event(bs, action, is_read, error); - qemu_system_vmstop_request(RUN_STATE_IO_ERROR); - } else { - send_qmp_error_event(bs, action, is_read, error); - } -} - -int bdrv_is_read_only(BlockDriverState *bs) -{ - return bs->read_only; -} - -int bdrv_is_sg(BlockDriverState *bs) -{ - return bs->sg; -} - -int bdrv_enable_write_cache(BlockDriverState *bs) -{ - return bs->enable_write_cache; -} - -void bdrv_set_enable_write_cache(BlockDriverState *bs, bool wce) -{ - bs->enable_write_cache = wce; - - /* so a reopen() will preserve wce */ - if (wce) { - bs->open_flags |= BDRV_O_CACHE_WB; - } else { - bs->open_flags &= ~BDRV_O_CACHE_WB; - } -} - -int bdrv_is_encrypted(BlockDriverState *bs) -{ - if (bs->backing_hd && bs->backing_hd->encrypted) - return 1; - return bs->encrypted; -} - -int bdrv_key_required(BlockDriverState *bs) -{ - BlockDriverState *backing_hd = bs->backing_hd; - - if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key) - return 1; - return (bs->encrypted && !bs->valid_key); -} - -int bdrv_set_key(BlockDriverState *bs, const char *key) -{ - int ret; - if (bs->backing_hd && bs->backing_hd->encrypted) { - ret = bdrv_set_key(bs->backing_hd, key); - if (ret < 0) - return ret; - if (!bs->encrypted) - return 0; - } - if (!bs->encrypted) { - return -EINVAL; - } else if (!bs->drv || !bs->drv->bdrv_set_key) { - return -ENOMEDIUM; - } - ret = bs->drv->bdrv_set_key(bs, key); - if (ret < 0) { - bs->valid_key = 0; - } else if (!bs->valid_key) { - bs->valid_key = 1; - if (bs->blk) { - /* call the change callback now, we skipped it on open */ - blk_dev_change_media_cb(bs->blk, true); - } - } - return ret; -} - -const char *bdrv_get_format_name(BlockDriverState *bs) -{ - return bs->drv ? bs->drv->format_name : NULL; -} - -static int qsort_strcmp(const void *a, const void *b) -{ - return strcmp(a, b); -} - -void bdrv_iterate_format(void (*it)(void *opaque, const char *name), - void *opaque) -{ - BlockDriver *drv; - int count = 0; - int i; - const char **formats = NULL; - - QLIST_FOREACH(drv, &bdrv_drivers, list) { - if (drv->format_name) { - bool found = false; - int i = count; - while (formats && i && !found) { - found = !strcmp(formats[--i], drv->format_name); - } - - if (!found) { - formats = g_renew(const char *, formats, count + 1); - formats[count++] = drv->format_name; - } - } - } - - qsort(formats, count, sizeof(formats[0]), qsort_strcmp); - - for (i = 0; i < count; i++) { - it(opaque, formats[i]); - } - - g_free(formats); -} - -/* This function is to find block backend bs */ -/* TODO convert callers to blk_by_name(), then remove */ -BlockDriverState *bdrv_find(const char *name) -{ - BlockBackend *blk = blk_by_name(name); - - return blk ? blk_bs(blk) : NULL; -} - -/* This function is to find a node in the bs graph */ -BlockDriverState *bdrv_find_node(const char *node_name) -{ - BlockDriverState *bs; - - assert(node_name); - - QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { - if (!strcmp(node_name, bs->node_name)) { - return bs; - } - } - return NULL; -} - -/* Put this QMP function here so it can access the static graph_bdrv_states. */ -BlockDeviceInfoList *bdrv_named_nodes_list(void) -{ - BlockDeviceInfoList *list, *entry; - BlockDriverState *bs; - - list = NULL; - QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { - entry = g_malloc0(sizeof(*entry)); - entry->value = bdrv_block_device_info(bs); - entry->next = list; - list = entry; - } - - return list; -} - -BlockDriverState *bdrv_lookup_bs(const char *device, - const char *node_name, - Error **errp) -{ - BlockBackend *blk; - BlockDriverState *bs; - - if (device) { - blk = blk_by_name(device); - - if (blk) { - return blk_bs(blk); - } - } - - if (node_name) { - bs = bdrv_find_node(node_name); - - if (bs) { - return bs; - } - } - - error_setg(errp, "Cannot find device=%s nor node_name=%s", - device ? device : "", - node_name ? node_name : ""); - return NULL; -} - -/* If 'base' is in the same chain as 'top', return true. Otherwise, - * return false. If either argument is NULL, return false. */ -bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) -{ - while (top && top != base) { - top = top->backing_hd; - } - - return top != NULL; -} - -BlockDriverState *bdrv_next_node(BlockDriverState *bs) -{ - if (!bs) { - return QTAILQ_FIRST(&graph_bdrv_states); - } - return QTAILQ_NEXT(bs, node_list); -} - -BlockDriverState *bdrv_next(BlockDriverState *bs) -{ - if (!bs) { - return QTAILQ_FIRST(&bdrv_states); - } - return QTAILQ_NEXT(bs, device_list); -} - -const char *bdrv_get_node_name(const BlockDriverState *bs) -{ - return bs->node_name; -} - -/* TODO check what callers really want: bs->node_name or blk_name() */ -const char *bdrv_get_device_name(const BlockDriverState *bs) -{ - return bs->blk ? blk_name(bs->blk) : ""; -} - -int bdrv_get_flags(BlockDriverState *bs) -{ - return bs->open_flags; -} - -int bdrv_flush_all(void) -{ - BlockDriverState *bs; - int result = 0; - - QTAILQ_FOREACH(bs, &bdrv_states, device_list) { - AioContext *aio_context = bdrv_get_aio_context(bs); - int ret; - - aio_context_acquire(aio_context); - ret = bdrv_flush(bs); - if (ret < 0 && !result) { - result = ret; - } - aio_context_release(aio_context); - } - - return result; -} - -int bdrv_has_zero_init_1(BlockDriverState *bs) -{ - return 1; -} - -int bdrv_has_zero_init(BlockDriverState *bs) -{ - assert(bs->drv); - - /* If BS is a copy on write image, it is initialized to - the contents of the base image, which may not be zeroes. */ - if (bs->backing_hd) { - return 0; - } - if (bs->drv->bdrv_has_zero_init) { - return bs->drv->bdrv_has_zero_init(bs); - } - - /* safe default */ - return 0; -} - -bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - - if (bs->backing_hd) { - return false; - } - - if (bdrv_get_info(bs, &bdi) == 0) { - return bdi.unallocated_blocks_are_zero; - } - - return false; -} - -bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - - if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { - return false; - } - - if (bdrv_get_info(bs, &bdi) == 0) { - return bdi.can_write_zeroes_with_unmap; - } - - return false; -} - -typedef struct BdrvCoGetBlockStatusData { - BlockDriverState *bs; - BlockDriverState *base; - int64_t sector_num; - int nb_sectors; - int *pnum; - int64_t ret; - bool done; -} BdrvCoGetBlockStatusData; - -/* - * Returns the allocation status of the specified sectors. - * Drivers not implementing the functionality are assumed to not support - * backing files, hence all their sectors are reported as allocated. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. - */ -static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t total_sectors; - int64_t n; - int64_t ret, ret2; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - return total_sectors; - } - - if (sector_num >= total_sectors) { - *pnum = 0; - return 0; - } - - n = total_sectors - sector_num; - if (n < nb_sectors) { - nb_sectors = n; - } - - if (!bs->drv->bdrv_co_get_block_status) { - *pnum = nb_sectors; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; - if (bs->drv->protocol_name) { - ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); - } - return ret; - } - - ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - *pnum = 0; - return ret; - } - - if (ret & BDRV_BLOCK_RAW) { - assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, pnum); - } - - if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { - ret |= BDRV_BLOCK_ALLOCATED; - } - - if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { - if (bdrv_unallocated_blocks_are_zero(bs)) { - ret |= BDRV_BLOCK_ZERO; - } else if (bs->backing_hd) { - BlockDriverState *bs2 = bs->backing_hd; - int64_t nb_sectors2 = bdrv_nb_sectors(bs2); - if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { - ret |= BDRV_BLOCK_ZERO; + QLIST_FOREACH(drv, &bdrv_drivers, list) { + if (drv->format_name) { + bool found = false; + int i = count; + while (formats && i && !found) { + found = !strcmp(formats[--i], drv->format_name); } - } - } - - if (bs->file && - (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && - (ret & BDRV_BLOCK_OFFSET_VALID)) { - int file_pnum; - ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, &file_pnum); - if (ret2 >= 0) { - /* Ignore errors. This is just providing extra information, it - * is useful but not necessary. - */ - if (!file_pnum) { - /* !file_pnum indicates an offset at or beyond the EOF; it is - * perfectly valid for the format block driver to point to such - * offsets, so catch it and mark everything as zero */ - ret |= BDRV_BLOCK_ZERO; - } else { - /* Limit request to the range reported by the protocol driver */ - *pnum = file_pnum; - ret |= (ret2 & BDRV_BLOCK_ZERO); + if (!found) { + formats = g_renew(const char *, formats, count + 1); + formats[count++] = drv->format_name; } } } - return ret; -} - -/* Coroutine wrapper for bdrv_get_block_status() */ -static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) -{ - BdrvCoGetBlockStatusData *data = opaque; - BlockDriverState *bs = data->bs; - - data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, - data->pnum); - data->done = true; -} - -/* - * Synchronous wrapper around bdrv_co_get_block_status(). - * - * See bdrv_co_get_block_status() for details. - */ -int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - Coroutine *co; - BdrvCoGetBlockStatusData data = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .pnum = pnum, - .done = false, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_get_block_status_co_entry(&data); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_get_block_status_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - aio_poll(aio_context, true); - } - } - return data.ret; -} - -int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); - if (ret < 0) { - return ret; - } - return !!(ret & BDRV_BLOCK_ALLOCATED); -} - -/* - * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] - * - * Return true if the given sector is allocated in any image between - * BASE and TOP (inclusive). BASE can be NULL to check if the given - * sector is allocated in any image of the chain. Return false otherwise. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - */ -int bdrv_is_allocated_above(BlockDriverState *top, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - BlockDriverState *intermediate; - int ret, n = nb_sectors; - - intermediate = top; - while (intermediate && intermediate != base) { - int pnum_inter; - ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, - &pnum_inter); - if (ret < 0) { - return ret; - } else if (ret) { - *pnum = pnum_inter; - return 1; - } - - /* - * [sector_num, nb_sectors] is unallocated on top but intermediate - * might have - * - * [sector_num+x, nr_sectors] allocated. - */ - if (n > pnum_inter && - (intermediate == top || - sector_num + pnum_inter < intermediate->total_sectors)) { - n = pnum_inter; - } - - intermediate = intermediate->backing_hd; - } - - *pnum = n; - return 0; -} - -const char *bdrv_get_encrypted_filename(BlockDriverState *bs) -{ - if (bs->backing_hd && bs->backing_hd->encrypted) - return bs->backing_file; - else if (bs->encrypted) - return bs->filename; - else - return NULL; -} - -void bdrv_get_backing_filename(BlockDriverState *bs, - char *filename, int filename_size) -{ - pstrcpy(filename, filename_size, bs->backing_file); -} - -int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (!drv->bdrv_write_compressed) - return -ENOTSUP; - if (bdrv_check_request(bs, sector_num, nb_sectors)) - return -EIO; - - assert(QLIST_EMPTY(&bs->dirty_bitmaps)); - - return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); -} - -int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (!drv->bdrv_get_info) - return -ENOTSUP; - memset(bdi, 0, sizeof(*bdi)); - return drv->bdrv_get_info(bs, bdi); -} - -ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_get_specific_info) { - return drv->bdrv_get_specific_info(bs); - } - return NULL; -} - -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) -{ - BlockDriver *drv = bs->drv; - - if (!drv) { - return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return drv->bdrv_save_vmstate(bs, qiov, pos); - } else if (bs->file) { - return bdrv_writev_vmstate(bs->file, qiov, pos); - } - - return -ENOTSUP; -} - -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file, buf, pos, size); - return -ENOTSUP; -} - -void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) -{ - if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { - return; - } - - bs->drv->bdrv_debug_event(bs, event); -} - -int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, - const char *tag) -{ - while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { - return bs->drv->bdrv_debug_breakpoint(bs, event, tag); - } - - return -ENOTSUP; -} - -int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) -{ - while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { - return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); + qsort(formats, count, sizeof(formats[0]), qsort_strcmp); + + for (i = 0; i < count; i++) { + it(opaque, formats[i]); } - return -ENOTSUP; + g_free(formats); } -int bdrv_debug_resume(BlockDriverState *bs, const char *tag) +/* This function is to find block backend bs */ +/* TODO convert callers to blk_by_name(), then remove */ +BlockDriverState *bdrv_find(const char *name) { - while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_resume) { - return bs->drv->bdrv_debug_resume(bs, tag); - } + BlockBackend *blk = blk_by_name(name); - return -ENOTSUP; + return blk ? blk_bs(blk) : NULL; } -bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) +/* This function is to find a node in the bs graph */ +BlockDriverState *bdrv_find_node(const char *node_name) { - while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { - bs = bs->file; - } - - if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { - return bs->drv->bdrv_debug_is_suspended(bs, tag); - } + BlockDriverState *bs; - return false; -} + assert(node_name); -int bdrv_is_snapshot(BlockDriverState *bs) -{ - return !!(bs->open_flags & BDRV_O_SNAPSHOT); + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + if (!strcmp(node_name, bs->node_name)) { + return bs; + } + } + return NULL; } -/* backing_file can either be relative, or absolute, or a protocol. If it is - * relative, it must be relative to the chain. So, passing in bs->filename - * from a BDS as backing_file should not be done, as that may be relative to - * the CWD rather than the chain. */ -BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, - const char *backing_file) +/* Put this QMP function here so it can access the static graph_bdrv_states. */ +BlockDeviceInfoList *bdrv_named_nodes_list(void) { - char *filename_full = NULL; - char *backing_file_full = NULL; - char *filename_tmp = NULL; - int is_protocol = 0; - BlockDriverState *curr_bs = NULL; - BlockDriverState *retval = NULL; + BlockDeviceInfoList *list, *entry; + BlockDriverState *bs; - if (!bs || !bs->drv || !backing_file) { - return NULL; + list = NULL; + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + entry = g_malloc0(sizeof(*entry)); + entry->value = bdrv_block_device_info(bs); + entry->next = list; + list = entry; } - filename_full = g_malloc(PATH_MAX); - backing_file_full = g_malloc(PATH_MAX); - filename_tmp = g_malloc(PATH_MAX); - - is_protocol = path_has_protocol(backing_file); - - for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { + return list; +} - /* If either of the filename paths is actually a protocol, then - * compare unmodified paths; otherwise make paths relative */ - if (is_protocol || path_has_protocol(curr_bs->backing_file)) { - if (strcmp(backing_file, curr_bs->backing_file) == 0) { - retval = curr_bs->backing_hd; - break; - } - } else { - /* If not an absolute filename path, make it relative to the current - * image's filename path */ - path_combine(filename_tmp, PATH_MAX, curr_bs->filename, - backing_file); +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp) +{ + BlockBackend *blk; + BlockDriverState *bs; - /* We are going to compare absolute pathnames */ - if (!realpath(filename_tmp, filename_full)) { - continue; - } + if (device) { + blk = blk_by_name(device); - /* We need to make sure the backing filename we are comparing against - * is relative to the current image filename (or absolute) */ - path_combine(filename_tmp, PATH_MAX, curr_bs->filename, - curr_bs->backing_file); + if (blk) { + return blk_bs(blk); + } + } - if (!realpath(filename_tmp, backing_file_full)) { - continue; - } + if (node_name) { + bs = bdrv_find_node(node_name); - if (strcmp(backing_file_full, filename_full) == 0) { - retval = curr_bs->backing_hd; - break; - } + if (bs) { + return bs; } } - g_free(filename_full); - g_free(backing_file_full); - g_free(filename_tmp); - return retval; + error_setg(errp, "Cannot find device=%s nor node_name=%s", + device ? device : "", + node_name ? node_name : ""); + return NULL; } -int bdrv_get_backing_file_depth(BlockDriverState *bs) +/* If 'base' is in the same chain as 'top', return true. Otherwise, + * return false. If either argument is NULL, return false. */ +bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base) { - if (!bs->drv) { - return 0; - } - - if (!bs->backing_hd) { - return 0; + while (top && top != base) { + top = top->backing_hd; } - return 1 + bdrv_get_backing_file_depth(bs->backing_hd); + return top != NULL; } -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +BlockDriverState *bdrv_next_node(BlockDriverState *bs) { - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); + if (!bs) { + return QTAILQ_FIRST(&graph_bdrv_states); + } + return QTAILQ_NEXT(bs, node_list); } -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +BlockDriverState *bdrv_next(BlockDriverState *bs) { - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); + if (!bs) { + return QTAILQ_FIRST(&bdrv_states); + } + return QTAILQ_NEXT(bs, device_list); } -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) +const char *bdrv_get_node_name(const BlockDriverState *bs) { - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); + return bs->node_name; } - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) +/* TODO check what callers really want: bs->node_name or blk_name() */ +const char *bdrv_get_device_name(const BlockDriverState *bs) { - int i; + return bs->blk ? blk_name(bs->blk) : ""; +} - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); - } - g_free(mcb->callbacks[i].free_qiov); - } +int bdrv_get_flags(BlockDriverState *bs) +{ + return bs->open_flags; } -static void multiwrite_cb(void *opaque, int ret) +int bdrv_flush_all(void) { - MultiwriteCB *mcb = opaque; + BlockDriverState *bs; + int result = 0; - trace_multiwrite_cb(mcb, ret); + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + AioContext *aio_context = bdrv_get_aio_context(bs); + int ret; - if (ret < 0 && !mcb->error) { - mcb->error = ret; + aio_context_acquire(aio_context); + ret = bdrv_flush(bs); + if (ret < 0 && !result) { + result = ret; + } + aio_context_release(aio_context); } - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } + return result; } -static int multiwrite_req_compare(const void *a, const void *b) +int bdrv_has_zero_init_1(BlockDriverState *bs) { - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } + return 1; } -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) +int bdrv_has_zero_init(BlockDriverState *bs) { - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. - if (reqs[i].sector <= oldreq_last) { - merge = 1; - } + assert(bs->drv); - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { - merge = 0; - } + /* If BS is a copy on write image, it is initialized to + the contents of the base image, which may not be zeroes. */ + if (bs->backing_hd) { + return 0; + } + if (bs->drv->bdrv_has_zero_init) { + return bs->drv->bdrv_has_zero_init(bs); + } - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; - } + /* safe default */ + return 0; +} - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); +bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) +{ + BlockDriverInfo bdi; - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. - size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + if (bs->backing_hd) { + return false; + } - // We should need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.unallocated_blocks_are_zero; + } - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + return false; +} - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) +{ + BlockDriverInfo bdi; - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; + if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { + return false; + } - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - reqs[outidx].qiov = reqs[i].qiov; - } + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.can_write_zeroes_with_unmap; } - return outidx + 1; + return false; } +typedef struct BdrvCoGetBlockStatusData { + BlockDriverState *bs; + BlockDriverState *base; + int64_t sector_num; + int nb_sectors; + int *pnum; + int64_t ret; + bool done; +} BdrvCoGetBlockStatusData; + /* - * Submit multiple AIO write requests at once. + * Returns the allocation status of the specified sectors. + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) { - MultiwriteCB *mcb; - int i; + int64_t total_sectors; + int64_t n; + int64_t ret, ret2; - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; - } - return -1; + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; } - if (num_reqs == 0) { + if (sector_num >= total_sectors) { + *pnum = 0; return 0; } - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; - - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; + n = total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; } - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + if (!bs->drv->bdrv_co_get_block_status) { + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } - /* Run the aio requests. */ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); } - return 0; -} + if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { + ret |= BDRV_BLOCK_ALLOCATED; + } -void bdrv_aio_cancel(BlockAIOCB *acb) -{ - qemu_aio_ref(acb); - bdrv_aio_cancel_async(acb); - while (acb->refcnt > 1) { - if (acb->aiocb_info->get_aio_context) { - aio_poll(acb->aiocb_info->get_aio_context(acb), true); - } else if (acb->bs) { - aio_poll(bdrv_get_aio_context(acb->bs), true); - } else { - abort(); + if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing_hd) { + BlockDriverState *bs2 = bs->backing_hd; + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { + ret |= BDRV_BLOCK_ZERO; + } } } - qemu_aio_unref(acb); -} -/* Async version of aio cancel. The caller is not blocked if the acb implements - * cancel_async, otherwise we do nothing and let the request normally complete. - * In either case the completion callback must be called. */ -void bdrv_aio_cancel_async(BlockAIOCB *acb) -{ - if (acb->aiocb_info->cancel_async) { - acb->aiocb_info->cancel_async(acb); - } -} + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int file_pnum; -/**************************************************************/ -/* async block device emulation */ + ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. + */ + if (!file_pnum) { + /* !file_pnum indicates an offset at or beyond the EOF; it is + * perfectly valid for the format block driver to point to such + * offsets, so catch it and mark everything as zero */ + ret |= BDRV_BLOCK_ZERO; + } else { + /* Limit request to the range reported by the protocol driver */ + *pnum = file_pnum; + ret |= (ret2 & BDRV_BLOCK_ZERO); + } + } + } -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; + return ret; +} -static void bdrv_aio_bh_cb(void *opaque) +/* Coroutine wrapper for bdrv_get_block_status() */ +static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) { - BlockAIOCBSync *acb = opaque; + BdrvCoGetBlockStatusData *data = opaque; + BlockDriverState *bs = data->bs; - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); + data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, + data->pnum); + data->done = true; } -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - +/* + * Synchronous wrapper around bdrv_co_get_block_status(). + * + * See bdrv_co_get_block_status() for details. + */ +int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) { - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + Coroutine *co; + BdrvCoGetBlockStatusData data = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_co_entry(&data); } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); + AioContext *aio_context = bdrv_get_aio_context(bs); - return &acb->common; + co = qemu_coroutine_create(bdrv_get_block_status_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + aio_poll(aio_context, true); + } + } + return data.ret; } -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) { - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); } -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +/* + * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) { - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); -} - - -typedef struct BlockAIOCBCoroutine { - BlockAIOCB common; - BlockRequest req; - bool is_write; - bool *done; - QEMUBH* bh; -} BlockAIOCBCoroutine; + BlockDriverState *intermediate; + int ret, n = nb_sectors; -static const AIOCBInfo bdrv_em_co_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBCoroutine), -}; + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } -static void bdrv_co_em_bh(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. + */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } - acb->common.cb(acb->common.opaque, acb->req.error); + intermediate = intermediate->backing_hd; + } - qemu_bh_delete(acb->bh); - qemu_aio_unref(acb); + *pnum = n; + return 0; } -/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ -static void coroutine_fn bdrv_co_do_rw(void *opaque) +const char *bdrv_get_encrypted_filename(BlockDriverState *bs) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } - - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); + if (bs->backing_hd && bs->backing_hd->encrypted) + return bs->backing_file; + else if (bs->encrypted) + return bs->filename; + else + return NULL; } -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) +void bdrv_get_backing_filename(BlockDriverState *bs, + char *filename, int filename_size) { - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - acb->req.qiov = qiov; - acb->req.flags = flags; - acb->is_write = is_write; - - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); - - return &acb->common; + pstrcpy(filename, filename_size, bs->backing_file); } -static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_flush(bs); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_get_info) + return -ENOTSUP; + memset(bdi, 0, sizeof(*bdi)); + return drv->bdrv_get_info(bs, bdi); } -BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) +ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) { - trace_bdrv_aio_flush(bs, opaque); - - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); - - return &acb->common; + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_get_specific_info) { + return drv->bdrv_get_specific_info(bs); + } + return NULL; } -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) { - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); } -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { - Coroutine *co; - BlockAIOCBCoroutine *acb; - - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry); - qemu_coroutine_enter(co, acb); + BlockDriver *drv = bs->drv; - return &acb->common; -} + if (!drv) { + return -ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file, qiov, pos); + } -void bdrv_init(void) -{ - module_call_init(MODULE_INIT_BLOCK); + return -ENOTSUP; } -void bdrv_init_with_whitelist(void) +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) { - use_bdrv_whitelist = 1; - bdrv_init(); + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file, buf, pos, size); + return -ENOTSUP; } -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) +void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event) { - BlockAIOCB *acb; - - acb = g_slice_alloc(aiocb_info->aiocb_size); - acb->aiocb_info = aiocb_info; - acb->bs = bs; - acb->cb = cb; - acb->opaque = opaque; - acb->refcnt = 1; - return acb; -} + if (!bs || !bs->drv || !bs->drv->bdrv_debug_event) { + return; + } -void qemu_aio_ref(void *p) -{ - BlockAIOCB *acb = p; - acb->refcnt++; + bs->drv->bdrv_debug_event(bs, event); } -void qemu_aio_unref(void *p) +int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag) { - BlockAIOCB *acb = p; - assert(acb->refcnt > 0); - if (--acb->refcnt == 0) { - g_slice_free1(acb->aiocb_info->aiocb_size, acb); + while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) { + bs = bs->file; } -} -/**************************************************************/ -/* Coroutine block device emulation */ + if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) { + return bs->drv->bdrv_debug_breakpoint(bs, event, tag); + } -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; - int ret; -} CoroutineIOCompletion; + return -ENOTSUP; +} -static void bdrv_co_io_em_complete(void *opaque, int ret) +int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) { - CoroutineIOCompletion *co = opaque; + while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { + bs = bs->file; + } + + if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { + return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); + } - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); + return -ENOTSUP; } -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) +int bdrv_debug_resume(BlockDriverState *bs, const char *tag) { - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; - - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); + while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { + bs = bs->file; } - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; + if (bs && bs->drv && bs->drv->bdrv_debug_resume) { + return bs->drv->bdrv_debug_resume(bs, tag); } - qemu_coroutine_yield(); - return co.ret; + return -ENOTSUP; } -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) +bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag) { - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} + while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) { + bs = bs->file; + } -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); + if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) { + return bs->drv->bdrv_debug_is_suspended(bs, tag); + } + + return false; } -static void coroutine_fn bdrv_flush_co_entry(void *opaque) +int bdrv_is_snapshot(BlockDriverState *bs) { - RwCo *rwco = opaque; - - rwco->ret = bdrv_co_flush(rwco->bs); + return !!(bs->open_flags & BDRV_O_SNAPSHOT); } -int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +/* backing_file can either be relative, or absolute, or a protocol. If it is + * relative, it must be relative to the chain. So, passing in bs->filename + * from a BDS as backing_file should not be done, as that may be relative to + * the CWD rather than the chain. */ +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file) { - int ret; + char *filename_full = NULL; + char *backing_file_full = NULL; + char *filename_tmp = NULL; + int is_protocol = 0; + BlockDriverState *curr_bs = NULL; + BlockDriverState *retval = NULL; - if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - return 0; + if (!bs || !bs->drv || !backing_file) { + return NULL; } - /* Write back cached data to the OS even with cache=unsafe */ - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); - if (bs->drv->bdrv_co_flush_to_os) { - ret = bs->drv->bdrv_co_flush_to_os(bs); - if (ret < 0) { - return ret; - } - } + filename_full = g_malloc(PATH_MAX); + backing_file_full = g_malloc(PATH_MAX); + filename_tmp = g_malloc(PATH_MAX); - /* But don't actually force it to the disk with cache=unsafe */ - if (bs->open_flags & BDRV_O_NO_FLUSH) { - goto flush_parent; - } + is_protocol = path_has_protocol(backing_file); - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); - if (bs->drv->bdrv_co_flush_to_disk) { - ret = bs->drv->bdrv_co_flush_to_disk(bs); - } else if (bs->drv->bdrv_aio_flush) { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; + for (curr_bs = bs; curr_bs->backing_hd; curr_bs = curr_bs->backing_hd) { - acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; + /* If either of the filename paths is actually a protocol, then + * compare unmodified paths; otherwise make paths relative */ + if (is_protocol || path_has_protocol(curr_bs->backing_file)) { + if (strcmp(backing_file, curr_bs->backing_file) == 0) { + retval = curr_bs->backing_hd; + break; + } } else { - qemu_coroutine_yield(); - ret = co.ret; + /* If not an absolute filename path, make it relative to the current + * image's filename path */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + backing_file); + + /* We are going to compare absolute pathnames */ + if (!realpath(filename_tmp, filename_full)) { + continue; + } + + /* We need to make sure the backing filename we are comparing against + * is relative to the current image filename (or absolute) */ + path_combine(filename_tmp, PATH_MAX, curr_bs->filename, + curr_bs->backing_file); + + if (!realpath(filename_tmp, backing_file_full)) { + continue; + } + + if (strcmp(backing_file_full, filename_full) == 0) { + retval = curr_bs->backing_hd; + break; + } } - } else { - /* - * Some block drivers always operate in either writethrough or unsafe - * mode and don't support bdrv_flush therefore. Usually qemu doesn't - * know how the server works (because the behaviour is hardcoded or - * depends on server-side configuration), so we can't ensure that - * everything is safe on disk. Returning an error doesn't work because - * that would break guests even if the server operates in writethrough - * mode. - * - * Let's hope the user knows what he's doing. - */ - ret = 0; } - if (ret < 0) { - return ret; + + g_free(filename_full); + g_free(backing_file_full); + g_free(filename_tmp); + return retval; +} + +int bdrv_get_backing_file_depth(BlockDriverState *bs) +{ + if (!bs->drv) { + return 0; } - /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH - * in the case of cache=unsafe, so there are no useless flushes. - */ -flush_parent: - return bdrv_co_flush(bs->file); + if (!bs->backing_hd) { + return 0; + } + + return 1 + bdrv_get_backing_file_depth(bs->backing_hd); +} + +void bdrv_init(void) +{ + module_call_init(MODULE_INIT_BLOCK); +} + +void bdrv_init_with_whitelist(void) +{ + use_bdrv_whitelist = 1; + bdrv_init(); } void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) @@ -5060,145 +3285,6 @@ void bdrv_invalidate_cache_all(Error **errp) } } -int bdrv_flush(BlockDriverState *bs) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_flush_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_flush_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - -typedef struct DiscardCo { - BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; - int ret; -} DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) -{ - DiscardCo *rwco = opaque; - - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); -} - -/* if no limit is specified in the BlockLimits use a default - * of 32768 512-byte sectors (16 MiB) per request. - */ -#define MAX_DISCARD_DEFAULT 32768 - -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - int max_discard; - - if (!bs->drv) { - return -ENOMEDIUM; - } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; - } else if (bs->read_only) { - return -EROFS; - } - - bdrv_reset_dirty(bs, sector_num, nb_sectors); - - /* Do nothing if disabled. */ - if (!(bs->open_flags & BDRV_O_UNMAP)) { - return 0; - } - - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { - return 0; - } - - max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT; - while (nb_sectors > 0) { - int ret; - int num = nb_sectors; - - /* align request */ - if (bs->bl.discard_alignment && - num >= bs->bl.discard_alignment && - sector_num % bs->bl.discard_alignment) { - if (num > bs->bl.discard_alignment) { - num = bs->bl.discard_alignment; - } - num -= sector_num % bs->bl.discard_alignment; - } - - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } - - if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); - } else { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - return -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } - if (ret && ret != -ENOTSUP) { - return ret; - } - - sector_num += num; - nb_sectors -= num; - } - return 0; -} - -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) -{ - Coroutine *co; - DiscardCo rwco = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_discard_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - /**************************************************************/ /* removable device support */ @@ -5264,28 +3350,6 @@ void bdrv_lock_medium(BlockDriverState *bs, bool locked) } } -/* needed for generic scsi interface */ - -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - BlockDriver *drv = bs->drv; - - if (drv && drv->bdrv_ioctl) - return drv->bdrv_ioctl(bs, req, buf); - return -ENOTSUP; -} - -BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) -{ - BlockDriver *drv = bs->drv; - - if (drv && drv->bdrv_aio_ioctl) - return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); - return NULL; -} - void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { bs->guest_block_size = align; @@ -5427,8 +3491,7 @@ void bdrv_reset_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); } -static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { @@ -5436,8 +3499,7 @@ static void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, } } -static void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { BdrvDirtyBitmap *bitmap; QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { @@ -5859,12 +3921,6 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs, abort(); } -void bdrv_add_before_write_notifier(BlockDriverState *bs, - NotifierWithReturn *notifier) -{ - notifier_with_return_list_add(&bs->before_write_notifiers, notifier); -} - int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts, BlockDriverAmendStatusCB *status_cb) { @@ -5965,36 +4021,6 @@ out: return to_replace_bs; } -void bdrv_io_plug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else if (bs->file) { - bdrv_io_plug(bs->file); - } -} - -void bdrv_io_unplug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file); - } -} - -void bdrv_flush_io_queue(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file); - } -} - static bool append_open_options(QDict *d, BlockDriverState *bs) { const QDictEntry *entry; diff --git a/block/Makefile.objs b/block/Makefile.objs index 04b0e43..ba8de70 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,3 +1,4 @@ +block-obj-y += io.o block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o diff --git a/block/io.c b/block/io.c new file mode 100644 index 0000000..57176ba --- /dev/null +++ b/block/io.c @@ -0,0 +1,1997 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "trace.h" +#include "block/block_int.h" + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + +void bdrv_setup_io_funcs(BlockDriver *bdrv) +{ + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. + */ + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } + } +} + +/* throttling disk I/O limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + ThrottleConfig *cfg) +{ + int i; + + throttle_config(&bs->throttle_state, cfg); + + for (i = 0; i < 2; i++) { + qemu_co_enter_next(&bs->throttled_reqs[i]); + } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ + bool drained = false; + bool enabled = bs->io_limits_enabled; + int i; + + bs->io_limits_enabled = false; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + drained = true; + } + } + + bs->io_limits_enabled = enabled; + + return drained; +} + +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + + bdrv_start_throttled_reqs(bs); + + throttle_destroy(&bs->throttle_state); +} + +static void bdrv_throttle_read_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[0]); +} + +static void bdrv_throttle_write_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[1]); +} + +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs) +{ + assert(!bs->io_limits_enabled); + throttle_init(&bs->throttle_state, + bdrv_get_aio_context(bs), + QEMU_CLOCK_VIRTUAL, + bdrv_throttle_read_timer_cb, + bdrv_throttle_write_timer_cb, + bs); + bs->io_limits_enabled = true; +} + +/* This function makes an IO wait if needed + * + * @nb_sectors: the number of sectors of the IO + * @is_write: is the IO a write + */ +static void bdrv_io_limits_intercept(BlockDriverState *bs, + unsigned int bytes, + bool is_write) +{ + /* does this io must wait */ + bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); + + /* if must wait or any request of this type throttled queue the IO */ + if (must_wait || + !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { + qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + } + + /* the IO will be executed, do the accounting */ + throttle_account(&bs->throttle_state, is_write, bytes); + + + /* if the next request must wait -> do nothing */ + if (throttle_schedule_timer(&bs->throttle_state, is_write)) { + return; + } + + /* else queue next request for execution */ + qemu_co_queue_next(&bs->throttled_reqs[is_write]); +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + +/* Check if any requests are in-flight (including throttled requests) */ +static bool bdrv_requests_pending(BlockDriverState *bs) +{ + if (!QLIST_EMPTY(&bs->tracked_requests)) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { + return true; + } + if (bs->file && bdrv_requests_pending(bs->file)) { + return true; + } + if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { + return true; + } + return false; +} + +bool bdrv_drain_one(BlockDriverState *bs) +{ + bool bs_busy; + + bdrv_flush_io_queue(bs); + bdrv_start_throttled_reqs(bs); + bs_busy = bdrv_requests_pending(bs); + bs_busy |= aio_poll(bdrv_get_aio_context(bs), bs_busy); + return bs_busy; +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree + * + * See the warning in bdrv_drain_all(). This function can only be called if + * you are sure nothing can generate I/O because you have op blockers + * installed. + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + */ +void bdrv_drain(BlockDriverState *bs) +{ + while (bdrv_drain_one(bs)) { + /* Keep iterating */ + } +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. + */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + if (req->serialising) { + req->bs->serialising_in_flight--; + } + + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t offset, + unsigned int bytes, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t offset, unsigned int bytes) +{ + /* aaaa bbbb */ + if (offset >= req->overlap_offset + req->overlap_bytes) { + return false; + } + /* bbbb aaaa */ + if (req->overlap_offset >= offset + bytes) { + return false; + } + return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ + BlockDriverState *bs = self->bs; + BdrvTrackedRequest *req; + bool retry; + bool waited = false; + + if (!bs->serialising_in_flight) { + return false; + } + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). */ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } + } + } + } while (retry); + + return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + int64_t len; + + if (size > INT_MAX) { + return -EIO; + } + + if (!bdrv_is_inserted(bs)) + return -ENOMEDIUM; + + if (bs->growable) + return 0; + + len = bdrv_getlength(bs); + + if (offset < 0) + return -EIO; + + if ((offset > len) || (len - offset < size)) + return -EIO; + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { + return -EIO; + } + + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t offset; + QEMUIOVector *qiov; + bool is_write; + int ret; + BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .offset = offset, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + .flags = flags, + }; + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function has to be disabled here + * if it has been enabled. + */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. + -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; + int ret; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; + } + + return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = bytes, + }; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (!drv->bdrv_write_compressed) + return -ENOTSUP; + if (bdrv_check_request(bs, sector_num, nb_sectors)) + return -EIO; + + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. + */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors, 0); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. + */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. */ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + wait_serialising_requests(req); + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + /* Forward the request to the BlockDriver */ + if (!(bs->zero_beyond_eof && bs->growable)) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF of growable BDSes */ + int64_t total_sectors, max_nb_sectors; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; + goto out; + } + + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (nb_sectors < max_nb_sectors) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else if (max_nb_sectors > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, 0, + max_nb_sectors * BDRV_SECTOR_SIZE); + + ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, + &local_qiov); + + qemu_iovec_destroy(&local_qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } + +out: + return ret; +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (bdrv_check_byte_request(bs, offset, bytes)) { + return -EIO; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? &local_qiov : qiov, + flags); + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +/* if no limit is specified in the BlockLimits use a default + * of 32768 512-byte sectors (16 MiB) per request. + */ +#define MAX_WRITE_ZEROES_DEFAULT 32768 + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov = {0}; + int ret = 0; + + int max_write_zeroes = bs->bl.max_write_zeroes ? + bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; + + while (nb_sectors > 0 && !ret) { + int num = nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } + } + + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } + + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + MAX_WRITE_ZEROES_DEFAULT); + num = MIN(num, max_xfer_len); + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for all + * all future requests. + */ + if (num < max_xfer_len) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } + +fail: + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Forwards an already correctly aligned write request to the BlockDriver. + */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + + if (bs->growable && ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EACCES; + } + if (bdrv_check_byte_request(bs, offset, bytes)) { + return -EIO; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. + */ + tracked_request_begin(&req, bs, offset, bytes, true); + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? &local_qiov : qiov, + flags); + +fail: + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + + return ret; +} + +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) +{ + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE | flags); +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, + cb, opaque, true); +} + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if (mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. If the requests are + // overlapping, drop the last sectors of the first request. + size = (reqs[i].sector - reqs[outidx].sector) << 9; + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + + // We should need to add any zeros between the two requests + assert (reqs[i].sector <= oldreq_last); + + // Add the second request + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + + // Add tail of first request, if necessary + if (qiov->size < reqs[outidx].qiov->size) { + qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, + reqs[outidx].qiov->size - qiov->size); + } + + reqs[outidx].nb_sectors = qiov->size >> 9; + reqs[outidx].qiov = qiov; + + mcb->callbacks[i].free_qiov = reqs[outidx].qiov; + } else { + outidx++; + reqs[outidx].sector = reqs[i].sector; + reqs[outidx].nb_sectors = reqs[i].nb_sectors; + reqs[outidx].qiov = reqs[i].qiov; + } + } + + return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. + */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ + MultiwriteCB *mcb; + int i; + + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + + if (num_reqs == 0) { + return 0; + } + + // Create MultiwriteCB structure + mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); + mcb->num_requests = 0; + mcb->num_callbacks = num_reqs; + + for (i = 0; i < num_reqs; i++) { + mcb->callbacks[i].cb = reqs[i].cb; + mcb->callbacks[i].opaque = reqs[i].opaque; + } + + // Check for mergable requests + num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + + trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + + /* Run the aio requests. */ + mcb->num_requests = num_reqs; + for (i = 0; i < num_reqs; i++) { + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); + } + + return 0; +} + +void bdrv_aio_cancel(BlockAIOCB *acb) +{ + qemu_aio_ref(acb); + bdrv_aio_cancel_async(acb); + while (acb->refcnt > 1) { + if (acb->aiocb_info->get_aio_context) { + aio_poll(acb->aiocb_info->get_aio_context(acb), true); + } else if (acb->bs) { + aio_poll(bdrv_get_aio_context(acb->bs), true); + } else { + abort(); + } + } + qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request normally complete. + * In either case the completion callback must be called. */ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { + BlockAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockAIOCBSync *acb = opaque; + + if (!acb->is_write && acb->ret >= 0) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_try_blockalign(bs, qiov->size); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + +typedef struct BlockAIOCBCoroutine { + BlockAIOCB common; + BlockRequest req; + bool is_write; + bool *done; + QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_em_bh(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + + acb->common.cb(acb->common.opaque, acb->req.error); + + qemu_bh_delete(acb->bh); + qemu_aio_unref(acb); +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); +} + +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->req.flags = flags; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + return &acb->common; +} +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_slice_alloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_slice_free1(acb->aiocb_info->aiocb_size, acb); + } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + /* Write back cached data to the OS even with cache=unsafe */ + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + return ret; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + return ret; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. + */ +flush_parent: + return bdrv_co_flush(bs->file); +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + DiscardCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +/* if no limit is specified in the BlockLimits use a default + * of 32768 512-byte sectors (16 MiB) per request. + */ +#define MAX_DISCARD_DEFAULT 32768 + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + int max_discard; + + if (!bs->drv) { + return -ENOMEDIUM; + } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { + return -EIO; + } else if (bs->read_only) { + return -EROFS; + } + + bdrv_reset_dirty(bs, sector_num, nb_sectors); + + /* Do nothing if disabled. */ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + return 0; + } + + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + return 0; + } + + max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT; + while (nb_sectors > 0) { + int ret; + int num = nb_sectors; + + /* align request */ + if (bs->bl.discard_alignment && + num >= bs->bl.discard_alignment && + sector_num % bs->bl.discard_alignment) { + if (num > bs->bl.discard_alignment) { + num = bs->bl.discard_alignment; + } + num -= sector_num % bs->bl.discard_alignment; + } + + /* limit request size */ + if (num > max_discard) { + num = max_discard; + } + + if (bs->drv->bdrv_co_discard) { + ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + if (ret && ret != -ENOTSUP) { + return ret; + } + + sector_num += num; + nb_sectors -= num; + } + return 0; +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + Coroutine *co; + DiscardCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_discard_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_discard_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +/* needed for generic scsi interface */ + +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_ioctl) + return drv->bdrv_ioctl(bs, req, buf); + return -ENOTSUP; +} + +BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + BlockDriver *drv = bs->drv; + + if (drv && drv->bdrv_aio_ioctl) + return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); + return NULL; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file); + } +} diff --git a/include/block/block_int.h b/include/block/block_int.h index e264be9..dd9a376 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -421,6 +421,14 @@ extern BlockDriver bdrv_file; extern BlockDriver bdrv_raw; extern BlockDriver bdrv_qcow2; +/** + * bdrv_setup_io_funcs: + * + * Prepare a #BlockDriver for I/O request processing by populating + * unimplemented coroutine and AIO interfaces with generic wrapper functions + * that fall back to implemented interfaces. + */ +void bdrv_setup_io_funcs(BlockDriver *bdrv); int get_tmp_filename(char *filename, int size); BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size, @@ -606,4 +614,10 @@ bool blk_dev_is_tray_open(BlockBackend *blk); bool blk_dev_is_medium_locked(BlockBackend *blk); void blk_dev_resize_cb(BlockBackend *blk); +bool bdrv_drain_one(BlockDriverState *bs); + +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors); + #endif /* BLOCK_INT_H */ -- 2.1.0