From: Paolo Bonzini <pbonzini@redhat.com>
To: qemu-devel@nongnu.org
Subject: [Qemu-devel] [PATCH 15/17] block: align and serialize I/O when guest_block_size < host_block_size
Date: Tue, 13 Dec 2011 13:37:18 +0100 [thread overview]
Message-ID: <1323779840-4235-16-git-send-email-pbonzini@redhat.com> (raw)
In-Reply-To: <1323779840-4235-1-git-send-email-pbonzini@redhat.com>
When the guest sees a lower alignment than the host, and O_DIRECT
is in place, I/O might not span an integer number of host sectors.
If this is the case, add a few sectors at the beginning and the end.
When reading, copy interesting data from there to the guest buffer.
When writing, merge data from there with data from the guest buffer.
Since writes potentially become read-modify-write sequences, they have
to be serialized against other operations. Reads are still atomic
(they only need some postprocessing), so they need not be
serialized.
The math is a bit tricky, but luckily all the primitives are already in
cutils.c and iov.c.
On top of this, the guest's buffers might be misaligned with respect
to the host block size. However, that's caught by raw-posix.c and
posix-aio-compat.c, so we need not care about it.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
Makefile.objs | 4 +-
block.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
trace-events | 1 +
3 files changed, 175 insertions(+), 4 deletions(-)
diff --git a/Makefile.objs b/Makefile.objs
index 3a699ee..2033c96 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -23,7 +23,7 @@ coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o
#######################################################################
# block-obj-y is code used by both qemu system emulation and qemu-img
-block-obj-y = cutils.o cache-utils.o qemu-option.o module.o async.o
+block-obj-y = cutils.o cache-utils.o qemu-option.o module.o async.o iov.o
block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o qemu-progress.o qemu-sockets.o
block-obj-y += $(coroutine-obj-y) $(qobject-obj-y) $(version-obj-y)
block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
@@ -157,7 +157,7 @@ endif
common-obj-y += $(addprefix ui/, $(ui-obj-y))
common-obj-$(CONFIG_VNC) += $(addprefix ui/, $(vnc-obj-y))
-common-obj-y += iov.o acl.o
+common-obj-y += acl.o
common-obj-$(CONFIG_POSIX) += compatfd.o
common-obj-y += notify.o event_notifier.o
common-obj-y += qemu-timer.o qemu-timer-common.o
diff --git a/block.c b/block.c
index 9e35c85..44f5d83 100644
--- a/block.c
+++ b/block.c
@@ -28,6 +28,7 @@
#include "block_int.h"
#include "module.h"
#include "qjson.h"
+#include "iov.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"
@@ -1175,6 +1176,21 @@ static void round_sectors(int64_t alignment,
}
}
+static bool req_is_aligned(int64_t alignment,
+ int64_t sector_num, int nb_sectors)
+{
+ if (alignment != BDRV_SECTOR_SIZE) {
+ int64_t c = alignment / BDRV_SECTOR_SIZE;
+ if ((sector_num & (c - 1)) != 0) {
+ return false;
+ }
+ if ((nb_sectors & (c - 1)) != 0) {
+ return false;
+ }
+ }
+ return true;
+}
+
static int64_t get_cluster_size(BlockDriverState *bs)
{
BlockDriverInfo bdi;
@@ -1571,6 +1587,135 @@ err:
return ret;
}
+static int coroutine_fn bdrv_rw_co_align(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, bool is_write)
+{
+ BlockDriver *drv = bs->drv;
+ QEMUIOVector aligned_qiov;
+
+ int alignment = bs->host_block_size / BDRV_SECTOR_SIZE;
+ int64_t io_sector_num;
+ int io_nb_sectors;
+ int head_extra, tail_sectors;
+ int head_sectors = 0;
+ int tail_offset = 0;
+ int ret;
+ uint8_t *head = NULL;
+ uint8_t *tail = NULL;
+
+ trace_bdrv_rw_co_align(bs, sector_num, nb_sectors, is_write);
+ round_sectors(bs->host_block_size, sector_num, nb_sectors,
+ &io_sector_num, &io_nb_sectors);
+
+ /*
+ * Here is a drawing of some involved quantities. Double lines
+ * mark points that are aligned to a host block.
+ *
+ * io_sector_num + io_nb_sectors ---.
+ * .-- io_sector_num \
+ * / sector_num + nb_sectors ---. \
+ * /______________ ________________ _____ ______________\____\
+ * || | || || | ||
+ * || head_extra | head_sectors || ... || tail_sectors | ... ||
+ * ||______________|________________||_____||______________|_____||
+ * \ \ \
+ * `--- sector_num \ `--- sector_num + tail_offset
+ * \
+ * `--- io_sector_num + alignment
+ */
+ head_extra = sector_num & (alignment - 1);
+ tail_sectors = (sector_num + nb_sectors) & (alignment - 1);
+
+ qemu_iovec_init(&aligned_qiov, qiov->niov + 2);
+
+ /* If the initial sector is not aligned, we need a "head" block. If a
+ * single block will satisfy the request, we put it in the "head" too.
+ * This way, the tail always starts on an aligned sector.
+ */
+ if (head_extra != 0 || io_nb_sectors == alignment) {
+ head_sectors = MIN(nb_sectors, alignment - head_extra);
+ head = qemu_blockalign(bs, bs->host_block_size);
+ qemu_iovec_add(&aligned_qiov, head, bs->host_block_size);
+ }
+
+ /* If the aligned request is just 1 block, we will read it in head,
+ * so nothing else to do.
+ */
+ if (io_nb_sectors > alignment) {
+ /* Otherwise, see where the last full block ends... */
+ tail_offset = nb_sectors - tail_sectors;
+ assert(((sector_num + tail_offset) & (alignment - 1)) == 0);
+
+ /* ... add everything after the head and up to it. */
+ qemu_iovec_copy(&aligned_qiov, qiov,
+ head_sectors * BDRV_SECTOR_SIZE,
+ (tail_offset - head_sectors) * BDRV_SECTOR_SIZE);
+
+ /* If the final sector is not aligned, add an extra block at
+ * the end.
+ */
+ if (tail_sectors != 0) {
+ tail = qemu_blockalign(bs, bs->host_block_size);
+ qemu_iovec_add(&aligned_qiov, tail, bs->host_block_size);
+ }
+ }
+
+ assert(head || tail);
+ if (is_write) {
+ QEMUIOVector rmw_qiov;
+ struct iovec rmw_iov;
+
+ /* Merge the user data with data read from the first block... */
+ if (head != NULL) {
+ rmw_iov.iov_base = head;
+ rmw_iov.iov_len = bs->host_block_size;
+ qemu_iovec_init_external(&rmw_qiov, &rmw_iov, 1);
+ ret = drv->bdrv_co_readv(bs, io_sector_num, alignment, &rmw_qiov);
+ if (ret < 0) {
+ return ret;
+ }
+ iov_to_buf(qiov->iov, qiov->size,
+ (head + head_extra * BDRV_SECTOR_SIZE), 0,
+ head_sectors * BDRV_SECTOR_SIZE);
+ }
+
+ /* ... and from the last block. The two reads might be merged in
+ * some cases, but it is rare enough that we do not care. */
+ if (tail != NULL) {
+ rmw_iov.iov_base = tail;
+ rmw_iov.iov_len = bs->host_block_size;
+ qemu_iovec_init_external(&rmw_qiov, &rmw_iov, 1);
+ ret = drv->bdrv_co_readv(bs, sector_num + tail_offset, alignment, &rmw_qiov);
+ if (ret < 0) {
+ return ret;
+ }
+ iov_to_buf(qiov->iov, qiov->size,
+ tail, tail_offset * BDRV_SECTOR_SIZE,
+ tail_sectors * BDRV_SECTOR_SIZE);
+ }
+ ret = drv->bdrv_co_writev(bs, io_sector_num, io_nb_sectors, &aligned_qiov);
+ } else {
+ ret = drv->bdrv_co_readv(bs, io_sector_num, io_nb_sectors, &aligned_qiov);
+ /* Copy the misaligned data that the user wants. */
+ if (head != NULL) {
+ iov_from_buf(qiov->iov, qiov->size,
+ (head + head_extra * BDRV_SECTOR_SIZE), 0,
+ head_sectors * BDRV_SECTOR_SIZE);
+ }
+ if (tail != NULL) {
+ iov_from_buf(qiov->iov, qiov->size,
+ tail, tail_offset * BDRV_SECTOR_SIZE,
+ tail_sectors * BDRV_SECTOR_SIZE);
+ }
+ }
+
+ qemu_iovec_destroy(&aligned_qiov);
+ g_free(head);
+ g_free(tail);
+ return ret;
+}
+
+
/*
* Handle a read request in coroutine context
*/
@@ -1624,7 +1769,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
}
}
- ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+ if (!req_is_aligned(bs->host_block_size, sector_num, nb_sectors)) {
+ ret = bdrv_rw_co_align(bs, sector_num, nb_sectors, qiov, false);
+ } else {
+ ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+ }
out:
tracked_request_end(&req);
@@ -1669,9 +1818,24 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
get_cluster_size(bs), false);
}
+ /* When the guest sees a lower alignment than the host, and O_DIRECT
+ * is in place, assume that the write will become a read-modify-write
+ * sequence. These have to be serialized against each other, so that
+ * they look atomic to the guest.
+ */
+ if ((bs->open_flags & BDRV_O_NOCACHE) &&
+ bs->guest_block_size < bs->host_block_size) {
+ wait_for_overlapping_requests(bs, sector_num, nb_sectors,
+ bs->host_block_size, true);
+ }
+
tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
- ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ if (!req_is_aligned(bs->host_block_size, sector_num, nb_sectors)) {
+ ret = bdrv_rw_co_align(bs, sector_num, nb_sectors, qiov, true);
+ } else {
+ ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ }
if (bs->dirty_bitmap) {
set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
@@ -3592,6 +3756,12 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
{
bs->guest_block_size = align;
+ if ((bs->open_flags & BDRV_O_NOCACHE) &&
+ bs->guest_block_size < bs->host_block_size) {
+ error_report("Host block size is %d, guest block size is %d. Direct\n"
+ "access to the host device may cause performance degradation.",
+ bs->host_block_size, bs->guest_block_size);
+ }
if ((bs->open_flags & BDRV_O_RDWR) &&
bs->host_block_size < bs->guest_block_size) {
error_report("Host block size is %d, guest block size is %d. Due to partially\n"
diff --git a/trace-events b/trace-events
index 514849a..d33191d 100644
--- a/trace-events
+++ b/trace-events
@@ -68,6 +68,7 @@ bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"P
bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
+bdrv_rw_co_align(void *bs, int64_t sector_num, int nb_sectors, bool is_write) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d"
# hw/virtio-blk.c
virtio_blk_req_complete(void *req, int status) "req %p status %d"
--
1.7.7.1
next prev parent reply other threads:[~2011-12-13 12:38 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-12-13 12:37 [Qemu-devel] [PATCH 00/17] Support mismatched host and guest logical block sizes Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 01/17] block: do not rely on open_flags for bdrv_is_snapshot Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 02/17] block: store actual flags in bs->open_flags Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 03/17] block: pass protocol flags up to the format Paolo Bonzini
2011-12-15 4:10 ` Zhi Yong Wu
2011-12-13 12:37 ` [Qemu-devel] [PATCH 04/17] block: non-raw protocols never cache Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 05/17] block: remove enable_write_cache Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 06/17] block: move flag bits together Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 07/17] raw: remove the aligned_buf Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 08/17] block: rename buffer_alignment to guest_block_size Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 09/17] block: add host_block_size Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 10/17] raw: probe host_block_size Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 11/17] iscsi: save host block size Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 12/17] block: allow waiting only for overlapping writes Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 13/17] block: allow waiting at arbitrary granularity Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 14/17] block: protect against "torn reads" for guest_block_size > host_block_size Paolo Bonzini
2011-12-13 12:37 ` Paolo Bonzini [this message]
2011-12-13 12:37 ` [Qemu-devel] [PATCH 16/17] block: default physical block size to host block size Paolo Bonzini
2011-12-13 12:37 ` [Qemu-devel] [PATCH 17/17] qemu-io: add blocksize argument to open Paolo Bonzini
2011-12-14 11:13 ` [Qemu-devel] [PATCH 00/17] Support mismatched host and guest logical block sizes Kevin Wolf
2011-12-14 11:47 ` Paolo Bonzini
2011-12-14 12:05 ` Kevin Wolf
2011-12-14 12:40 ` Paolo Bonzini
2011-12-21 16:55 ` Christoph Hellwig
2011-12-21 17:00 ` Paolo Bonzini
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1323779840-4235-16-git-send-email-pbonzini@redhat.com \
--to=pbonzini@redhat.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).