From: Kevin Wolf <kwolf@redhat.com>
To: qemu-block@nongnu.org
Cc: kwolf@redhat.com, qemu-devel@nongnu.org
Subject: [PULL 3/4] linux-aio: Resubmit tails of short reads/writes
Date: Tue, 24 Mar 2026 19:56:18 +0100 [thread overview]
Message-ID: <20260324185619.296946-4-kwolf@redhat.com> (raw)
In-Reply-To: <20260324185619.296946-1-kwolf@redhat.com>
From: Hanna Czenczek <hreitz@redhat.com>
Short reads/writes can happen. One way to reproduce them is via our
FUSE export, with the following diff applied (run %s/escaped // on it
first to strip the "escaped " prefix from each line -- if you put plain
diffs in commit messages, git-am will apply them, and I would rather
avoid breaking FUSE accidentally via this patch):
escaped diff --git a/block/export/fuse.c b/block/export/fuse.c
escaped index a2a478d293..67dc50a412 100644
escaped --- a/block/export/fuse.c
escaped +++ b/block/export/fuse.c
@@ -828,7 +828,7 @@ static ssize_t coroutine_fn GRAPH_RDLOCK
fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
const struct fuse_init_in_compat *in)
{
- const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
+ const uint32_t supported_flags = FUSE_ASYNC_READ;
if (in->major != 7) {
error_report("FUSE major version mismatch: We have 7, but kernel has %"
@@ -1060,6 +1060,8 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size)
void *buf;
int ret;
+ size = MIN(size, 4096);
+
/* Limited by max_read, should not happen */
if (size > FUSE_MAX_READ_BYTES) {
return -EINVAL;
@@ -1110,6 +1112,8 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
int64_t blk_len;
int ret;
+ size = MIN(size, 4096);
+
QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES);
/* Limited by max_write, should not happen */
if (size > FUSE_MAX_WRITE_BYTES) {
Then:
$ ./qemu-img create -f raw test.raw 8k
Formatting 'test.raw', fmt=raw size=8192
$ ./qemu-io -f raw -c 'write -P 42 0 8k' test.raw
wrote 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (64.804 MiB/sec and 8294.9003 ops/sec)
$ hexdump -C test.raw
00000000 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a |****************|
*
00002000
With aio=threads, short I/O works:
$ storage-daemon/qemu-storage-daemon \
--blockdev file,node-name=test,filename=test.raw \
--export fuse,id=exp,node-name=test,mountpoint=test.raw,writable=true
Other shell:
$ ./qemu-io --image-opts -c 'read -P 42 0 8k' \
driver=file,filename=test.raw,cache.direct=on,aio=threads
read 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (36.563 MiB/sec and 4680.0923 ops/sec)
$ ./qemu-io --image-opts -c 'write -P 23 0 8k' \
driver=file,filename=test.raw,cache.direct=on,aio=threads
wrote 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (35.995 MiB/sec and 4607.2970 ops/sec)
$ hexdump -C test.raw
00000000 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 |................|
*
00002000
But with aio=native, it does not:
$ ./qemu-io --image-opts -c 'read -P 23 0 8k' \
driver=file,filename=test.raw,cache.direct=on,aio=native
Pattern verification failed at offset 0, 8192 bytes
read 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (86.155 MiB/sec and 11027.7900 ops/sec)
$ ./qemu-io --image-opts -c 'write -P 42 0 8k' \
driver=file,filename=test.raw,cache.direct=on,aio=native
write failed: No space left on device
$ hexdump -C test.raw
00000000 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a |****************|
*
00001000 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 |................|
*
00002000
This patch fixes that.
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
Message-ID: <20260324084338.37453-3-hreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/linux-aio.c | 56 ++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 50 insertions(+), 6 deletions(-)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 3843f45eac8..0a7424fbb33 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -45,6 +45,10 @@ struct qemu_laiocb {
size_t nbytes;
QEMUIOVector *qiov;
+ /* For handling short reads/writes */
+ size_t total_done;
+ QEMUIOVector resubmit_qiov;
+
int fd;
int type;
BdrvRequestFlags flags;
@@ -74,28 +78,61 @@ struct LinuxAioState {
};
static void ioq_submit(LinuxAioState *s);
+static int laio_do_submit(struct qemu_laiocb *laiocb);
static inline ssize_t io_event_ret(struct io_event *ev)
{
return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}
+/**
+ * Retry tail of short requests.
+ */
+static int laio_resubmit_short_io(struct qemu_laiocb *laiocb, size_t done)
+{
+ QEMUIOVector *resubmit_qiov = &laiocb->resubmit_qiov;
+
+ laiocb->total_done += done;
+
+ if (!resubmit_qiov->iov) {
+ qemu_iovec_init(resubmit_qiov, laiocb->qiov->niov);
+ } else {
+ qemu_iovec_reset(resubmit_qiov);
+ }
+ qemu_iovec_concat(resubmit_qiov, laiocb->qiov,
+ laiocb->total_done, laiocb->nbytes - laiocb->total_done);
+
+ return laio_do_submit(laiocb);
+}
+
/*
* Completes an AIO request.
*/
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
- int ret;
+ ssize_t ret;
ret = laiocb->ret;
if (ret != -ECANCELED) {
- if (ret == laiocb->nbytes) {
+ if (ret == laiocb->nbytes - laiocb->total_done) {
ret = 0;
+ } else if (ret > 0 && (laiocb->type == QEMU_AIO_READ ||
+ laiocb->type == QEMU_AIO_WRITE)) {
+ ret = laio_resubmit_short_io(laiocb, ret);
+ if (!ret) {
+ return;
+ }
} else if (ret >= 0) {
- /* Short reads mean EOF, pad with zeros. */
+ /*
+ * For normal reads and writes, we only get here if ret == 0, which
+ * means EOF for reads and ENOSPC for writes.
+ * For zone-append, we get here with any ret >= 0, which we just
+ * treat as ENOSPC, too (safer than resubmitting, probably, but not
+ * 100 % clear).
+ */
if (laiocb->type == QEMU_AIO_READ) {
- qemu_iovec_memset(laiocb->qiov, ret, 0,
- laiocb->qiov->size - ret);
+ qemu_iovec_memset(laiocb->qiov, laiocb->total_done, 0,
+ laiocb->qiov->size - laiocb->total_done);
} else {
ret = -ENOSPC;
}
@@ -103,6 +140,9 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
}
laiocb->ret = ret;
+ if (laiocb->resubmit_qiov.iov) {
+ qemu_iovec_destroy(&laiocb->resubmit_qiov);
+ }
/*
* If the coroutine is already entered it must be in ioq_submit() and
@@ -379,7 +419,11 @@ static int laio_do_submit(struct qemu_laiocb *laiocb)
struct iocb *iocbs = &laiocb->iocb;
QEMUIOVector *qiov = laiocb->qiov;
int fd = laiocb->fd;
- off_t offset = laiocb->offset;
+ off_t offset = laiocb->offset + laiocb->total_done;
+
+ if (laiocb->resubmit_qiov.iov) {
+ qiov = &laiocb->resubmit_qiov;
+ }
switch (laiocb->type) {
case QEMU_AIO_WRITE:
--
2.53.0
next prev parent reply other threads:[~2026-03-24 18:57 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-24 18:56 [PULL 0/4] Block layer patches Kevin Wolf
2026-03-24 18:56 ` [PULL 1/4] block/curl: free s->password in cleanup paths Kevin Wolf
2026-03-24 18:56 ` [PULL 2/4] linux-aio: Put all parameters into qemu_laiocb Kevin Wolf
2026-03-24 18:56 ` Kevin Wolf [this message]
2026-03-24 18:56 ` [PULL 4/4] io-uring: Resubmit tails of short writes Kevin Wolf
2026-03-25 16:52 ` [PULL 0/4] Block layer patches Peter Maydell
2026-03-26 5:02 ` Michael Tokarev
2026-03-26 12:50 ` Kevin Wolf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260324185619.296946-4-kwolf@redhat.com \
--to=kwolf@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.