From: Kevin Wolf <kwolf@redhat.com>
To: qemu-block@nongnu.org
Cc: kwolf@redhat.com, qemu-devel@nongnu.org
Subject: [PULL 4/4] io-uring: Resubmit tails of short writes
Date: Tue, 24 Mar 2026 19:56:19 +0100 [thread overview]
Message-ID: <20260324185619.296946-5-kwolf@redhat.com> (raw)
In-Reply-To: <20260324185619.296946-1-kwolf@redhat.com>
From: Hanna Czenczek <hreitz@redhat.com>
Short writes can happen, too, not just short reads. The difference from
aio=native is that the kernel already retries the tail of short
requests internally, so short writes are harder to reproduce. But if
the tail of a short request returns an error to the kernel, we will
still see the short write in userspace. To reproduce this, apply the
following patch on top of the one shown in HEAD^ (again, run
%s/escaped // to apply):
escaped diff --git a/block/export/fuse.c b/block/export/fuse.c
escaped index 67dc50a412..2b98489a32 100644
escaped --- a/block/export/fuse.c
escaped +++ b/block/export/fuse.c
@@ -1059,8 +1059,15 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size)
int64_t blk_len;
void *buf;
int ret;
+ static uint32_t error_size;
- size = MIN(size, 4096);
+ if (error_size == size) {
+ error_size = 0;
+ return -EIO;
+ } else if (size > 4096) {
+ error_size = size - 4096;
+ size = 4096;
+ }
/* Limited by max_read, should not happen */
if (size > FUSE_MAX_READ_BYTES) {
@@ -1111,8 +1118,15 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
{
int64_t blk_len;
int ret;
+ static uint32_t error_size;
- size = MIN(size, 4096);
+ if (error_size == size) {
+ error_size = 0;
+ return -EIO;
+ } else if (size > 4096) {
+ error_size = size - 4096;
+ size = 4096;
+ }
QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES);
/* Limited by max_write, should not happen */
I know this is a bit artificial because, to produce this, there must be
an I/O error somewhere anyway. But if it does happen, qemu will
interpret the short write as ENOSPC, which is incorrect. So I believe we
need to resubmit the tail, either to have it succeed this time or at
least to get the correct error code.
Reproducer as before:
$ ./qemu-img create -f raw test.raw 8k
Formatting 'test.raw', fmt=raw size=8192
$ ./qemu-io -f raw -c 'write -P 42 0 8k' test.raw
wrote 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (64.804 MiB/sec and 8294.9003 ops/sec)
$ hexdump -C test.raw
00000000 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a |****************|
*
00002000
$ storage-daemon/qemu-storage-daemon \
--blockdev file,node-name=test,filename=test.raw \
--export fuse,id=exp,node-name=test,mountpoint=test.raw,writable=true
$ ./qemu-io --image-opts -c 'read -P 23 0 8k' \
driver=file,filename=test.raw,cache.direct=on,aio=io_uring
read 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (58.481 MiB/sec and 7485.5342 ops/sec)
$ ./qemu-io --image-opts -c 'write -P 23 0 8k' \
driver=file,filename=test.raw,cache.direct=on,aio=io_uring
write failed: No space left on device
$ hexdump -C test.raw
00000000 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 |................|
*
00001000 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a |****************|
*
00002000
So short reads already work (because there is code for that), but short
writes incorrectly produce ENOSPC. This patch fixes that by
resubmitting the tail not only of short reads but also of short writes.
(This patch also takes the opportunity to call qemu_iovec_destroy()
only if req->resubmit_qiov.iov is non-NULL. Functionally this is a
no-op, but it is how the code generally checks whether the
resubmit_qiov has been set up or not.)
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
Message-ID: <20260324084338.37453-4-hreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
block/io_uring.c | 82 +++++++++++++++++++++++++---------------------
block/trace-events | 2 +-
2 files changed, 46 insertions(+), 38 deletions(-)
diff --git a/block/io_uring.c b/block/io_uring.c
index cb131d3b8b5..c48a72d37eb 100644
--- a/block/io_uring.c
+++ b/block/io_uring.c
@@ -27,10 +27,10 @@ typedef struct {
BdrvRequestFlags flags;
/*
- * Buffered reads may require resubmission, see
- * luring_resubmit_short_read().
+ * Short reads/writes require resubmission, see
+ * luring_resubmit_short_io().
*/
- int total_read;
+ int total_done;
QEMUIOVector resubmit_qiov;
CqeHandler cqe_handler;
@@ -40,10 +40,14 @@ static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
{
LuringRequest *req = opaque;
QEMUIOVector *qiov = req->qiov;
- uint64_t offset = req->offset;
+ uint64_t offset = req->offset + req->total_done;
int fd = req->fd;
BdrvRequestFlags flags = req->flags;
+ if (req->resubmit_qiov.iov) {
+ qiov = &req->resubmit_qiov;
+ }
+
switch (req->type) {
case QEMU_AIO_WRITE:
{
@@ -73,17 +77,12 @@ static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
break;
case QEMU_AIO_READ:
{
- if (req->resubmit_qiov.iov != NULL) {
- qiov = &req->resubmit_qiov;
- }
if (qiov->niov > 1) {
- io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov,
- offset + req->total_read);
+ io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov, offset);
} else {
/* The man page says non-vectored is faster than vectored */
struct iovec *iov = qiov->iov;
- io_uring_prep_read(sqe, fd, iov->iov_base, iov->iov_len,
- offset + req->total_read);
+ io_uring_prep_read(sqe, fd, iov->iov_base, iov->iov_len, offset);
}
break;
}
@@ -98,21 +97,26 @@ static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
}
/**
- * luring_resubmit_short_read:
+ * luring_resubmit_short_io:
*
- * Short reads are rare but may occur. The remaining read request needs to be
- * resubmitted.
+ * Short reads and writes are rare but may occur. The remaining request needs
+ * to be resubmitted.
+ *
+ * For example, short reads can be reproduced by a FUSE export deliberately
+ * executing short reads. The tail of short writes is generally resubmitted by
+ * io-uring in the kernel, but if that resubmission encounters an I/O error, the
+ * already submitted portion will be returned as a short write.
*/
-static void luring_resubmit_short_read(LuringRequest *req, int nread)
+static void luring_resubmit_short_io(LuringRequest *req, int ndone)
{
QEMUIOVector *resubmit_qiov;
size_t remaining;
- trace_luring_resubmit_short_read(req, nread);
+ trace_luring_resubmit_short_io(req, ndone);
- /* Update read position */
- req->total_read += nread;
- remaining = req->qiov->size - req->total_read;
+ /* Update I/O position */
+ req->total_done += ndone;
+ remaining = req->qiov->size - req->total_done;
/* Shorten qiov */
resubmit_qiov = &req->resubmit_qiov;
@@ -121,7 +125,7 @@ static void luring_resubmit_short_read(LuringRequest *req, int nread)
} else {
qemu_iovec_reset(resubmit_qiov);
}
- qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_read, remaining);
+ qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_done, remaining);
aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
}
@@ -153,31 +157,35 @@ static void luring_cqe_handler(CqeHandler *cqe_handler)
return;
}
} else if (req->qiov) {
- /* total_read is non-zero only for resubmitted read requests */
- int total_bytes = ret + req->total_read;
+ /* total_done is non-zero only for resubmitted requests */
+ int total_bytes = ret + req->total_done;
if (total_bytes == req->qiov->size) {
ret = 0;
- } else {
+ } else if (ret > 0 && (req->type == QEMU_AIO_READ ||
+ req->type == QEMU_AIO_WRITE)) {
/* Short Read/Write */
- if (req->type == QEMU_AIO_READ) {
- if (ret > 0) {
- luring_resubmit_short_read(req, ret);
- return;
- }
-
- /* Pad with zeroes */
- qemu_iovec_memset(req->qiov, total_bytes, 0,
- req->qiov->size - total_bytes);
- ret = 0;
- } else {
- ret = -ENOSPC;
- }
+ luring_resubmit_short_io(req, ret);
+ return;
+ } else if (req->type == QEMU_AIO_READ) {
+ /* Read ret == 0: EOF, pad with zeroes */
+ qemu_iovec_memset(req->qiov, total_bytes, 0,
+ req->qiov->size - total_bytes);
+ ret = 0;
+ } else {
+ /*
+ * Normal write ret == 0 means ENOSPC.
+ * For zone-append, we treat any 0 <= ret < qiov->size as ENOSPC,
+ * too, because resubmitting the tail seems a little unsafe.
+ */
+ ret = -ENOSPC;
}
}
req->ret = ret;
- qemu_iovec_destroy(&req->resubmit_qiov);
+ if (req->resubmit_qiov.iov) {
+ qemu_iovec_destroy(&req->resubmit_qiov);
+ }
/*
* If the coroutine is already entered it must be in luring_co_submit() and
diff --git a/block/trace-events b/block/trace-events
index d170fc96f15..950c82d4b80 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -64,7 +64,7 @@ file_paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "
# io_uring.c
luring_cqe_handler(void *req, int ret) "req %p ret %d"
luring_co_submit(void *bs, void *req, int fd, uint64_t offset, size_t nbytes, int type) "bs %p req %p fd %d offset %" PRId64 " nbytes %zd type %d"
-luring_resubmit_short_read(void *req, int nread) "req %p nread %d"
+luring_resubmit_short_io(void *req, int ndone) "req %p ndone %d"
# qcow2.c
qcow2_add_task(void *co, void *bs, void *pool, const char *action, int cluster_type, uint64_t host_offset, uint64_t offset, uint64_t bytes, void *qiov, size_t qiov_offset) "co %p bs %p pool %p: %s: cluster_type %d file_cluster_offset %" PRIu64 " offset %" PRIu64 " bytes %" PRIu64 " qiov %p qiov_offset %zu"
--
2.53.0
next prev parent reply other threads:[~2026-03-24 18:57 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-24 18:56 [PULL 0/4] Block layer patches Kevin Wolf
2026-03-24 18:56 ` [PULL 1/4] block/curl: free s->password in cleanup paths Kevin Wolf
2026-03-24 18:56 ` [PULL 2/4] linux-aio: Put all parameters into qemu_laiocb Kevin Wolf
2026-03-24 18:56 ` [PULL 3/4] linux-aio: Resubmit tails of short reads/writes Kevin Wolf
2026-03-24 18:56 ` Kevin Wolf [this message]
2026-03-25 16:52 ` [PULL 0/4] Block layer patches Peter Maydell
2026-03-26 5:02 ` Michael Tokarev
2026-03-26 12:50 ` Kevin Wolf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260324185619.296946-5-kwolf@redhat.com \
--to=kwolf@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox