public inbox for qemu-devel@nongnu.org
 help / color / mirror / Atom feed
From: Kevin Wolf <kwolf@redhat.com>
To: qemu-block@nongnu.org
Cc: kwolf@redhat.com, qemu-devel@nongnu.org
Subject: [PULL 3/4] linux-aio: Resubmit tails of short reads/writes
Date: Tue, 24 Mar 2026 19:56:18 +0100	[thread overview]
Message-ID: <20260324185619.296946-4-kwolf@redhat.com> (raw)
In-Reply-To: <20260324185619.296946-1-kwolf@redhat.com>

From: Hanna Czenczek <hreitz@redhat.com>

Short reads/writes can happen.  One way to reproduce them is via our
FUSE export, with the following diff applied (strip the "escaped " prefixes,
e.g. with %s/escaped //, before applying it -- git-am applies plain diffs that
it finds in commit messages, and I would rather avoid breaking FUSE
accidentally via this patch):

escaped diff --git a/block/export/fuse.c b/block/export/fuse.c
escaped index a2a478d293..67dc50a412 100644
escaped --- a/block/export/fuse.c
escaped +++ b/block/export/fuse.c
@@ -828,7 +828,7 @@ static ssize_t coroutine_fn GRAPH_RDLOCK
 fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
              const struct fuse_init_in_compat *in)
 {
-    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
+    const uint32_t supported_flags = FUSE_ASYNC_READ;

     if (in->major != 7) {
         error_report("FUSE major version mismatch: We have 7, but kernel has %"
@@ -1060,6 +1060,8 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size)
     void *buf;
     int ret;

+    size = MIN(size, 4096);
+
     /* Limited by max_read, should not happen */
     if (size > FUSE_MAX_READ_BYTES) {
         return -EINVAL;
@@ -1110,6 +1112,8 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
     int64_t blk_len;
     int ret;

+    size = MIN(size, 4096);
+
     QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES);
     /* Limited by max_write, should not happen */
     if (size > FUSE_MAX_WRITE_BYTES) {

Then:
$ ./qemu-img create -f raw test.raw 8k
Formatting 'test.raw', fmt=raw size=8192
$ ./qemu-io -f raw -c 'write -P 42 0 8k' test.raw
wrote 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (64.804 MiB/sec and 8294.9003 ops/sec)
$ hexdump -C test.raw
00000000  2a 2a 2a 2a 2a 2a 2a 2a  2a 2a 2a 2a 2a 2a 2a 2a  |****************|
*
00002000

With aio=threads, short I/O works:
$ storage-daemon/qemu-storage-daemon \
    --blockdev file,node-name=test,filename=test.raw \
    --export fuse,id=exp,node-name=test,mountpoint=test.raw,writable=true

Other shell:
$ ./qemu-io --image-opts -c 'read -P 42 0 8k' \
    driver=file,filename=test.raw,cache.direct=on,aio=threads
read 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (36.563 MiB/sec and 4680.0923 ops/sec)
$ ./qemu-io --image-opts -c 'write -P 23 0 8k' \
    driver=file,filename=test.raw,cache.direct=on,aio=threads
wrote 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (35.995 MiB/sec and 4607.2970 ops/sec)
$ hexdump -C test.raw
00000000  17 17 17 17 17 17 17 17  17 17 17 17 17 17 17 17  |................|
*
00002000

But with aio=native, it does not:
$ ./qemu-io --image-opts -c 'read -P 23 0 8k' \
    driver=file,filename=test.raw,cache.direct=on,aio=native
Pattern verification failed at offset 0, 8192 bytes
read 8192/8192 bytes at offset 0
8 KiB, 1 ops; 00.00 sec (86.155 MiB/sec and 11027.7900 ops/sec)
$ ./qemu-io --image-opts -c 'write -P 42 0 8k' \
    driver=file,filename=test.raw,cache.direct=on,aio=native
write failed: No space left on device
$ hexdump -C test.raw
00000000  2a 2a 2a 2a 2a 2a 2a 2a  2a 2a 2a 2a 2a 2a 2a 2a  |****************|
*
00001000  17 17 17 17 17 17 17 17  17 17 17 17 17 17 17 17  |................|
*
00002000

This patch fixes that.

Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
Message-ID: <20260324084338.37453-3-hreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/linux-aio.c | 56 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/block/linux-aio.c b/block/linux-aio.c
index 3843f45eac8..0a7424fbb33 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -45,6 +45,10 @@ struct qemu_laiocb {
     size_t nbytes;
     QEMUIOVector *qiov;
 
+    /* For handling short reads/writes */
+    size_t total_done;
+    QEMUIOVector resubmit_qiov;
+
     int fd;
     int type;
     BdrvRequestFlags flags;
@@ -74,28 +78,61 @@ struct LinuxAioState {
 };
 
 static void ioq_submit(LinuxAioState *s);
+static int laio_do_submit(struct qemu_laiocb *laiocb);
 
 static inline ssize_t io_event_ret(struct io_event *ev)
 {
     return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
 }
 
+/**
+ * Retry tail of short requests.
+ */
+static int laio_resubmit_short_io(struct qemu_laiocb *laiocb, size_t done)
+{
+    QEMUIOVector *resubmit_qiov = &laiocb->resubmit_qiov;
+
+    laiocb->total_done += done;
+
+    if (!resubmit_qiov->iov) {
+        qemu_iovec_init(resubmit_qiov, laiocb->qiov->niov);
+    } else {
+        qemu_iovec_reset(resubmit_qiov);
+    }
+    qemu_iovec_concat(resubmit_qiov, laiocb->qiov,
+                      laiocb->total_done, laiocb->nbytes - laiocb->total_done);
+
+    return laio_do_submit(laiocb);
+}
+
 /*
  * Completes an AIO request.
  */
 static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
-    int ret;
+    ssize_t ret;
 
     ret = laiocb->ret;
     if (ret != -ECANCELED) {
-        if (ret == laiocb->nbytes) {
+        if (ret == laiocb->nbytes - laiocb->total_done) {
             ret = 0;
+        } else if (ret > 0 && (laiocb->type == QEMU_AIO_READ ||
+                               laiocb->type == QEMU_AIO_WRITE)) {
+            ret = laio_resubmit_short_io(laiocb, ret);
+            if (!ret) {
+                return;
+            }
         } else if (ret >= 0) {
-            /* Short reads mean EOF, pad with zeros. */
+            /*
+             * For normal reads and writes, we only get here if ret == 0, which
+             * means EOF for reads and ENOSPC for writes.
+             * For zone-append, we get here with any ret >= 0, which we just
+             * treat as ENOSPC, too (safer than resubmitting, probably, but not
+             * 100 % clear).
+             */
             if (laiocb->type == QEMU_AIO_READ) {
-                qemu_iovec_memset(laiocb->qiov, ret, 0,
-                    laiocb->qiov->size - ret);
+                qemu_iovec_memset(laiocb->qiov, laiocb->total_done, 0,
+                                  laiocb->qiov->size - laiocb->total_done);
             } else {
                 ret = -ENOSPC;
             }
@@ -103,6 +140,9 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
     }
 
     laiocb->ret = ret;
+    if (laiocb->resubmit_qiov.iov) {
+        qemu_iovec_destroy(&laiocb->resubmit_qiov);
+    }
 
     /*
      * If the coroutine is already entered it must be in ioq_submit() and
@@ -379,7 +419,11 @@ static int laio_do_submit(struct qemu_laiocb *laiocb)
     struct iocb *iocbs = &laiocb->iocb;
     QEMUIOVector *qiov = laiocb->qiov;
     int fd = laiocb->fd;
-    off_t offset = laiocb->offset;
+    off_t offset = laiocb->offset + laiocb->total_done;
+
+    if (laiocb->resubmit_qiov.iov) {
+        qiov = &laiocb->resubmit_qiov;
+    }
 
     switch (laiocb->type) {
     case QEMU_AIO_WRITE:
-- 
2.53.0



  parent reply	other threads:[~2026-03-24 18:57 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-24 18:56 [PULL 0/4] Block layer patches Kevin Wolf
2026-03-24 18:56 ` [PULL 1/4] block/curl: free s->password in cleanup paths Kevin Wolf
2026-03-24 18:56 ` [PULL 2/4] linux-aio: Put all parameters into qemu_laiocb Kevin Wolf
2026-03-24 18:56 ` Kevin Wolf [this message]
2026-03-24 18:56 ` [PULL 4/4] io-uring: Resubmit tails of short writes Kevin Wolf
2026-03-25 16:52 ` [PULL 0/4] Block layer patches Peter Maydell
2026-03-26  5:02 ` Michael Tokarev
2026-03-26 12:50   ` Kevin Wolf

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260324185619.296946-4-kwolf@redhat.com \
    --to=kwolf@redhat.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox