All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kevin Wolf <kwolf@redhat.com>
To: Hanna Czenczek <hreitz@redhat.com>
Cc: qemu-block@nongnu.org, qemu-devel@nongnu.org,
	Aarushi Mehta <mehta.aaru20@gmail.com>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	Stefano Garzarella <sgarzare@redhat.com>
Subject: Re: [PATCH for-11.0 2/3] linux-aio: Resubmit tails of short reads/writes
Date: Mon, 23 Mar 2026 18:12:01 +0100	[thread overview]
Message-ID: <acF0V3W4aWhP-Zlg@redhat.com> (raw)
In-Reply-To: <20260318153206.171494-3-hreitz@redhat.com>

Am 18.03.2026 um 16:32 hat Hanna Czenczek geschrieben:
> Short reads/writes can happen.  One way to reproduce them is via our
> FUSE export, with the following diff applied (%s/escaped // to apply --
> if you put plain diffs in commit messages, git-am will apply them, and I
> would rather avoid breaking FUSE accidentally via this patch):
> 
> escaped diff --git a/block/export/fuse.c b/block/export/fuse.c
> escaped index a2a478d293..67dc50a412 100644
> escaped --- a/block/export/fuse.c
> escaped +++ b/block/export/fuse.c
> @@ -828,7 +828,7 @@ static ssize_t coroutine_fn GRAPH_RDLOCK
>  fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
>               const struct fuse_init_in_compat *in)
>  {
> -    const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
> +    const uint32_t supported_flags = FUSE_ASYNC_READ;
> 
>      if (in->major != 7) {
>          error_report("FUSE major version mismatch: We have 7, but kernel has %"
> @@ -1060,6 +1060,8 @@ fuse_co_read(FuseExport *exp, void **bufptr, uint64_t offset, uint32_t size)
>      void *buf;
>      int ret;
> 
> +    size = MIN(size, 4096);
> +
>      /* Limited by max_read, should not happen */
>      if (size > FUSE_MAX_READ_BYTES) {
>          return -EINVAL;
> @@ -1110,6 +1112,8 @@ fuse_co_write(FuseExport *exp, struct fuse_write_out *out,
>      int64_t blk_len;
>      int ret;
> 
> +    size = MIN(size, 4096);
> +
>      QEMU_BUILD_BUG_ON(FUSE_MAX_WRITE_BYTES > BDRV_REQUEST_MAX_BYTES);
>      /* Limited by max_write, should not happen */
>      if (size > FUSE_MAX_WRITE_BYTES) {
> 
> Then:
> $ ./qemu-img create -f raw test.raw 8k
> Formatting 'test.raw', fmt=raw size=8192
> $ ./qemu-io -f raw -c 'write -P 42 0 8k' test.raw
> wrote 8192/8192 bytes at offset 0
> 8 KiB, 1 ops; 00.00 sec (64.804 MiB/sec and 8294.9003 ops/sec)
> $ hexdump -C test.raw
> 00000000  2a 2a 2a 2a 2a 2a 2a 2a  2a 2a 2a 2a 2a 2a 2a 2a  |****************|
> *
> 00002000
> 
> With aio=threads, short I/O works:
> $ storage-daemon/qemu-storage-daemon \
>     --blockdev file,node-name=test,filename=test.raw \
>     --export fuse,id=exp,node-name=test,mountpoint=test.raw,writable=true
> 
> Other shell:
> $ ./qemu-io --image-opts -c 'read -P 42 0 8k' \
>     driver=file,filename=test.raw,cache.direct=on,aio=threads
> read 8192/8192 bytes at offset 0
> 8 KiB, 1 ops; 00.00 sec (36.563 MiB/sec and 4680.0923 ops/sec)
> $ ./qemu-io --image-opts -c 'write -P 23 0 8k' \
>     driver=file,filename=test.raw,cache.direct=on,aio=threads
> wrote 8192/8192 bytes at offset 0
> 8 KiB, 1 ops; 00.00 sec (35.995 MiB/sec and 4607.2970 ops/sec)
> $ hexdump -C test.raw
> 00000000  17 17 17 17 17 17 17 17  17 17 17 17 17 17 17 17  |................|
> *
> 00002000
> 
> But with aio=native, it does not:
> $ ./qemu-io --image-opts -c 'read -P 23 0 8k' \
>     driver=file,filename=test.raw,cache.direct=on,aio=native
> Pattern verification failed at offset 0, 8192 bytes
> read 8192/8192 bytes at offset 0
> 8 KiB, 1 ops; 00.00 sec (86.155 MiB/sec and 11027.7900 ops/sec)
> $ ./qemu-io --image-opts -c 'write -P 42 0 8k' \
>     driver=file,filename=test.raw,cache.direct=on,aio=native
> write failed: No space left on device
> $ hexdump -C test.raw
> 00000000  2a 2a 2a 2a 2a 2a 2a 2a  2a 2a 2a 2a 2a 2a 2a 2a  |****************|
> *
> 00001000  17 17 17 17 17 17 17 17  17 17 17 17 17 17 17 17  |................|
> *
> 00002000
> 
> This patch fixes that.
> 
> Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
> ---
>  block/linux-aio.c | 61 ++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 53 insertions(+), 8 deletions(-)
> 
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index 1f25339dc9..01621d4794 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -46,6 +46,10 @@ struct qemu_laiocb {
>      size_t nbytes;
>      QEMUIOVector *qiov;
>  
> +    /* For handling short reads/writes */
> +    size_t total_done;
> +    QEMUIOVector resubmit_qiov;
> +
>      int type;
>      BdrvRequestFlags flags;
>      uint64_t dev_max_batch;
> @@ -73,28 +77,61 @@ struct LinuxAioState {
>  };
>  
>  static void ioq_submit(LinuxAioState *s);
> +static int laio_do_submit(struct qemu_laiocb *laiocb);
>  
>  static inline ssize_t io_event_ret(struct io_event *ev)
>  {
>      return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
>  }
>  
> +/**
> + * Retry tail of short requests.
> + */
> +static int laio_resubmit_short_io(struct qemu_laiocb *laiocb, size_t done)
> +{
> +    QEMUIOVector *resubmit_qiov = &laiocb->resubmit_qiov;
> +
> +    laiocb->total_done += done;
> +
> +    if (!resubmit_qiov->iov) {
> +        qemu_iovec_init(resubmit_qiov, laiocb->qiov->niov);
> +    } else {
> +        qemu_iovec_reset(resubmit_qiov);
> +    }
> +    qemu_iovec_concat(resubmit_qiov, laiocb->qiov,
> +                      laiocb->total_done, laiocb->nbytes - laiocb->total_done);
> +
> +    return laio_do_submit(laiocb);
> +}
> +
>  /*
>   * Completes an AIO request.
>   */
>  static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
>  {
> -    int ret;
> +    ssize_t ret;
>  
>      ret = laiocb->ret;
>      if (ret != -ECANCELED) {
> -        if (ret == laiocb->nbytes) {
> +        if (ret == laiocb->nbytes - laiocb->total_done) {
>              ret = 0;
> +        } else if (ret > 0 && (laiocb->type == QEMU_AIO_READ ||
> +                               laiocb->type == QEMU_AIO_WRITE)) {
> +            ret = laio_resubmit_short_io(laiocb, ret);
> +            if (!ret) {
> +                return;
> +            }
>          } else if (ret >= 0) {
> -            /* Short reads mean EOF, pad with zeros. */
> +            /*
> +             * For normal reads and writes, we only get here if ret == 0, which
> +             * means EOF for reads and ENOSPC for writes.
> +             * For zone-append, we get here with any ret >= 0, which we just
> +             * treat as ENOSPC, too (safer than resubmitting, probably, but not
> +             * 100 % clear).
> +             */
>              if (laiocb->type == QEMU_AIO_READ) {
> -                qemu_iovec_memset(laiocb->qiov, ret, 0,
> -                    laiocb->qiov->size - ret);
> +                qemu_iovec_memset(laiocb->qiov, laiocb->total_done, 0,
> +                                  laiocb->qiov->size - laiocb->total_done);
>              } else {
>                  ret = -ENOSPC;
>              }
> @@ -102,6 +139,7 @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
>      }
>  
>      laiocb->ret = ret;
> +    qemu_iovec_destroy(&laiocb->resubmit_qiov);

Calling qemu_iovec_destroy() for a qiov that has potentially never been
initialised feels a bit unsafe to me. It will work in practice for the
current implementation, but maybe make this one conditional on
laiocb->resubmit_qiov.iov, too? (Which is already making assumptions
about the internals of QEMUIOVector, but that we'll have an iov after
initialising the qiov with qemu_iovec_init() above will probably never
change.)

>      /*
>       * If the coroutine is already entered it must be in ioq_submit() and
> @@ -380,23 +418,30 @@ static int laio_do_submit(struct qemu_laiocb *laiocb)
>      int fd = laiocb->fd;
>      off_t offset = laiocb->offset;

I wonder if making it laiocb->offset + laiocb->total_done here wouldn't
be more robust than having the addition in every call below.

> +    if (laiocb->resubmit_qiov.iov) {
> +        qiov = &laiocb->resubmit_qiov;
> +    }
> +
>      switch (laiocb->type) {
>      case QEMU_AIO_WRITE:
>  #ifdef HAVE_IO_PREP_PWRITEV2
>      {
>          int laio_flags = (laiocb->flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;
> -        io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov, offset, laio_flags);
> +        io_prep_pwritev2(iocbs, fd, qiov->iov, qiov->niov,
> +                         offset + laiocb->total_done, laio_flags);
>      }
>  #else
>          assert(laiocb->flags == 0);
> -        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
> +        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov,
> +                        offset + laiocb->total_done);
>  #endif
>          break;
>      case QEMU_AIO_ZONE_APPEND:
>          io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
>          break;
>      case QEMU_AIO_READ:
> -        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
> +        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov,
> +                       offset + laiocb->total_done);
>          break;
>      case QEMU_AIO_FLUSH:
>          io_prep_fdsync(iocbs, fd);

Reviewed-by: Kevin Wolf <kwolf@redhat.com>



  reply	other threads:[~2026-03-23 17:13 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-18 15:32 [PATCH for-11.0 0/3] linux-aio/io-uring: Resubmit tails of short requests Hanna Czenczek
2026-03-18 15:32 ` [PATCH for-11.0 1/3] linux-aio: Put all parameters into qemu_laiocb Hanna Czenczek
2026-03-23 16:36   ` Kevin Wolf
2026-03-23 17:02     ` Hanna Czenczek
2026-03-23 17:04       ` Hanna Czenczek
2026-03-23 19:10         ` Kevin Wolf
2026-03-18 15:32 ` [PATCH for-11.0 2/3] linux-aio: Resubmit tails of short reads/writes Hanna Czenczek
2026-03-23 17:12   ` Kevin Wolf [this message]
2026-03-24  8:12     ` Hanna Czenczek
2026-03-24  8:22       ` Hanna Czenczek
2026-03-24  9:22         ` Kevin Wolf
2026-03-24 10:04           ` Hanna Czenczek
2026-03-18 15:32 ` [PATCH for-11.0 3/3] io-uring: Resubmit tails of short writes Hanna Czenczek
2026-03-23 19:05   ` Kevin Wolf
2026-03-23 16:28 ` [PATCH for-11.0 0/3] linux-aio/io-uring: Resubmit tails of short requests Kevin Wolf
2026-03-23 16:59   ` Hanna Czenczek

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=acF0V3W4aWhP-Zlg@redhat.com \
    --to=kwolf@redhat.com \
    --cc=hreitz@redhat.com \
    --cc=mehta.aaru20@gmail.com \
    --cc=qemu-block@nongnu.org \
    --cc=qemu-devel@nongnu.org \
    --cc=sgarzare@redhat.com \
    --cc=stefanha@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.