* [PATCH v5] block/file-posix.c: Use pwritev2() with RWF_DSYNC for FUA
@ 2025-04-05 23:52 Pinku Deb Nath
2025-04-07 13:55 ` Stefan Hajnoczi
0 siblings, 1 reply; 2+ messages in thread
From: Pinku Deb Nath @ 2025-04-05 23:52 UTC (permalink / raw)
To: Kevin Wolf, Stefan Hajnoczi; +Cc: qemu-block, qemu-devel, Pinku Deb Nath
Force Unit Access (FUA) is an optimization where a disk write with the
flag set will be persisted to disk immediately instead of potentially
remaining in the disk's write cache.
This commit addresses the TODO task
for using pwritev2() with RWF_DSYNC in the thread pool section of
raw_co_prw(), if pwritev2() with RWF_DSYNC is available in the host,
which is always the case for Linux kernel >= 4.7.
The intent for FUA is indicated with the BDRV_REQ_FUA flag.
The old code paths are preserved in case BDRV_REQ_FUA is off
or pwritev2() with RWF_DSYNC is not available.
Support for disk writes with FUA is handled in qemu_pwritev_fua(),
which uses pwritev2() with RWF_DSYNC if available, otherwise falls
back to pwritev2() with no flags followed by flush using
handle_aiocb_flush().
If pwritev2() is not implemented, then the disk write in the linear FUA
path will fall back to pwrite() + handle_aiocb_flush().
Signed-off-by: Pinku Deb Nath <prantoran@gmail.com>
---
v4:
- Add fallback when qemu_pwritev_fua() returns ENOSYS
- Similar fallback was not added for handle_aiocb_rw_vector()
since there is a preadv_present check in handle_aiocb_rw()
v3:
- Changed signature to add fd, iov, nr_iov
- Return -ENOSYS for non-Linux hosts
v2:
- Moved handle_aiocb_flush() into qemu_pwritev_fua()
- In handle_aiocb_rw_linear(), iovec with iovcnt=1 is created
based on the assumption that there will be only one buffer
---
block/file-posix.c | 68 ++++++++++++++++++++++++++++++++++++++--------
1 file changed, 56 insertions(+), 12 deletions(-)
diff --git a/block/file-posix.c b/block/file-posix.c
index 56d1972d15..59bed7866a 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -229,6 +229,7 @@ typedef struct RawPosixAIOData {
unsigned long op;
} zone_mgmt;
};
+ BdrvRequestFlags flags;
} RawPosixAIOData;
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -1674,6 +1675,20 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
return pwritev(fd, iov, nr_iov, offset);
}
+static ssize_t
+qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const RawPosixAIOData *aiocb)
+{
+#ifdef RWF_DSYNC
+ return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC);
+#else
+ ssize_t len = pwritev2(fd, iov, nr_iov, offset, 0);
+ if (len == 0) {
+ len = handle_aiocb_flush(aiocb);
+ }
+ return len;
+#endif
+}
+
#else
static bool preadv_present = false;
@@ -1690,6 +1705,11 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
return -ENOSYS;
}
+static ssize_t
+qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const RawPosixAIOData *aiocb)
+{
+ return -ENOSYS;
+}
#endif
static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
@@ -1698,10 +1718,16 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
len = RETRY_ON_EINTR(
(aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
- qemu_pwritev(aiocb->aio_fildes,
- aiocb->io.iov,
- aiocb->io.niov,
- aiocb->aio_offset) :
+ (aiocb->flags & BDRV_REQ_FUA) ?
+ qemu_pwritev_fua(aiocb->aio_fildes,
+ aiocb->io.iov,
+ aiocb->io.niov,
+ aiocb->aio_offset,
+ aiocb) :
+ qemu_pwritev(aiocb->aio_fildes,
+ aiocb->io.iov,
+ aiocb->io.niov,
+ aiocb->aio_offset) :
qemu_preadv(aiocb->aio_fildes,
aiocb->io.iov,
aiocb->io.niov,
@@ -1727,10 +1753,31 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
while (offset < aiocb->aio_nbytes) {
if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
- len = pwrite(aiocb->aio_fildes,
- (const char *)buf + offset,
- aiocb->aio_nbytes - offset,
- aiocb->aio_offset + offset);
+ if (aiocb->flags & BDRV_REQ_FUA) {
+ struct iovec iov = {
+ .iov_base = buf + offset,
+ .iov_len = aiocb->aio_nbytes - offset,
+ };
+ len = qemu_pwritev_fua(aiocb->aio_fildes,
+ &iov,
+ 1,
+ aiocb->aio_offset + offset,
+ aiocb);
+ if (len == -ENOSYS) {
+ len = pwrite(aiocb->aio_fildes,
+ (const char *)buf + offset,
+ aiocb->aio_nbytes - offset,
+ aiocb->aio_offset + offset);
+ if (len == 0) {
+ len = handle_aiocb_flush(aiocb);
+ }
+ }
+ } else {
+ len = pwrite(aiocb->aio_fildes,
+ (const char *)buf + offset,
+ aiocb->aio_nbytes - offset,
+ aiocb->aio_offset + offset);
+ }
} else {
len = pread(aiocb->aio_fildes,
buf + offset,
@@ -2539,14 +2586,11 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
.iov = qiov->iov,
.niov = qiov->niov,
},
+ .flags = flags,
};
assert(qiov->size == bytes);
ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
- if (ret == 0 && (flags & BDRV_REQ_FUA)) {
- /* TODO Use pwritev2() instead if it's available */
- ret = raw_co_flush_to_disk(bs);
- }
goto out; /* Avoid the compiler err of unused label */
out:
--
2.43.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v5] block/file-posix.c: Use pwritev2() with RWF_DSYNC for FUA
2025-04-05 23:52 [PATCH v5] block/file-posix.c: Use pwritev2() with RWF_DSYNC for FUA Pinku Deb Nath
@ 2025-04-07 13:55 ` Stefan Hajnoczi
0 siblings, 0 replies; 2+ messages in thread
From: Stefan Hajnoczi @ 2025-04-07 13:55 UTC (permalink / raw)
To: Pinku Deb Nath; +Cc: Kevin Wolf, qemu-block, qemu-devel
[-- Attachment #1: Type: text/plain, Size: 6788 bytes --]
On Sat, Apr 05, 2025 at 04:52:29PM -0700, Pinku Deb Nath wrote:
> Full Unit Access (FUA) is an optimization where a disk write with the
> flag set will be persisted to disk immediately instead of potentially
> remaining in the disk's write cache.
>
> This commit address the todo task
> for using pwritev2() with RWF_DSYNC in the thread pool section of
> raw_co_prw(), if pwritev2() with RWF_DSYNC is available in the host,
> which is always the case for Linux kernel >= 4.7.
>
> The intent for FUA is indicated with the BDRV_REQ_FUA flag.
> The old code paths are preserved in case BDRV_REQ_FUA is off
> or pwritev2() with RWF_DSYNC is not available.
>
> Support for disk writes with FUA is handled in qemu_pwritev_fua(),
> which uses pwritev2() with RWF_DSYNC if available, otherwise falls
> back to pwritev2() with no flags followed by flush using
> handle_aiocb_flush().
>
> If pwritev2() is not implemented, then disk write in the linear FUA
> will fallback to pwrite() + handle_aiocb_flush().
>
> Signed-off-by: Pinku Deb Nath <prantoran@gmail.com>
>
> ---
>
> v4:
> - Add fallback when qemu_pwritev_fua() returns ENOSYS
> - Similar fallback was not added for handle_aiocb_rw_vector()
> since there is a preadv_present check in handle_aiocb_rw()
>
> v3:
> - Changed signature to add fd, iov, nr_iov
> - Return -ENOSYS for non-Linux hosts
>
> v2:
> - Moved handle_aiocb_flush() into qemu_pwritev_fua()
> - In handle_aiocb_rw_linear(), iovec with iovcnt=1 is created
> based on the assumption that there will be only one buffer
> ---
> block/file-posix.c | 68 ++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 56 insertions(+), 12 deletions(-)
>
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 56d1972d15..59bed7866a 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -229,6 +229,7 @@ typedef struct RawPosixAIOData {
> unsigned long op;
> } zone_mgmt;
> };
> + BdrvRequestFlags flags;
> } RawPosixAIOData;
>
> #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> @@ -1674,6 +1675,20 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
> return pwritev(fd, iov, nr_iov, offset);
> }
>
> +static ssize_t
> +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const RawPosixAIOData *aiocb)
> +{
> +#ifdef RWF_DSYNC
> + return pwritev2(fd, iov, nr_iov, offset, RWF_DSYNC);
> +#else
> + ssize_t len = pwritev2(fd, iov, nr_iov, offset, 0);
This will fail to compile on non-Linux OSes that provide preadv(2)
(CONFIG_PREADV) because they do not have pwritev2(2). This can be fixed
by using pwritev() since the flags aren't needed:
ssize_t len = pwritev(fd, iov, nr_iov, offset);
> + if (len == 0) {
> + len = handle_aiocb_flush(aiocb);
> + }
> + return len;
> +#endif
> +}
> +
> #else
>
> static bool preadv_present = false;
> @@ -1690,6 +1705,11 @@ qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset)
> return -ENOSYS;
> }
>
> +static ssize_t
> +qemu_pwritev_fua(int fd, struct iovec *iov, int nr_iov, off_t offset, const RawPosixAIOData *aiocb)
> +{
> + return -ENOSYS;
> +}
> #endif
>
> static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
> @@ -1698,10 +1718,16 @@ static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb)
>
> len = RETRY_ON_EINTR(
> (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) ?
> - qemu_pwritev(aiocb->aio_fildes,
> - aiocb->io.iov,
> - aiocb->io.niov,
> - aiocb->aio_offset) :
> + (aiocb->flags & BDRV_REQ_FUA) ?
> + qemu_pwritev_fua(aiocb->aio_fildes,
> + aiocb->io.iov,
> + aiocb->io.niov,
> + aiocb->aio_offset,
> + aiocb) :
> + qemu_pwritev(aiocb->aio_fildes,
> + aiocb->io.iov,
> + aiocb->io.niov,
> + aiocb->aio_offset) :
> qemu_preadv(aiocb->aio_fildes,
> aiocb->io.iov,
> aiocb->io.niov,
> @@ -1727,10 +1753,31 @@ static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf)
>
> while (offset < aiocb->aio_nbytes) {
> if (aiocb->aio_type & (QEMU_AIO_WRITE | QEMU_AIO_ZONE_APPEND)) {
> - len = pwrite(aiocb->aio_fildes,
> - (const char *)buf + offset,
> - aiocb->aio_nbytes - offset,
> - aiocb->aio_offset + offset);
> + if (aiocb->flags & BDRV_REQ_FUA) {
> + struct iovec iov = {
> + .iov_base = buf + offset,
> + .iov_len = aiocb->aio_nbytes - offset,
> + };
> + len = qemu_pwritev_fua(aiocb->aio_fildes,
> + &iov,
> + 1,
> + aiocb->aio_offset + offset,
> + aiocb);
> + if (len == -ENOSYS) {
> + len = pwrite(aiocb->aio_fildes,
> + (const char *)buf + offset,
> + aiocb->aio_nbytes - offset,
> + aiocb->aio_offset + offset);
> + if (len == 0) {
> + len = handle_aiocb_flush(aiocb);
> + }
> + }
> + } else {
> + len = pwrite(aiocb->aio_fildes,
> + (const char *)buf + offset,
> + aiocb->aio_nbytes - offset,
> + aiocb->aio_offset + offset);
> + }
> } else {
> len = pread(aiocb->aio_fildes,
> buf + offset,
> @@ -2539,14 +2586,11 @@ static int coroutine_fn raw_co_prw(BlockDriverState *bs, int64_t *offset_ptr,
> .iov = qiov->iov,
> .niov = qiov->niov,
> },
> + .flags = flags,
> };
>
> assert(qiov->size == bytes);
> ret = raw_thread_pool_submit(handle_aiocb_rw, &acb);
> - if (ret == 0 && (flags & BDRV_REQ_FUA)) {
> - /* TODO Use pwritev2() instead if it's available */
> - ret = raw_co_flush_to_disk(bs);
> - }
> goto out; /* Avoid the compiler err of unused label */
>
> out:
> --
> 2.43.0
>
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-04-07 13:56 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2025-04-05 23:52 [PATCH v5] block/file-posix.c: Use pwritev2() with RWF_DSYNC for FUA Pinku Deb Nath
2025-04-07 13:55 ` Stefan Hajnoczi
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).