From: Zhi Song <hibriansong@gmail.com>
To: qemu-block@nongnu.org
Cc: qemu-devel@nongnu.org, armbru@redhat.com, bernd@bsbernd.com,
fam@euphon.net, hibriansong@gmail.com, hreitz@redhat.com,
kwolf@redhat.com, stefanha@redhat.com
Subject: [PATCH 1/3] fuse: add FUSE-over-io_uring enable opt and init
Date: Thu, 14 Aug 2025 23:46:17 -0400 [thread overview]
Message-ID: <20250815034619.51980-2-hizhisong@gmail.com> (raw)
In-Reply-To: <20250815034619.51980-1-hizhisong@gmail.com>
From: Brian Song <hibriansong@gmail.com>
This patch adds a new export option for qemu-storage-daemon to enable
or disable FUSE-over-io_uring via the switch io-uring=on|off (disabled
by default). It also implements the protocol handshake with the Linux
kernel during the FUSE-over-io_uring initialization phase.
See: https://docs.kernel.org/filesystems/fuse-io-uring.html
The kernel documentation describes in detail how FUSE-over-io_uring
works. This patch implements the Initial SQE stage shown in the diagram:
it initializes one queue per IOThread, each currently supporting a
single submission queue entry (SQE). When the FUSE driver sends the
first FUSE request (FUSE_INIT), qemu-storage-daemon calls
fuse_uring_start() to complete initialization, ultimately submitting
the SQE with the FUSE_IO_URING_CMD_REGISTER command to confirm
successful initialization with the kernel.
Suggested-by: Kevin Wolf <kwolf@redhat.com>
Suggested-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Brian Song <hibriansong@gmail.com>
---
block/export/fuse.c | 161 ++++++++++++++++++++++++---
docs/tools/qemu-storage-daemon.rst | 11 +-
qapi/block-export.json | 5 +-
storage-daemon/qemu-storage-daemon.c | 1 +
util/fdmon-io_uring.c | 5 +-
5 files changed, 159 insertions(+), 24 deletions(-)
diff --git a/block/export/fuse.c b/block/export/fuse.c
index c0ad4696ce..59fa79f486 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -48,6 +48,11 @@
#include <linux/fs.h>
#endif
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/* room needed in buffer to accommodate header */
+#define FUSE_BUFFER_HEADER_SIZE 0x1000
+
/* Prevent overly long bounce buffer allocations */
#define FUSE_MAX_READ_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 1 * 1024 * 1024))
/*
@@ -63,12 +68,31 @@
(FUSE_MAX_WRITE_BYTES - FUSE_IN_PLACE_WRITE_BYTES)
typedef struct FuseExport FuseExport;
+typedef struct FuseQueue FuseQueue;
+
+typedef struct FuseRingEnt {
+ /*
+  * One FUSE-over-io_uring ring entry; each FuseQueue embeds one as its
+  * 'ent' member.
+  *
+  * Back pointer to the queue that embeds this entry (set in
+  * fuse_uring_start()).
+  */
+ FuseQueue *q;
+
+ /* commit id of a fuse request */
+ uint64_t req_commit_id;
+
+ /*
+  * fuse request header and payload. op_payload points to a buffer of
+  * req_payload_sz bytes allocated with g_malloc0() in fuse_uring_start().
+  */
+ struct fuse_uring_req_header req_header;
+ void *op_payload;
+ size_t req_payload_sz;
+
+ /*
+  * The vector passed to the kernel: iov[0] covers req_header, iov[1]
+  * covers op_payload. Its address is stored in the REGISTER SQE.
+  */
+ struct iovec iov[2];
+
+ CqeHandler fuse_cqe_handler; /* handed to aio_add_sqe() with the SQE */
+} FuseRingEnt;
/*
* One FUSE "queue", representing one FUSE FD from which requests are fetched
* and processed. Each queue is tied to an AioContext.
*/
-typedef struct FuseQueue {
+struct FuseQueue {
FuseExport *exp;
AioContext *ctx;
@@ -109,7 +133,12 @@ typedef struct FuseQueue {
* Free this buffer with qemu_vfree().
*/
void *spillover_buf;
-} FuseQueue;
+
+#ifdef CONFIG_LINUX_IO_URING
+ int qid;
+ FuseRingEnt ent;
+#endif
+};
/*
* Verify that FuseQueue.request_buf plus the spill-over buffer together
@@ -148,6 +177,7 @@ struct FuseExport {
bool growable;
/* Whether allow_other was used as a mount option or not */
bool allow_other;
+ bool is_uring;
mode_t st_mode;
uid_t st_uid;
@@ -257,6 +287,93 @@ static const BlockDevOps fuse_export_blk_dev_ops = {
.drained_poll = fuse_export_drained_poll,
};
+#ifdef CONFIG_LINUX_IO_URING
+
+/*
+ * Fill in the fuse_uring_cmd_req carried in an SQE's command area.
+ *
+ * @req: command request embedded in the SQE
+ * @qid: id of the queue the SQE belongs to
+ * @commit_id: commit id of the FUSE request; taken as a full 64-bit value
+ *             (matching FuseRingEnt.req_commit_id) so it is never truncated
+ *
+ * The request flags are always cleared.
+ */
+static void fuse_uring_sqe_set_req_data(struct fuse_uring_cmd_req *req,
+                                        const unsigned int qid,
+                                        const uint64_t commit_id)
+{
+    req->qid = qid;
+    req->commit_id = commit_id;
+    req->flags = 0;
+}
+
+/*
+ * Common SQE setup for FUSE-over-io_uring commands: an IORING_OP_URING_CMD
+ * aimed at the queue's FUSE fd, carrying the given command opcode.
+ *
+ * @sqe: submission queue entry to fill in
+ * @q: queue whose fuse_fd the command targets
+ * @cmd_op: FUSE io_uring command opcode
+ */
+static void fuse_uring_sqe_prepare(struct io_uring_sqe *sqe, FuseQueue *q,
+                                   __u32 cmd_op)
+{
+    /* What to run and on which file descriptor */
+    sqe->fd = q->fuse_fd;
+    sqe->opcode = IORING_OP_URING_CMD;
+
+    /* Fields this command does not use */
+    sqe->ioprio = 0;
+    sqe->rw_flags = 0;
+
+    /*
+     * NOTE(review): in the kernel's struct io_uring_sqe, 'off' appears to
+     * share storage with cmd_op/__pad1, so it must be cleared before cmd_op
+     * is stored -- confirm against the uapi header before reordering.
+     */
+    sqe->off = 0;
+    sqe->cmd_op = cmd_op;
+    sqe->__pad1 = 0;
+}
+
+/*
+ * SQE preparation callback (passed to aio_add_sqe()) that builds the
+ * FUSE_IO_URING_CMD_REGISTER command for a queue's ring entry.
+ *
+ * @sqe: submission queue entry to fill in
+ * @opaque: the FuseQueue whose ring entry is being registered
+ */
+static void fuse_uring_prep_sqe_register(struct io_uring_sqe *sqe, void *opaque)
+{
+    FuseQueue *q = opaque;
+    struct fuse_uring_cmd_req *req = (void *)&sqe->cmd[0];
+
+    fuse_uring_sqe_prepare(sqe, q, FUSE_IO_URING_CMD_REGISTER);
+
+    /*
+     * Convert via uintptr_t: casting a pointer directly to uint64_t is a
+     * pointer-to-integer size mismatch on 32-bit hosts.
+     */
+    sqe->addr = (uint64_t)(uintptr_t)q->ent.iov;
+    /* Number of iovec entries in q->ent.iov (header + payload) */
+    sqe->len = 2;
+
+    /* Registration passes a commit_id of 0 */
+    fuse_uring_sqe_set_req_data(req, q->qid, 0);
+}
+
+/*
+ * Bottom half scheduled by fuse_uring_start() in the queue's AioContext:
+ * queues the REGISTER SQE for this queue's ring entry, with the entry's
+ * CQE handler attached.
+ *
+ * @opaque: the FuseQueue to register
+ */
+static void fuse_uring_submit_register(void *opaque)
+{
+    FuseQueue *q = opaque;
+
+    aio_add_sqe(fuse_uring_prep_sqe_register, q, &q->ent.fuse_cqe_handler);
+}
+
+/*
+ * Initialize the ring entry of every queue of @exp and schedule its
+ * registration (fuse_uring_submit_register()) in the queue's AioContext.
+ *
+ * @exp: export whose queues are set up
+ * @out: FUSE_INIT reply; its flags and max_write determine the buffer size
+ */
+static void fuse_uring_start(FuseExport *exp, struct fuse_init_out *out)
+{
+    size_t ring_buf_len;
+
+    if (out->flags & FUSE_MAX_PAGES) {
+        ring_buf_len = out->max_write + FUSE_BUFFER_HEADER_SIZE;
+    } else {
+        /*
+         * FUSE_MAX_PAGES is not set in the reply: size the buffer for
+         * FUSE_DEFAULT_MAX_PAGES_PER_REQ host pages plus the header room.
+         */
+        ring_buf_len = FUSE_DEFAULT_MAX_PAGES_PER_REQ *
+                       qemu_real_host_page_size() + FUSE_BUFFER_HEADER_SIZE;
+    }
+
+    for (int idx = 0; idx < exp->num_queues; idx++) {
+        FuseQueue *fq = &exp->queues[idx];
+        FuseRingEnt *entry = &fq->ent;
+        size_t payload_len = ring_buf_len - FUSE_BUFFER_HEADER_SIZE;
+
+        entry->q = fq;
+
+        /* Zero-initialized payload buffer, without the header room */
+        entry->req_payload_sz = payload_len;
+        entry->op_payload = g_malloc0(payload_len);
+
+        /* iov[0] describes the request header, iov[1] the payload */
+        entry->iov[0].iov_base = &entry->req_header;
+        entry->iov[0].iov_len = sizeof(entry->req_header);
+        entry->iov[1].iov_base = entry->op_payload;
+        entry->iov[1].iov_len = payload_len;
+
+        entry->fuse_cqe_handler.cb = fuse_uring_cqe_handler;
+
+        /* The REGISTER SQE is submitted from the queue's own AioContext */
+        aio_bh_schedule_oneshot(fq->ctx, fuse_uring_submit_register, fq);
+    }
+}
+#endif
+
static int fuse_export_create(BlockExport *blk_exp,
BlockExportOptions *blk_exp_args,
AioContext *const *multithread,
@@ -280,6 +397,9 @@ static int fuse_export_create(BlockExport *blk_exp,
for (size_t i = 0; i < mt_count; i++) {
exp->queues[i] = (FuseQueue) {
+#ifdef CONFIG_LINUX_IO_URING
+ .qid = i,
+#endif
.exp = exp,
.ctx = multithread[i],
.fuse_fd = -1,
@@ -293,6 +413,9 @@ static int fuse_export_create(BlockExport *blk_exp,
exp->num_queues = 1;
exp->queues = g_new(FuseQueue, 1);
exp->queues[0] = (FuseQueue) {
+#ifdef CONFIG_LINUX_IO_URING
+ .qid = 0,
+#endif
.exp = exp,
.ctx = exp->common.ctx,
.fuse_fd = -1,
@@ -312,6 +435,8 @@ static int fuse_export_create(BlockExport *blk_exp,
}
}
+ exp->is_uring = args->io_uring ? true : false;
+
blk_set_dev_ops(exp->common.blk, &fuse_export_blk_dev_ops, exp);
/*
@@ -687,15 +812,22 @@ static ssize_t coroutine_fn
fuse_co_init(FuseExport *exp, struct fuse_init_out *out,
uint32_t max_readahead, uint32_t flags)
{
- const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO;
+ const uint32_t supported_flags = FUSE_ASYNC_READ | FUSE_ASYNC_DIO
+ | FUSE_INIT_EXT;
+ uint64_t outargflags = flags;
+
+#ifdef CONFIG_LINUX_IO_URING
+ if (exp->is_uring)
+ outargflags |= FUSE_OVER_IO_URING;
+#endif
*out = (struct fuse_init_out) {
.major = FUSE_KERNEL_VERSION,
.minor = FUSE_KERNEL_MINOR_VERSION,
.max_readahead = max_readahead,
.max_write = FUSE_MAX_WRITE_BYTES,
- .flags = flags & supported_flags,
- .flags2 = 0,
+ .flags = outargflags & supported_flags,
+ .flags2 = outargflags >> 32,
/* libfuse maximum: 2^16 - 1 */
.max_background = UINT16_MAX,
@@ -1393,22 +1525,17 @@ fuse_co_process_request(FuseQueue *q, void *spillover_buf)
struct fuse_out_header *out_hdr = (struct fuse_out_header *)out_buf;
/* For read requests: Data to be returned */
void *out_data_buffer = NULL;
- ssize_t ret;
- /* Limit scope to ensure pointer is no longer used after yielding */
- {
- const struct fuse_in_header *in_hdr =
- (const struct fuse_in_header *)q->request_buf;
-
- opcode = in_hdr->opcode;
- req_id = in_hdr->unique;
- }
+ bool is_uring = exp->is_uring;
switch (opcode) {
case FUSE_INIT: {
- const struct fuse_init_in *in = FUSE_IN_OP_STRUCT(init, q);
- ret = fuse_co_init(exp, FUSE_OUT_OP_STRUCT(init, out_buf),
- in->max_readahead, in->flags);
+#ifdef CONFIG_LINUX_IO_URING
+ /* FUSE-over-io_uring enabled && start from the tradition path */
+ if (is_uring) {
+ fuse_uring_start(exp, out);
+ }
+#endif
break;
}
diff --git a/docs/tools/qemu-storage-daemon.rst b/docs/tools/qemu-storage-daemon.rst
index 35ab2d7807..c5076101e0 100644
--- a/docs/tools/qemu-storage-daemon.rst
+++ b/docs/tools/qemu-storage-daemon.rst
@@ -78,7 +78,7 @@ Standard options:
.. option:: --export [type=]nbd,id=<id>,node-name=<node-name>[,name=<export-name>][,writable=on|off][,bitmap=<name>]
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=unix,addr.path=<socket-path>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
--export [type=]vhost-user-blk,id=<id>,node-name=<node-name>,addr.type=fd,addr.str=<fd>[,writable=on|off][,logical-block-size=<block-size>][,num-queues=<num-queues>]
- --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto]
+ --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>[,growable=on|off][,writable=on|off][,allow-other=on|off|auto][,io-uring=on|off]
--export [type=]vduse-blk,id=<id>,node-name=<node-name>,name=<vduse-name>[,writable=on|off][,num-queues=<num-queues>][,queue-size=<queue-size>][,logical-block-size=<block-size>][,serial=<serial-number>]
is a block export definition. ``node-name`` is the block node that should be
@@ -111,10 +111,11 @@ Standard options:
that enabling this option as a non-root user requires enabling the
user_allow_other option in the global fuse.conf configuration file. Setting
``allow-other`` to auto (the default) will try enabling this option, and on
- error fall back to disabling it.
-
- The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
- to create the VDUSE device.
+ error fall back to disabling it.
+
+ When ``io-uring`` is enabled (it is off by default), the export is set up to
+ use FUSE-over-io_uring, handling FUSE operations through io_uring instead of
+ the traditional /dev/fuse communication mechanism.
+
+ The ``vduse-blk`` export type takes a ``name`` (must be unique across the host)
+ to create the VDUSE device.
``num-queues`` sets the number of virtqueues (the default is 1).
``queue-size`` sets the virtqueue descriptor table size (the default is 256).
diff --git a/qapi/block-export.json b/qapi/block-export.json
index 9ae703ad01..37f2fc47e2 100644
--- a/qapi/block-export.json
+++ b/qapi/block-export.json
@@ -184,12 +184,15 @@
# mount the export with allow_other, and if that fails, try again
# without. (since 6.1; default: auto)
#
+# @io-uring: Use FUSE-over-io_uring. (since 10.2; default: false)
+#
# Since: 6.0
##
{ 'struct': 'BlockExportOptionsFuse',
'data': { 'mountpoint': 'str',
'*growable': 'bool',
- '*allow-other': 'FuseExportAllowOther' },
+ '*allow-other': 'FuseExportAllowOther',
+ '*io-uring': 'bool' },
'if': 'CONFIG_FUSE' }
##
diff --git a/storage-daemon/qemu-storage-daemon.c b/storage-daemon/qemu-storage-daemon.c
index eb72561358..0cd4cd2b58 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -107,6 +107,7 @@ static void help(void)
#ifdef CONFIG_FUSE
" --export [type=]fuse,id=<id>,node-name=<node-name>,mountpoint=<file>\n"
" [,growable=on|off][,writable=on|off][,allow-other=on|off|auto]\n"
+" [,io-uring=on|off]"
" export the specified block node over FUSE\n"
"\n"
#endif /* CONFIG_FUSE */
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
index d2433d1d99..68d3fe8e01 100644
--- a/util/fdmon-io_uring.c
+++ b/util/fdmon-io_uring.c
@@ -452,10 +452,13 @@ static const FDMonOps fdmon_io_uring_ops = {
void fdmon_io_uring_setup(AioContext *ctx, Error **errp)
{
int ret;
+ int flags;
ctx->io_uring_fd_tag = NULL;
+ flags = IORING_SETUP_SQE128;
- ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
+ ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES,
+ &ctx->fdmon_io_uring, flags);
if (ret != 0) {
error_setg_errno(errp, -ret, "Failed to initialize io_uring");
return;
--
2.45.2
next prev parent reply other threads:[~2025-08-15 3:48 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-15 3:46 [RFC PATCH 0/3] block/export: Add FUSE-over-io_uring for Storage Exports Zhi Song
2025-08-15 3:46 ` Zhi Song [this message]
2025-08-16 23:13 ` [PATCH 1/3] fuse: add FUSE-over-io_uring enable opt and init Brian Song
2025-08-17 13:42 ` Stefan Hajnoczi
2025-08-18 23:04 ` Bernd Schubert
2025-08-19 1:15 ` Brian Song
2025-08-19 22:26 ` Bernd Schubert
2025-08-19 23:23 ` Brian Song
2025-08-20 3:31 ` Brian Song
2025-08-15 3:46 ` [PATCH 2/3] fuse: Handle FUSE-uring requests Zhi Song
2025-08-15 3:46 ` [PATCH 3/3] fuse: Safe termination for FUSE-uring Zhi Song
2025-08-17 13:45 ` [RFC PATCH 0/3] block/export: Add FUSE-over-io_uring for Storage Exports Stefan Hajnoczi
2025-08-18 22:54 ` Bernd Schubert
2025-08-21 1:32 ` Brian Song
2025-08-21 14:20 ` Stefan Hajnoczi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250815034619.51980-2-hizhisong@gmail.com \
--to=hibriansong@gmail.com \
--cc=armbru@redhat.com \
--cc=bernd@bsbernd.com \
--cc=fam@euphon.net \
--cc=hreitz@redhat.com \
--cc=kwolf@redhat.com \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).