From: Joanne Koong <joannelkoong@gmail.com>
To: miklos@szeredi.hu
Cc: bernd@bsbernd.com, axboe@kernel.dk, linux-fsdevel@vger.kernel.org
Subject: [PATCH v2 11/14] fuse: add pinned headers capability for io-uring buffer rings
Date: Thu, 2 Apr 2026 09:28:37 -0700 [thread overview]
Message-ID: <20260402162840.2989717-12-joannelkoong@gmail.com> (raw)
In-Reply-To: <20260402162840.2989717-1-joannelkoong@gmail.com>
Allow fuse servers to pin their header buffers by setting the
FUSE_URING_PINNED_HEADERS flag alongside FUSE_URING_BUFRING on REGISTER
sqes. When set, the kernel pins the header pages, vmaps them for a
kernel virtual address, and uses direct memcpy for copying. This avoids
the per-request overhead of having to pin/unpin user pages and translate
virtual addresses.
Buffers must be page-aligned. The kernel accounts pinned pages against
RLIMIT_MEMLOCK (bypassed with CAP_IPC_LOCK) and tracks mm->pinned_vm.
Unpinning is done in process context during connection abort, since vunmap
cannot run in softirq (where final destruction occurs via RCU).
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/dev_uring.c | 228 ++++++++++++++++++++++++++++++++++++--
fs/fuse/dev_uring_i.h | 23 +++-
include/uapi/linux/fuse.h | 2 +
3 files changed, 243 insertions(+), 10 deletions(-)
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 9f14a2bcde3f..79736b02cf9f 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/io_uring/cmd.h>
+#include <linux/vmalloc.h>
static bool __read_mostly enable_uring;
module_param(enable_uring, bool, 0644);
@@ -46,6 +47,11 @@ static inline bool bufring_enabled(struct fuse_ring_queue *queue)
return queue->bufring != NULL;
}
+static inline bool bufring_pinned_headers(struct fuse_ring_queue *queue)
+{
+ return queue->bufring->use_pinned_headers;
+}
+
static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
struct fuse_ring_ent *ring_ent)
{
@@ -200,6 +206,37 @@ bool fuse_uring_request_expired(struct fuse_conn *fc)
return false;
}
+static void fuse_bufring_unpin_mem(struct fuse_bufring_pinned *mem)
+{
+ struct page **pages = mem->pages;
+ unsigned int nr_pages = mem->nr_pages;
+ struct user_struct *user = mem->user;
+ struct mm_struct *mm_account = mem->mm_account;
+
+ vunmap(mem->addr);
+ unpin_user_pages(pages, nr_pages);
+
+ if (user) {
+ atomic_long_sub(nr_pages, &user->locked_vm);
+ free_uid(user);
+ }
+
+ atomic64_sub(nr_pages, &mm_account->pinned_vm);
+ mmdrop(mm_account);
+
+ kvfree(mem->pages);
+}
+
+static void fuse_uring_bufring_unpin(struct fuse_ring_queue *queue)
+{
+ struct fuse_bufring *br = queue->bufring;
+
+ if (bufring_pinned_headers(queue)) {
+ fuse_bufring_unpin_mem(&br->pinned_headers);
+ br->use_pinned_headers = false;
+ }
+}
+
void fuse_uring_destruct(struct fuse_conn *fc)
{
struct fuse_ring *ring = fc->ring;
@@ -227,7 +264,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
}
kfree(queue->fpq.processing);
- kfree(queue->bufring);
+ if (bufring_enabled(queue)) {
+ fuse_uring_bufring_unpin(queue);
+ kfree(queue->bufring);
+ }
kfree(queue);
ring->queues[qid] = NULL;
}
@@ -309,14 +349,131 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
return 0;
}
+static struct page **fuse_uring_pin_user_pages(void __user *uaddr,
+ unsigned long len, int *npages)
+{
+ unsigned long addr = (unsigned long)uaddr;
+ unsigned long start, end, nr_pages;
+ struct page **pages;
+ int pinned;
+
+ if (check_add_overflow(addr, len, &end))
+ return ERR_PTR(-EOVERFLOW);
+ if (check_add_overflow(end, PAGE_SIZE - 1, &end))
+ return ERR_PTR(-EOVERFLOW);
+
+ end = end >> PAGE_SHIFT;
+ start = addr >> PAGE_SHIFT;
+ nr_pages = end - start;
+ if (WARN_ON_ONCE(!nr_pages))
+ return ERR_PTR(-EINVAL);
+ if (WARN_ON_ONCE(nr_pages > INT_MAX))
+ return ERR_PTR(-EOVERFLOW);
+
+ pages = kvmalloc_objs(struct page *, nr_pages, GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ pinned = pin_user_pages_fast(addr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ /* success, mapped all pages */
+ if (pinned == nr_pages) {
+ *npages = nr_pages;
+ return pages;
+ }
+
+ /* remove any partial pins */
+ if (pinned > 0)
+ unpin_user_pages(pages, pinned);
+
+ kvfree(pages);
+
+ return ERR_PTR(pinned < 0 ? pinned : -EFAULT);
+}
+
+static int account_pinned_pages(struct fuse_bufring_pinned *mem,
+ struct page **pages, unsigned int nr_pages)
+{
+ unsigned long page_limit, cur_pages, new_pages;
+ struct user_struct *user = current_user();
+
+ if (!nr_pages)
+ return 0;
+
+ if (!capable(CAP_IPC_LOCK)) {
+ /* Don't allow more pages than we can safely lock */
+ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ cur_pages = atomic_long_read(&user->locked_vm);
+ do {
+ new_pages = cur_pages + nr_pages;
+ if (new_pages > page_limit)
+ return -ENOMEM;
+ } while (!atomic_long_try_cmpxchg(&user->locked_vm,
+ &cur_pages, new_pages));
+
+ mem->user = get_uid(current_user());
+ }
+
+ atomic64_add(nr_pages, &current->mm->pinned_vm);
+ mmgrab(current->mm);
+ mem->mm_account = current->mm;
+
+ return 0;
+}
+
+static int fuse_bufring_pin_mem(struct fuse_bufring_pinned *mem,
+ void __user *addr, size_t len)
+{
+ struct page **pages = NULL;
+ int nr_pages;
+ int err;
+
+ if (!PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ pages = fuse_uring_pin_user_pages(addr, len, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ err = account_pinned_pages(mem, pages, nr_pages);
+ if (err)
+ goto unpin;
+
+ mem->addr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!mem->addr) {
+ err = -ENOMEM;
+ goto unaccount;
+ }
+
+ mem->pages = pages;
+ mem->nr_pages = nr_pages;
+
+ return 0;
+
+unaccount:
+ if (mem->user) {
+ atomic_long_sub(nr_pages, &mem->user->locked_vm);
+ free_uid(mem->user);
+ }
+ atomic64_sub(nr_pages, &current->mm->pinned_vm);
+ mmdrop(mem->mm_account);
+unpin:
+ unpin_user_pages(pages, nr_pages);
+ kvfree(pages);
+ return err;
+}
+
static int fuse_uring_bufring_setup(struct io_uring_cmd *cmd,
- struct fuse_ring_queue *queue)
+ struct fuse_ring_queue *queue,
+ u64 init_flags)
{
const struct fuse_uring_cmd_req *cmd_req =
io_uring_sqe128_cmd(cmd->sqe, struct fuse_uring_cmd_req);
u16 queue_depth = READ_ONCE(cmd_req->init.queue_depth);
unsigned int buf_size = READ_ONCE(cmd_req->init.buf_size);
struct iovec iov[FUSE_URING_IOV_SEGS];
+ bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
void __user *payload, *headers;
size_t headers_size, payload_size, ring_size;
struct fuse_bufring *br;
@@ -354,7 +511,17 @@ static int fuse_uring_bufring_setup(struct io_uring_cmd *cmd,
return -ENOMEM;
br->queue_depth = queue_depth;
- br->headers = headers;
+ if (pinned_headers) {
+ err = fuse_bufring_pin_mem(&br->pinned_headers, headers,
+ headers_size);
+ if (err) {
+ kfree(br);
+ return err;
+ }
+ br->use_pinned_headers = true;
+ } else {
+ br->headers = headers;
+ }
payload_addr = (uintptr_t)payload;
@@ -385,8 +552,15 @@ static bool queue_init_flags_consistent(struct fuse_ring_queue *queue,
u64 init_flags)
{
bool bufring = init_flags & FUSE_URING_BUFRING;
+ bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
+
+ if (bufring_enabled(queue) != bufring)
+ return false;
+
+ if (!bufring)
+ return true;
- return bufring_enabled(queue) == bufring;
+ return bufring_pinned_headers(queue) == pinned_headers;
}
static struct fuse_ring_queue *
@@ -423,7 +597,7 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
fuse_pqueue_init(&queue->fpq);
if (use_bufring) {
- int err = fuse_uring_bufring_setup(cmd, queue);
+ int err = fuse_uring_bufring_setup(cmd, queue, init_flags);
if (err) {
kfree(pq);
@@ -437,8 +611,10 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
if (ring->queues[qid]) {
spin_unlock(&fc->lock);
kfree(queue->fpq.processing);
- if (use_bufring)
+ if (use_bufring) {
+ fuse_uring_bufring_unpin(queue);
kfree(queue->bufring);
+ }
kfree(queue);
queue = ring->queues[qid];
@@ -605,6 +781,25 @@ static void fuse_uring_async_stop_queues(struct work_struct *work)
}
}
+static void fuse_uring_unpin_queues(struct fuse_ring *ring)
+{
+ int qid;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
+ struct fuse_bufring *br;
+
+ if (!queue)
+ continue;
+
+ br = queue->bufring;
+ if (!br)
+ continue;
+
+ fuse_uring_bufring_unpin(queue);
+ }
+}
+
/*
* Stop the ring queues
*/
@@ -643,6 +838,9 @@ void fuse_uring_abort(struct fuse_conn *fc)
fuse_uring_abort_end_requests(ring);
fuse_uring_stop_queues(ring);
}
+
+ /* unpin while in process context - can't do this in softirq */
+ fuse_uring_unpin_queues(ring);
}
/*
@@ -758,6 +956,11 @@ static int copy_header_to_ring(struct fuse_ring_ent *ent,
int buf_offset = offset +
sizeof(struct fuse_uring_req_header) * ent->id;
+ if (bufring_pinned_headers(ent->queue)) {
+ memcpy(ent->queue->bufring->pinned_headers.addr + buf_offset,
+ header, header_size);
+ return 0;
+ }
ring = ent->queue->bufring->headers + buf_offset;
} else {
ring = (void __user *)ent->headers + offset;
@@ -785,6 +988,11 @@ static int copy_header_from_ring(struct fuse_ring_ent *ent,
int buf_offset = offset +
sizeof(struct fuse_uring_req_header) * ent->id;
+ if (bufring_pinned_headers(ent->queue)) {
+ memcpy(header, ent->queue->bufring->pinned_headers.addr + buf_offset,
+ header_size);
+ return 0;
+ }
ring = ent->queue->bufring->headers + buf_offset;
} else {
ring = (void __user *)ent->headers + offset;
@@ -1399,7 +1607,13 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
static bool init_flags_valid(u64 init_flags)
{
- u64 valid_flags = FUSE_URING_BUFRING;
+ u64 valid_flags =
+ FUSE_URING_BUFRING | FUSE_URING_PINNED_HEADERS;
+ bool bufring = init_flags & FUSE_URING_BUFRING;
+ bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
+
+ if (pinned_headers && !bufring)
+ return false;
return !(init_flags & ~valid_flags);
}
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 66d5d5f8dc3f..05c0f061a882 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -42,12 +42,29 @@ struct fuse_bufring_buf {
unsigned int id;
};
-struct fuse_bufring {
- /* pointer to the headers buffer */
- void __user *headers;
+struct fuse_bufring_pinned {
+ void *addr;
+ struct page **pages;
+ unsigned int nr_pages;
+
+ /*
+ * need to track this so we can unpin / unaccount pages during teardown
+ * when not running in the server's task context
+ */
+ struct user_struct *user;
+ struct mm_struct *mm_account;
+};
+struct fuse_bufring {
+ bool use_pinned_headers: 1;
unsigned int queue_depth;
+ union {
+ /* pointer to the headers buffer */
+ void __user *headers;
+ struct fuse_bufring_pinned pinned_headers;
+ };
+
/* metadata tracking state of the bufring */
unsigned int nbufs;
unsigned int head;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 8753de7eb189..e57244c03d42 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -244,6 +244,7 @@
* 7.46
* - add FUSE_URING_BUFRING flag
* - add fuse_uring_cmd_req init struct
+ * - add FUSE_URING_PINNED_HEADERS flag
*/
#ifndef _LINUX_FUSE_H
@@ -1306,6 +1307,7 @@ enum fuse_uring_cmd {
/* fuse_uring_cmd_req flags */
#define FUSE_URING_BUFRING (1 << 0)
+#define FUSE_URING_PINNED_HEADERS (1 << 1)
/**
* In the 80B command area of the SQE.
--
2.52.0
next prev parent reply other threads:[~2026-04-02 16:30 UTC|newest]
Thread overview: 49+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-02 16:28 [PATCH v2 00/14] fuse: add io-uring buffer rings and zero-copy Joanne Koong
2026-04-02 16:28 ` [PATCH v2 01/14] fuse: separate next request fetching from sending logic Joanne Koong
2026-04-29 11:52 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 02/14] fuse: refactor io-uring header copying to ring Joanne Koong
2026-04-29 12:05 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 03/14] fuse: refactor io-uring header copying from ring Joanne Koong
2026-04-29 12:06 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 04/14] fuse: use enum types for header copying Joanne Koong
2026-04-30 8:04 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 05/14] fuse: refactor setting up copy state for payload copying Joanne Koong
2026-04-30 8:06 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 06/14] fuse: support buffer copying for kernel addresses Joanne Koong
2026-04-30 8:19 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 07/14] fuse: use named constants for io-uring iovec indices Joanne Koong
2026-04-15 9:36 ` Bernd Schubert
2026-04-30 8:20 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 08/14] fuse: move fuse_uring_abort() from header to dev_uring.c Joanne Koong
2026-04-15 9:40 ` Bernd Schubert
2026-04-30 8:21 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 09/14] fuse: rearrange io-uring iovec and ent allocation logic Joanne Koong
2026-04-15 9:45 ` Bernd Schubert
2026-04-30 8:24 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 10/14] fuse: add io-uring buffer rings Joanne Koong
2026-04-15 9:48 ` Bernd Schubert
2026-04-15 21:40 ` Joanne Koong
2026-04-30 11:08 ` Jeff Layton
2026-04-30 12:44 ` Joanne Koong
2026-05-05 22:47 ` Bernd Schubert
2026-04-02 16:28 ` Joanne Koong [this message]
2026-04-14 12:47 ` [PATCH v2 11/14] fuse: add pinned headers capability for " Bernd Schubert
2026-04-15 0:48 ` Joanne Koong
2026-05-05 22:51 ` Bernd Schubert
2026-04-30 11:22 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 12/14] fuse: add pinned payload buffers " Joanne Koong
2026-04-30 11:29 ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 13/14] fuse: add zero-copy over io-uring Joanne Koong
2026-04-30 11:42 ` Jeff Layton
2026-04-30 12:35 ` Joanne Koong
2026-04-30 12:55 ` Jeff Layton
2026-05-05 22:55 ` Bernd Schubert
2026-04-30 12:56 ` Jeff Layton
2026-05-05 23:45 ` Bernd Schubert
2026-04-02 16:28 ` [PATCH v2 14/14] docs: fuse: add io-uring bufring and zero-copy documentation Joanne Koong
2026-04-14 21:05 ` Bernd Schubert
2026-04-15 1:10 ` Joanne Koong
2026-04-15 10:55 ` Bernd Schubert
2026-04-15 22:40 ` Joanne Koong
2026-04-30 12:57 ` Jeff Layton
2026-04-30 12:59 ` [PATCH v2 00/14] fuse: add io-uring buffer rings and zero-copy Jeff Layton
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260402162840.2989717-12-joannelkoong@gmail.com \
--to=joannelkoong@gmail.com \
--cc=axboe@kernel.dk \
--cc=bernd@bsbernd.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=miklos@szeredi.hu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox