From: Jeff Layton <jlayton@kernel.org>
To: Joanne Koong <joannelkoong@gmail.com>, miklos@szeredi.hu
Cc: bernd@bsbernd.com, axboe@kernel.dk, linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH v2 11/14] fuse: add pinned headers capability for io-uring buffer rings
Date: Thu, 30 Apr 2026 12:22:32 +0100
Message-ID: <2bd55e996401939a75a6d03d6608198dc1d4fc53.camel@kernel.org>
In-Reply-To: <20260402162840.2989717-12-joannelkoong@gmail.com>
On Thu, 2026-04-02 at 09:28 -0700, Joanne Koong wrote:
> Allow fuse servers to pin their header buffers by setting the
> FUSE_URING_PINNED_HEADERS flag alongside FUSE_URING_BUFRING on REGISTER
> sqes. When set, the kernel pins the header pages, vmaps them to obtain a
> kernel virtual address, and copies headers with a direct memcpy. This
> avoids the per-request overhead of pinning/unpinning user pages and
> translating virtual addresses.
>
> Buffers must be page-aligned. The kernel accounts pinned pages against
> RLIMIT_MEMLOCK (bypassed with CAP_IPC_LOCK) and tracks mm->pinned_vm.
> Unpinning is done in process context during connection abort, since
> vunmap cannot be called from softirq context (where final destruction
> occurs via RCU).
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
> fs/fuse/dev_uring.c | 228 ++++++++++++++++++++++++++++++++++++--
> fs/fuse/dev_uring_i.h | 23 +++-
> include/uapi/linux/fuse.h | 2 +
> 3 files changed, 243 insertions(+), 10 deletions(-)
>
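Thanks for the detailed changelog. For my own understanding (and for anyone
wiring up the server side), the userspace contract I read out of this is
roughly the sketch below -- it only shows the page-aligned header
allocation, an RLIMIT_MEMLOCK sanity check, and the flag combination. The
actual REGISTER sqe/iovec layout comes from patch 10 of this series, and
the header slot size here is just a stand-in for
sizeof(struct fuse_uring_req_header), so treat this as illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>

/* values from the uapi hunk in this patch */
#define FUSE_URING_BUFRING		(1 << 0)
#define FUSE_URING_PINNED_HEADERS	(1 << 1)

/* stand-in for sizeof(struct fuse_uring_req_header) */
#define HEADER_SLOT_SIZE	256

int main(void)
{
	unsigned int queue_depth = 64;
	long page_size = sysconf(_SC_PAGESIZE);
	size_t headers_size = (size_t)queue_depth * HEADER_SLOT_SIZE;
	struct rlimit rl;
	void *headers;
	uint64_t flags;

	/* round up to whole pages; the kernel requires page alignment */
	headers_size = (headers_size + page_size - 1) &
		       ~(size_t)(page_size - 1);

	/*
	 * Without CAP_IPC_LOCK the pinned pages are charged against
	 * RLIMIT_MEMLOCK, so check the limit covers the header area.
	 */
	if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0 &&
	    rl.rlim_cur != RLIM_INFINITY &&
	    rl.rlim_cur < headers_size) {
		fprintf(stderr, "RLIMIT_MEMLOCK too low for pinned headers\n");
		return 1;
	}

	/* anonymous mmap is page aligned, satisfying the kernel check */
	headers = mmap(NULL, headers_size, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (headers == MAP_FAILED)
		return 1;

	/* passed in the REGISTER sqe alongside the headers iovec */
	flags = FUSE_URING_BUFRING | FUSE_URING_PINNED_HEADERS;
	(void)flags;
	(void)headers;
	return 0;
}

If that matches your intent, feel free to ignore -- just making sure the
userspace-visible requirements (page alignment plus memlock accounting)
are what I think they are.
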
> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> index 9f14a2bcde3f..79736b02cf9f 100644
> --- a/fs/fuse/dev_uring.c
> +++ b/fs/fuse/dev_uring.c
> @@ -11,6 +11,7 @@
>
> #include <linux/fs.h>
> #include <linux/io_uring/cmd.h>
> +#include <linux/vmalloc.h>
>
> static bool __read_mostly enable_uring;
> module_param(enable_uring, bool, 0644);
> @@ -46,6 +47,11 @@ static inline bool bufring_enabled(struct fuse_ring_queue *queue)
> return queue->bufring != NULL;
> }
>
> +static inline bool bufring_pinned_headers(struct fuse_ring_queue *queue)
> +{
> + return queue->bufring->use_pinned_headers;
> +}
> +
> static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
> struct fuse_ring_ent *ring_ent)
> {
> @@ -200,6 +206,37 @@ bool fuse_uring_request_expired(struct fuse_conn *fc)
> return false;
> }
>
> +static void fuse_bufring_unpin_mem(struct fuse_bufring_pinned *mem)
> +{
> + struct page **pages = mem->pages;
> + unsigned int nr_pages = mem->nr_pages;
> + struct user_struct *user = mem->user;
> + struct mm_struct *mm_account = mem->mm_account;
> +
> + vunmap(mem->addr);
> + unpin_user_pages(pages, nr_pages);
> +
> + if (user) {
> + atomic_long_sub(nr_pages, &user->locked_vm);
> + free_uid(user);
> + }
> +
> + atomic64_sub(nr_pages, &mm_account->pinned_vm);
> + mmdrop(mm_account);
> +
> + kvfree(mem->pages);
> +}
> +
> +static void fuse_uring_bufring_unpin(struct fuse_ring_queue *queue)
> +{
> + struct fuse_bufring *br = queue->bufring;
> +
> + if (bufring_pinned_headers(queue)) {
> + fuse_bufring_unpin_mem(&br->pinned_headers);
> + br->use_pinned_headers = false;
> + }
> +}
> +
> void fuse_uring_destruct(struct fuse_conn *fc)
> {
> struct fuse_ring *ring = fc->ring;
> @@ -227,7 +264,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
> }
>
> kfree(queue->fpq.processing);
> - kfree(queue->bufring);
> + if (bufring_enabled(queue)) {
> + fuse_uring_bufring_unpin(queue);
> + kfree(queue->bufring);
> + }
> kfree(queue);
> ring->queues[qid] = NULL;
> }
> @@ -309,14 +349,131 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
> return 0;
> }
>
> +static struct page **fuse_uring_pin_user_pages(void __user *uaddr,
> + unsigned long len, int *npages)
> +{
> + unsigned long addr = (unsigned long)uaddr;
> + unsigned long start, end, nr_pages;
> + struct page **pages;
> + int pinned;
> +
> + if (check_add_overflow(addr, len, &end))
> + return ERR_PTR(-EOVERFLOW);
> + if (check_add_overflow(end, PAGE_SIZE - 1, &end))
> + return ERR_PTR(-EOVERFLOW);
> +
> + end = end >> PAGE_SHIFT;
> + start = addr >> PAGE_SHIFT;
> + nr_pages = end - start;
> + if (WARN_ON_ONCE(!nr_pages))
> + return ERR_PTR(-EINVAL);
> + if (WARN_ON_ONCE(nr_pages > INT_MAX))
> + return ERR_PTR(-EOVERFLOW);
> +
> + pages = kvmalloc_objs(struct page *, nr_pages, GFP_KERNEL_ACCOUNT);
> + if (!pages)
> + return ERR_PTR(-ENOMEM);
> +
> + pinned = pin_user_pages_fast(addr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
> + pages);
> + /* success, mapped all pages */
> + if (pinned == nr_pages) {
> + *npages = nr_pages;
> + return pages;
> + }
> +
> + /* remove any partial pins */
> + if (pinned > 0)
> + unpin_user_pages(pages, pinned);
> +
> + kvfree(pages);
> +
> + return ERR_PTR(pinned < 0 ? pinned : -EFAULT);
> +}
> +
> +static int account_pinned_pages(struct fuse_bufring_pinned *mem,
> + struct page **pages, unsigned int nr_pages)
> +{
> + unsigned long page_limit, cur_pages, new_pages;
> + struct user_struct *user = current_user();
> +
> + if (!nr_pages)
> + return 0;
> +
> + if (!capable(CAP_IPC_LOCK)) {
> + /* Don't allow more pages than we can safely lock */
> + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> + cur_pages = atomic_long_read(&user->locked_vm);
> + do {
> + new_pages = cur_pages + nr_pages;
> + if (new_pages > page_limit)
> + return -ENOMEM;
> + } while (!atomic_long_try_cmpxchg(&user->locked_vm,
> + &cur_pages, new_pages));
> +
> + mem->user = get_uid(current_user());
> + }
> +
> > + atomic64_add(nr_pages, &current->mm->pinned_vm);
> + mmgrab(current->mm);
> + mem->mm_account = current->mm;
> +
> + return 0;
> +}
> +
> +static int fuse_bufring_pin_mem(struct fuse_bufring_pinned *mem,
> + void __user *addr, size_t len)
> +{
> + struct page **pages = NULL;
> + int nr_pages;
> + int err;
> +
> + if (!PAGE_ALIGNED(addr))
> + return -EINVAL;
> +
> + pages = fuse_uring_pin_user_pages(addr, len, &nr_pages);
> + if (IS_ERR(pages))
> + return PTR_ERR(pages);
> +
> + err = account_pinned_pages(mem, pages, nr_pages);
> + if (err)
> + goto unpin;
> +
> + mem->addr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
> + if (!mem->addr) {
> + err = -ENOMEM;
> + goto unaccount;
> + }
> +
> + mem->pages = pages;
> + mem->nr_pages = nr_pages;
> +
> + return 0;
> +
> +unaccount:
> + if (mem->user) {
> + atomic_long_sub(nr_pages, &mem->user->locked_vm);
> + free_uid(mem->user);
> + }
> > + atomic64_sub(nr_pages, &current->mm->pinned_vm);
> + mmdrop(mem->mm_account);
> +unpin:
> + unpin_user_pages(pages, nr_pages);
> + kvfree(pages);
> + return err;
> +}
> +
> static int fuse_uring_bufring_setup(struct io_uring_cmd *cmd,
> - struct fuse_ring_queue *queue)
> + struct fuse_ring_queue *queue,
> + u64 init_flags)
> {
> const struct fuse_uring_cmd_req *cmd_req =
> io_uring_sqe128_cmd(cmd->sqe, struct fuse_uring_cmd_req);
> u16 queue_depth = READ_ONCE(cmd_req->init.queue_depth);
> unsigned int buf_size = READ_ONCE(cmd_req->init.buf_size);
> struct iovec iov[FUSE_URING_IOV_SEGS];
> + bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
> void __user *payload, *headers;
> size_t headers_size, payload_size, ring_size;
> struct fuse_bufring *br;
> @@ -354,7 +511,17 @@ static int fuse_uring_bufring_setup(struct io_uring_cmd *cmd,
> return -ENOMEM;
>
> br->queue_depth = queue_depth;
> - br->headers = headers;
> + if (pinned_headers) {
> + err = fuse_bufring_pin_mem(&br->pinned_headers, headers,
> + headers_size);
> + if (err) {
> + kfree(br);
> + return err;
> + }
> + br->use_pinned_headers = true;
> + } else {
> + br->headers = headers;
> + }
>
> payload_addr = (uintptr_t)payload;
>
> @@ -385,8 +552,15 @@ static bool queue_init_flags_consistent(struct fuse_ring_queue *queue,
> u64 init_flags)
> {
> bool bufring = init_flags & FUSE_URING_BUFRING;
> + bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
> +
> + if (bufring_enabled(queue) != bufring)
> + return false;
> +
> + if (!bufring)
> + return true;
>
> - return bufring_enabled(queue) == bufring;
> + return bufring_pinned_headers(queue) == pinned_headers;
> }
>
> static struct fuse_ring_queue *
> @@ -423,7 +597,7 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
> fuse_pqueue_init(&queue->fpq);
>
> if (use_bufring) {
> - int err = fuse_uring_bufring_setup(cmd, queue);
> + int err = fuse_uring_bufring_setup(cmd, queue, init_flags);
>
> if (err) {
> kfree(pq);
> @@ -437,8 +611,10 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
> if (ring->queues[qid]) {
> spin_unlock(&fc->lock);
> kfree(queue->fpq.processing);
> - if (use_bufring)
> + if (use_bufring) {
> + fuse_uring_bufring_unpin(queue);
> kfree(queue->bufring);
> + }
> kfree(queue);
>
> queue = ring->queues[qid];
> @@ -605,6 +781,25 @@ static void fuse_uring_async_stop_queues(struct work_struct *work)
> }
> }
>
> +static void fuse_uring_unpin_queues(struct fuse_ring *ring)
> +{
> + int qid;
> +
> + for (qid = 0; qid < ring->nr_queues; qid++) {
> + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
> + struct fuse_bufring *br;
> +
> + if (!queue)
> + continue;
> +
> + br = queue->bufring;
> + if (!br)
> + continue;
> +
> + fuse_uring_bufring_unpin(queue);
> + }
> +}
> +
> /*
> * Stop the ring queues
> */
> @@ -643,6 +838,9 @@ void fuse_uring_abort(struct fuse_conn *fc)
> fuse_uring_abort_end_requests(ring);
> fuse_uring_stop_queues(ring);
> }
> +
> + /* unpin while in process context - can't do this in softirq */
> + fuse_uring_unpin_queues(ring);
> }
>
> /*
> @@ -758,6 +956,11 @@ static int copy_header_to_ring(struct fuse_ring_ent *ent,
> int buf_offset = offset +
> sizeof(struct fuse_uring_req_header) * ent->id;
>
> + if (bufring_pinned_headers(ent->queue)) {
> + memcpy(ent->queue->bufring->pinned_headers.addr + buf_offset,
> + header, header_size);
> + return 0;
> + }
> ring = ent->queue->bufring->headers + buf_offset;
> } else {
> ring = (void __user *)ent->headers + offset;
> @@ -785,6 +988,11 @@ static int copy_header_from_ring(struct fuse_ring_ent *ent,
> int buf_offset = offset +
> sizeof(struct fuse_uring_req_header) * ent->id;
>
> + if (bufring_pinned_headers(ent->queue)) {
> + memcpy(header, ent->queue->bufring->pinned_headers.addr + buf_offset,
> + header_size);
> + return 0;
> + }
> ring = ent->queue->bufring->headers + buf_offset;
> } else {
> ring = (void __user *)ent->headers + offset;
> @@ -1399,7 +1607,13 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
>
> static bool init_flags_valid(u64 init_flags)
> {
> - u64 valid_flags = FUSE_URING_BUFRING;
> + u64 valid_flags =
> + FUSE_URING_BUFRING | FUSE_URING_PINNED_HEADERS;
> + bool bufring = init_flags & FUSE_URING_BUFRING;
> + bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
> +
> + if (pinned_headers && !bufring)
> + return false;
>
> return !(init_flags & ~valid_flags);
> }
> diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
> index 66d5d5f8dc3f..05c0f061a882 100644
> --- a/fs/fuse/dev_uring_i.h
> +++ b/fs/fuse/dev_uring_i.h
> @@ -42,12 +42,29 @@ struct fuse_bufring_buf {
> unsigned int id;
> };
>
> -struct fuse_bufring {
> - /* pointer to the headers buffer */
> - void __user *headers;
> +struct fuse_bufring_pinned {
> + void *addr;
> + struct page **pages;
> + unsigned int nr_pages;
> +
> + /*
> + * need to track this so we can unpin / unaccount pages during teardown
> + * when not running in the server's task context
> + */
> + struct user_struct *user;
> + struct mm_struct *mm_account;
> +};
>
> +struct fuse_bufring {
> + bool use_pinned_headers: 1;
> unsigned int queue_depth;
>
> + union {
> + /* pointer to the headers buffer */
> + void __user *headers;
> + struct fuse_bufring_pinned pinned_headers;
> + };
> +
> /* metadata tracking state of the bufring */
> unsigned int nbufs;
> unsigned int head;
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 8753de7eb189..e57244c03d42 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -244,6 +244,7 @@
> * 7.46
> * - add FUSE_URING_BUFRING flag
> * - add fuse_uring_cmd_req init struct
> + * - add FUSE_URING_PINNED_HEADERS flag
> */
>
> #ifndef _LINUX_FUSE_H
> @@ -1306,6 +1307,7 @@ enum fuse_uring_cmd {
>
> /* fuse_uring_cmd_req flags */
> #define FUSE_URING_BUFRING (1 << 0)
> +#define FUSE_URING_PINNED_HEADERS (1 << 1)
>
> /**
> * In the 80B command area of the SQE.
Reviewed-by: Jeff Layton <jlayton@kernel.org>