Linux filesystem development
 help / color / mirror / Atom feed
From: Jeff Layton <jlayton@kernel.org>
To: Joanne Koong <joannelkoong@gmail.com>, miklos@szeredi.hu
Cc: bernd@bsbernd.com, axboe@kernel.dk, linux-fsdevel@vger.kernel.org
Subject: Re: [PATCH v2 11/14] fuse: add pinned headers capability for io-uring buffer rings
Date: Thu, 30 Apr 2026 12:22:32 +0100	[thread overview]
Message-ID: <2bd55e996401939a75a6d03d6608198dc1d4fc53.camel@kernel.org> (raw)
In-Reply-To: <20260402162840.2989717-12-joannelkoong@gmail.com>

On Thu, 2026-04-02 at 09:28 -0700, Joanne Koong wrote:
> Allow fuse servers to pin their header buffers by setting the
> FUSE_URING_PINNED_HEADERS flag alongside FUSE_URING_BUFRING on REGISTER
> sqes. When set, the kernel pins the header pages, vmaps them for a
> kernel virtual address, and uses direct memcpy for copying. This avoids
> the per-request overhead of having to pin/unpin user pages and translate
> virtual addresses.
> 
> Buffers must be page-aligned. The kernel accounts pinned pages against
> RLIMIT_MEMLOCK (bypassed with CAP_IPC_LOCK) and tracks mm->pinned_vm.
> Unpinning is done in process context during connection abort, since vunmap
> cannot run in softirq (where final destruction occurs via RCU).
> 
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>  fs/fuse/dev_uring.c       | 228 ++++++++++++++++++++++++++++++++++++--
>  fs/fuse/dev_uring_i.h     |  23 +++-
>  include/uapi/linux/fuse.h |   2 +
>  3 files changed, 243 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
> index 9f14a2bcde3f..79736b02cf9f 100644
> --- a/fs/fuse/dev_uring.c
> +++ b/fs/fuse/dev_uring.c
> @@ -11,6 +11,7 @@
>  
>  #include <linux/fs.h>
>  #include <linux/io_uring/cmd.h>
> +#include <linux/vmalloc.h>
>  
>  static bool __read_mostly enable_uring;
>  module_param(enable_uring, bool, 0644);
> @@ -46,6 +47,11 @@ static inline bool bufring_enabled(struct fuse_ring_queue *queue)
>  	return queue->bufring != NULL;
>  }
>  
> +static inline bool bufring_pinned_headers(struct fuse_ring_queue *queue)
> +{
> +	return queue->bufring->use_pinned_headers;
> +}
> +
>  static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
>  				   struct fuse_ring_ent *ring_ent)
>  {
> @@ -200,6 +206,37 @@ bool fuse_uring_request_expired(struct fuse_conn *fc)
>  	return false;
>  }
>  
> +static void fuse_bufring_unpin_mem(struct fuse_bufring_pinned *mem)
> +{
> +	struct page **pages = mem->pages;
> +	unsigned int nr_pages = mem->nr_pages;
> +	struct user_struct *user = mem->user;
> +	struct mm_struct *mm_account = mem->mm_account;
> +
> +	vunmap(mem->addr);
> +	unpin_user_pages(pages, nr_pages);
> +
> +	if (user) {
> +		atomic_long_sub(nr_pages, &user->locked_vm);
> +		free_uid(user);
> +	}
> +
> +	atomic64_sub(nr_pages, &mm_account->pinned_vm);
> +	mmdrop(mm_account);
> +
> +	kvfree(mem->pages);
> +}
> +
> +static void fuse_uring_bufring_unpin(struct fuse_ring_queue *queue)
> +{
> +	struct fuse_bufring *br = queue->bufring;
> +
> +	if (bufring_pinned_headers(queue)) {
> +		fuse_bufring_unpin_mem(&br->pinned_headers);
> +		br->use_pinned_headers = false;
> +	}
> +}
> +
>  void fuse_uring_destruct(struct fuse_conn *fc)
>  {
>  	struct fuse_ring *ring = fc->ring;
> @@ -227,7 +264,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
>  		}
>  
>  		kfree(queue->fpq.processing);
> -		kfree(queue->bufring);
> +		if (bufring_enabled(queue)) {
> +			fuse_uring_bufring_unpin(queue);
> +			kfree(queue->bufring);
> +		}
>  		kfree(queue);
>  		ring->queues[qid] = NULL;
>  	}
> @@ -309,14 +349,131 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
>  	return 0;
>  }
>  
> +static struct page **fuse_uring_pin_user_pages(void __user *uaddr,
> +					       unsigned long len, int *npages)
> +{
> +	unsigned long addr = (unsigned long)uaddr;
> +	unsigned long start, end, nr_pages;
> +	struct page **pages;
> +	int pinned;
> +
> +	if (check_add_overflow(addr, len, &end))
> +		return ERR_PTR(-EOVERFLOW);
> +	if (check_add_overflow(end, PAGE_SIZE - 1, &end))
> +		return ERR_PTR(-EOVERFLOW);
> +
> +	end = end >> PAGE_SHIFT;
> +	start = addr >> PAGE_SHIFT;
> +	nr_pages = end - start;
> +	if (WARN_ON_ONCE(!nr_pages))
> +		return ERR_PTR(-EINVAL);
> +	if (WARN_ON_ONCE(nr_pages > INT_MAX))
> +		return ERR_PTR(-EOVERFLOW);
> +
> +	pages = kvmalloc_objs(struct page *, nr_pages, GFP_KERNEL_ACCOUNT);
> +	if (!pages)
> +		return ERR_PTR(-ENOMEM);
> +
> +	pinned = pin_user_pages_fast(addr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
> +				     pages);
> +	/* success, mapped all pages */
> +	if (pinned == nr_pages) {
> +		*npages = nr_pages;
> +		return pages;
> +	}
> +
> +	/* remove any partial pins */
> +	if (pinned > 0)
> +		unpin_user_pages(pages, pinned);
> +
> +	kvfree(pages);
> +
> +	return ERR_PTR(pinned < 0 ? pinned : -EFAULT);
> +}
> +
> +static int account_pinned_pages(struct fuse_bufring_pinned *mem,
> +				struct page **pages, unsigned int nr_pages)
> +{
> +	unsigned long page_limit, cur_pages, new_pages;
> +	struct user_struct *user = current_user();
> +
> +	if (!nr_pages)
> +		return 0;
> +
> +	if (!capable(CAP_IPC_LOCK)) {
> +		/* Don't allow more pages than we can safely lock */
> +		page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +		cur_pages = atomic_long_read(&user->locked_vm);
> +		do {
> +			new_pages = cur_pages + nr_pages;
> +			if (new_pages > page_limit)
> +				return -ENOMEM;
> +		} while (!atomic_long_try_cmpxchg(&user->locked_vm,
> +						  &cur_pages, new_pages));
> +
> +		mem->user = get_uid(current_user());
> +	}
> +
> +	atomic64_add(nr_pages, &current->mm->pinned_vm);
> +	mmgrab(current->mm);
> +	mem->mm_account = current->mm;
> +
> +	return 0;
> +}
> +
> +static int fuse_bufring_pin_mem(struct fuse_bufring_pinned *mem,
> +				void __user *addr, size_t len)
> +{
> +	struct page **pages = NULL;
> +	int nr_pages;
> +	int err;
> +
> +	if (!PAGE_ALIGNED(addr))
> +		return -EINVAL;
> +
> +	pages = fuse_uring_pin_user_pages(addr, len, &nr_pages);
> +	if (IS_ERR(pages))
> +		return PTR_ERR(pages);
> +
> +	err = account_pinned_pages(mem, pages, nr_pages);
> +	if (err)
> +		goto unpin;
> +
> +	mem->addr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
> +	if (!mem->addr) {
> +		err = -ENOMEM;
> +		goto unaccount;
> +	}
> +
> +	mem->pages = pages;
> +	mem->nr_pages = nr_pages;
> +
> +	return 0;
> +
> +unaccount:
> +	if (mem->user) {
> +		atomic_long_sub(nr_pages, &mem->user->locked_vm);
> +		free_uid(mem->user);
> +	}
> +	atomic64_sub(nr_pages, &current->mm->pinned_vm);
> +	mmdrop(mem->mm_account);
> +unpin:
> +	unpin_user_pages(pages, nr_pages);
> +	kvfree(pages);
> +	return err;
> +}
> +
>  static int fuse_uring_bufring_setup(struct io_uring_cmd *cmd,
> -				     struct fuse_ring_queue *queue)
> +				    struct fuse_ring_queue *queue,
> +				    u64 init_flags)
>  {
>  	const struct fuse_uring_cmd_req *cmd_req =
>  		io_uring_sqe128_cmd(cmd->sqe, struct fuse_uring_cmd_req);
>  	u16 queue_depth = READ_ONCE(cmd_req->init.queue_depth);
>  	unsigned int buf_size = READ_ONCE(cmd_req->init.buf_size);
>  	struct iovec iov[FUSE_URING_IOV_SEGS];
> +	bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
>  	void __user *payload, *headers;
>  	size_t headers_size, payload_size, ring_size;
>  	struct fuse_bufring *br;
> @@ -354,7 +511,17 @@ static int fuse_uring_bufring_setup(struct io_uring_cmd *cmd,
>  		return -ENOMEM;
>  
>  	br->queue_depth = queue_depth;
> -	br->headers = headers;
> +	if (pinned_headers) {
> +		err = fuse_bufring_pin_mem(&br->pinned_headers, headers,
> +					   headers_size);
> +		if (err) {
> +			kfree(br);
> +			return err;
> +		}
> +		br->use_pinned_headers = true;
> +	} else {
> +		br->headers = headers;
> +	}
>  
>  	payload_addr = (uintptr_t)payload;
>  
> @@ -385,8 +552,15 @@ static bool queue_init_flags_consistent(struct fuse_ring_queue *queue,
>  					u64 init_flags)
>  {
>  	bool bufring = init_flags & FUSE_URING_BUFRING;
> +	bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
> +
> +	if (bufring_enabled(queue) != bufring)
> +		return false;
> +
> +	if (!bufring)
> +		return true;
>  
> -	return bufring_enabled(queue) == bufring;
> +	return bufring_pinned_headers(queue) == pinned_headers;
>  }
>  
>  static struct fuse_ring_queue *
> @@ -423,7 +597,7 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
>  	fuse_pqueue_init(&queue->fpq);
>  
>  	if (use_bufring) {
> -		int err = fuse_uring_bufring_setup(cmd, queue);
> +		int err = fuse_uring_bufring_setup(cmd, queue, init_flags);
>  
>  		if (err) {
>  			kfree(pq);
> @@ -437,8 +611,10 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
>  	if (ring->queues[qid]) {
>  		spin_unlock(&fc->lock);
>  		kfree(queue->fpq.processing);
> -		if (use_bufring)
> +		if (use_bufring) {
> +			fuse_uring_bufring_unpin(queue);
>  			kfree(queue->bufring);
> +		}
>  		kfree(queue);
>  
>  		queue = ring->queues[qid];
> @@ -605,6 +781,25 @@ static void fuse_uring_async_stop_queues(struct work_struct *work)
>  	}
>  }
>  
> +static void fuse_uring_unpin_queues(struct fuse_ring *ring)
> +{
> +	int qid;
> +
> +	for (qid = 0; qid < ring->nr_queues; qid++) {
> +		struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
> +		struct fuse_bufring *br;
> +
> +		if (!queue)
> +			continue;
> +
> +		br = queue->bufring;
> +		if (!br)
> +			continue;
> +
> +		fuse_uring_bufring_unpin(queue);
> +	}
> +}
> +
>  /*
>   * Stop the ring queues
>   */
> @@ -643,6 +838,9 @@ void fuse_uring_abort(struct fuse_conn *fc)
>  		fuse_uring_abort_end_requests(ring);
>  		fuse_uring_stop_queues(ring);
>  	}
> +
> +	/* unpin while in process context - can't do this in softirq */
> +	fuse_uring_unpin_queues(ring);
>  }
>  
>  /*
> @@ -758,6 +956,11 @@ static int copy_header_to_ring(struct fuse_ring_ent *ent,
>  		int buf_offset = offset +
>  			sizeof(struct fuse_uring_req_header) * ent->id;
>  
> +		if (bufring_pinned_headers(ent->queue)) {
> +			memcpy(ent->queue->bufring->pinned_headers.addr + buf_offset,
> +			       header, header_size);
> +			return 0;
> +		}
>  		ring = ent->queue->bufring->headers + buf_offset;
>  	} else {
>  		ring = (void __user *)ent->headers + offset;
> @@ -785,6 +988,11 @@ static int copy_header_from_ring(struct fuse_ring_ent *ent,
>  		int buf_offset = offset +
>  			sizeof(struct fuse_uring_req_header) * ent->id;
>  
> +		if (bufring_pinned_headers(ent->queue)) {
> +			memcpy(header, ent->queue->bufring->pinned_headers.addr + buf_offset,
> +			       header_size);
> +			return 0;
> +		}
>  		ring = ent->queue->bufring->headers + buf_offset;
>  	} else {
>  		ring = (void __user *)ent->headers + offset;
> @@ -1399,7 +1607,13 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
>  
>  static bool init_flags_valid(u64 init_flags)
>  {
> -	u64 valid_flags = FUSE_URING_BUFRING;
> +	u64 valid_flags =
> +		FUSE_URING_BUFRING | FUSE_URING_PINNED_HEADERS;
> +	bool bufring = init_flags & FUSE_URING_BUFRING;
> +	bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
> +
> +	if (pinned_headers && !bufring)
> +		return false;
>  
>  	return !(init_flags & ~valid_flags);
>  }
> diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
> index 66d5d5f8dc3f..05c0f061a882 100644
> --- a/fs/fuse/dev_uring_i.h
> +++ b/fs/fuse/dev_uring_i.h
> @@ -42,12 +42,29 @@ struct fuse_bufring_buf {
>  	unsigned int id;
>  };
>  
> -struct fuse_bufring {
> -	/* pointer to the headers buffer */
> -	void __user *headers;
> +struct fuse_bufring_pinned {
> +	void *addr;
> +	struct page **pages;
> +	unsigned int nr_pages;
> +
> +	/*
> +	 * need to track this so we can unpin / unaccount pages during teardown
> +	 * when not running in the server's task context
> +	 */
> +	struct user_struct *user;
> +	struct mm_struct *mm_account;
> +};
>  
> +struct fuse_bufring {
> +	bool use_pinned_headers: 1;
>  	unsigned int queue_depth;
>  
> +	union {
> +		/* pointer to the headers buffer */
> +		void __user *headers;
> +		struct fuse_bufring_pinned pinned_headers;
> +	};
> +
>  	/* metadata tracking state of the bufring */
>  	unsigned int nbufs;
>  	unsigned int head;
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 8753de7eb189..e57244c03d42 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -244,6 +244,7 @@
>   *  7.46
>   *  - add FUSE_URING_BUFRING flag
>   *  - add fuse_uring_cmd_req init struct
> + *  - add FUSE_URING_PINNED_HEADERS flag
>   */
>  
>  #ifndef _LINUX_FUSE_H
> @@ -1306,6 +1307,7 @@ enum fuse_uring_cmd {
>  
>  /* fuse_uring_cmd_req flags */
>  #define FUSE_URING_BUFRING		(1 << 0)
> +#define FUSE_URING_PINNED_HEADERS	(1 << 1)
>  
>  /**
>   * In the 80B command area of the SQE.

Reviewed-by: Jeff Layton <jlayton@kernel.org>

  parent reply	other threads:[~2026-04-30 11:22 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-02 16:28 [PATCH v2 00/14] fuse: add io-uring buffer rings and zero-copy Joanne Koong
2026-04-02 16:28 ` [PATCH v2 01/14] fuse: separate next request fetching from sending logic Joanne Koong
2026-04-29 11:52   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 02/14] fuse: refactor io-uring header copying to ring Joanne Koong
2026-04-29 12:05   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 03/14] fuse: refactor io-uring header copying from ring Joanne Koong
2026-04-29 12:06   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 04/14] fuse: use enum types for header copying Joanne Koong
2026-04-30  8:04   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 05/14] fuse: refactor setting up copy state for payload copying Joanne Koong
2026-04-30  8:06   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 06/14] fuse: support buffer copying for kernel addresses Joanne Koong
2026-04-30  8:19   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 07/14] fuse: use named constants for io-uring iovec indices Joanne Koong
2026-04-15  9:36   ` Bernd Schubert
2026-04-30  8:20   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 08/14] fuse: move fuse_uring_abort() from header to dev_uring.c Joanne Koong
2026-04-15  9:40   ` Bernd Schubert
2026-04-30  8:21   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 09/14] fuse: rearrange io-uring iovec and ent allocation logic Joanne Koong
2026-04-15  9:45   ` Bernd Schubert
2026-04-30  8:24   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 10/14] fuse: add io-uring buffer rings Joanne Koong
2026-04-15  9:48   ` Bernd Schubert
2026-04-15 21:40     ` Joanne Koong
2026-04-30 11:08   ` Jeff Layton
2026-04-30 12:44     ` Joanne Koong
2026-05-05 22:47   ` Bernd Schubert
2026-04-02 16:28 ` [PATCH v2 11/14] fuse: add pinned headers capability for " Joanne Koong
2026-04-14 12:47   ` Bernd Schubert
2026-04-15  0:48     ` Joanne Koong
2026-05-05 22:51       ` Bernd Schubert
2026-04-30 11:22   ` Jeff Layton [this message]
2026-04-02 16:28 ` [PATCH v2 12/14] fuse: add pinned payload buffers " Joanne Koong
2026-04-30 11:29   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 13/14] fuse: add zero-copy over io-uring Joanne Koong
2026-04-30 11:42   ` Jeff Layton
2026-04-30 12:35     ` Joanne Koong
2026-04-30 12:55       ` Jeff Layton
2026-05-05 22:55         ` Bernd Schubert
2026-04-30 12:56   ` Jeff Layton
2026-05-05 23:45   ` Bernd Schubert
2026-04-02 16:28 ` [PATCH v2 14/14] docs: fuse: add io-uring bufring and zero-copy documentation Joanne Koong
2026-04-14 21:05   ` Bernd Schubert
2026-04-15  1:10     ` Joanne Koong
2026-04-15 10:55       ` Bernd Schubert
2026-04-15 22:40         ` Joanne Koong
2026-04-30 12:57   ` Jeff Layton
2026-04-30 12:59 ` [PATCH v2 00/14] fuse: add io-uring buffer rings and zero-copy Jeff Layton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2bd55e996401939a75a6d03d6608198dc1d4fc53.camel@kernel.org \
    --to=jlayton@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=bernd@bsbernd.com \
    --cc=joannelkoong@gmail.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=miklos@szeredi.hu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox