Linux filesystem development
 help / color / mirror / Atom feed
From: Joanne Koong <joannelkoong@gmail.com>
To: miklos@szeredi.hu
Cc: bernd@bsbernd.com, axboe@kernel.dk, linux-fsdevel@vger.kernel.org
Subject: [PATCH v2 11/14] fuse: add pinned headers capability for io-uring buffer rings
Date: Thu,  2 Apr 2026 09:28:37 -0700	[thread overview]
Message-ID: <20260402162840.2989717-12-joannelkoong@gmail.com> (raw)
In-Reply-To: <20260402162840.2989717-1-joannelkoong@gmail.com>

Allow fuse servers to pin their header buffers by setting the
FUSE_URING_PINNED_HEADERS flag alongside FUSE_URING_BUFRING on REGISTER
sqes. When set, the kernel pins the header pages, vmaps them to obtain
a kernel virtual address, and uses direct memcpy for copying. This avoids
the per-request overhead of having to pin/unpin user pages and translate
virtual addresses.

Buffers must be page-aligned. The kernel accounts pinned pages against
RLIMIT_MEMLOCK (bypassed with CAP_IPC_LOCK) and tracks mm->pinned_vm.
Unpinning is done in process context during connection abort, since
vunmap cannot run in softirq context (where final destruction occurs via
RCU).

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
 fs/fuse/dev_uring.c       | 228 ++++++++++++++++++++++++++++++++++++--
 fs/fuse/dev_uring_i.h     |  23 +++-
 include/uapi/linux/fuse.h |   2 +
 3 files changed, 243 insertions(+), 10 deletions(-)

diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index 9f14a2bcde3f..79736b02cf9f 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -11,6 +11,7 @@
 
 #include <linux/fs.h>
 #include <linux/io_uring/cmd.h>
+#include <linux/vmalloc.h>
 
 static bool __read_mostly enable_uring;
 module_param(enable_uring, bool, 0644);
@@ -46,6 +47,11 @@ static inline bool bufring_enabled(struct fuse_ring_queue *queue)
 	return queue->bufring != NULL;
 }
 
+static inline bool bufring_pinned_headers(struct fuse_ring_queue *queue)
+{
+	return queue->bufring->use_pinned_headers;
+}
+
 static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd,
 				   struct fuse_ring_ent *ring_ent)
 {
@@ -200,6 +206,37 @@ bool fuse_uring_request_expired(struct fuse_conn *fc)
 	return false;
 }
 
+static void fuse_bufring_unpin_mem(struct fuse_bufring_pinned *mem)
+{
+	struct page **pages = mem->pages;
+	unsigned int nr_pages = mem->nr_pages;
+	struct user_struct *user = mem->user;
+	struct mm_struct *mm_account = mem->mm_account;
+
+	vunmap(mem->addr);
+	unpin_user_pages(pages, nr_pages);
+
+	if (user) {
+		atomic_long_sub(nr_pages, &user->locked_vm);
+		free_uid(user);
+	}
+
+	atomic64_sub(nr_pages, &mm_account->pinned_vm);
+	mmdrop(mm_account);
+
+	kvfree(mem->pages);
+}
+
+static void fuse_uring_bufring_unpin(struct fuse_ring_queue *queue)
+{
+	struct fuse_bufring *br = queue->bufring;
+
+	if (bufring_pinned_headers(queue)) {
+		fuse_bufring_unpin_mem(&br->pinned_headers);
+		br->use_pinned_headers = false;
+	}
+}
+
 void fuse_uring_destruct(struct fuse_conn *fc)
 {
 	struct fuse_ring *ring = fc->ring;
@@ -227,7 +264,10 @@ void fuse_uring_destruct(struct fuse_conn *fc)
 		}
 
 		kfree(queue->fpq.processing);
-		kfree(queue->bufring);
+		if (bufring_enabled(queue)) {
+			fuse_uring_bufring_unpin(queue);
+			kfree(queue->bufring);
+		}
 		kfree(queue);
 		ring->queues[qid] = NULL;
 	}
@@ -309,14 +349,131 @@ static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static struct page **fuse_uring_pin_user_pages(void __user *uaddr,
+					       unsigned long len, int *npages)
+{
+	unsigned long addr = (unsigned long)uaddr;
+	unsigned long start, end, nr_pages;
+	struct page **pages;
+	int pinned;
+
+	if (check_add_overflow(addr, len, &end))
+		return ERR_PTR(-EOVERFLOW);
+	if (check_add_overflow(end, PAGE_SIZE - 1, &end))
+		return ERR_PTR(-EOVERFLOW);
+
+	end = end >> PAGE_SHIFT;
+	start = addr >> PAGE_SHIFT;
+	nr_pages = end - start;
+	if (WARN_ON_ONCE(!nr_pages))
+		return ERR_PTR(-EINVAL);
+	if (WARN_ON_ONCE(nr_pages > INT_MAX))
+		return ERR_PTR(-EOVERFLOW);
+
+	pages = kvmalloc_objs(struct page *, nr_pages, GFP_KERNEL_ACCOUNT);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
+
+	pinned = pin_user_pages_fast(addr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+				     pages);
+	/* success, mapped all pages */
+	if (pinned == nr_pages) {
+		*npages = nr_pages;
+		return pages;
+	}
+
+	/* remove any partial pins */
+	if (pinned > 0)
+		unpin_user_pages(pages, pinned);
+
+	kvfree(pages);
+
+	return ERR_PTR(pinned < 0 ? pinned : -EFAULT);
+}
+
+static int account_pinned_pages(struct fuse_bufring_pinned *mem,
+				struct page **pages, unsigned int nr_pages)
+{
+	unsigned long page_limit, cur_pages, new_pages;
+	struct user_struct *user = current_user();
+
+	if (!nr_pages)
+		return 0;
+
+	if (!capable(CAP_IPC_LOCK)) {
+		/* Don't allow more pages than we can safely lock */
+		page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+		cur_pages = atomic_long_read(&user->locked_vm);
+		do {
+			new_pages = cur_pages + nr_pages;
+			if (new_pages > page_limit)
+				return -ENOMEM;
+		} while (!atomic_long_try_cmpxchg(&user->locked_vm,
+						  &cur_pages, new_pages));
+
+		mem->user = get_uid(current_user());
+	}
+
+	atomic64_add(nr_pages, &current->mm->pinned_vm);
+	mmgrab(current->mm);
+	mem->mm_account = current->mm;
+
+	return 0;
+}
+
+static int fuse_bufring_pin_mem(struct fuse_bufring_pinned *mem,
+				void __user *addr, size_t len)
+{
+	struct page **pages = NULL;
+	int nr_pages;
+	int err;
+
+	if (!PAGE_ALIGNED(addr))
+		return -EINVAL;
+
+	pages = fuse_uring_pin_user_pages(addr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	err = account_pinned_pages(mem, pages, nr_pages);
+	if (err)
+		goto unpin;
+
+	mem->addr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!mem->addr) {
+		err = -ENOMEM;
+		goto unaccount;
+	}
+
+	mem->pages = pages;
+	mem->nr_pages = nr_pages;
+
+	return 0;
+
+unaccount:
+	if (mem->user) {
+		atomic_long_sub(nr_pages, &mem->user->locked_vm);
+		free_uid(mem->user);
+	}
+	atomic64_sub(nr_pages, &current->mm->pinned_vm);
+	mmdrop(mem->mm_account);
+unpin:
+	unpin_user_pages(pages, nr_pages);
+	kvfree(pages);
+	return err;
+}
+
 static int fuse_uring_bufring_setup(struct io_uring_cmd *cmd,
-				     struct fuse_ring_queue *queue)
+				    struct fuse_ring_queue *queue,
+				    u64 init_flags)
 {
 	const struct fuse_uring_cmd_req *cmd_req =
 		io_uring_sqe128_cmd(cmd->sqe, struct fuse_uring_cmd_req);
 	u16 queue_depth = READ_ONCE(cmd_req->init.queue_depth);
 	unsigned int buf_size = READ_ONCE(cmd_req->init.buf_size);
 	struct iovec iov[FUSE_URING_IOV_SEGS];
+	bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
 	void __user *payload, *headers;
 	size_t headers_size, payload_size, ring_size;
 	struct fuse_bufring *br;
@@ -354,7 +511,17 @@ static int fuse_uring_bufring_setup(struct io_uring_cmd *cmd,
 		return -ENOMEM;
 
 	br->queue_depth = queue_depth;
-	br->headers = headers;
+	if (pinned_headers) {
+		err = fuse_bufring_pin_mem(&br->pinned_headers, headers,
+					   headers_size);
+		if (err) {
+			kfree(br);
+			return err;
+		}
+		br->use_pinned_headers = true;
+	} else {
+		br->headers = headers;
+	}
 
 	payload_addr = (uintptr_t)payload;
 
@@ -385,8 +552,15 @@ static bool queue_init_flags_consistent(struct fuse_ring_queue *queue,
 					u64 init_flags)
 {
 	bool bufring = init_flags & FUSE_URING_BUFRING;
+	bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
+
+	if (bufring_enabled(queue) != bufring)
+		return false;
+
+	if (!bufring)
+		return true;
 
-	return bufring_enabled(queue) == bufring;
+	return bufring_pinned_headers(queue) == pinned_headers;
 }
 
 static struct fuse_ring_queue *
@@ -423,7 +597,7 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
 	fuse_pqueue_init(&queue->fpq);
 
 	if (use_bufring) {
-		int err = fuse_uring_bufring_setup(cmd, queue);
+		int err = fuse_uring_bufring_setup(cmd, queue, init_flags);
 
 		if (err) {
 			kfree(pq);
@@ -437,8 +611,10 @@ fuse_uring_create_queue(struct io_uring_cmd *cmd, struct fuse_ring *ring,
 	if (ring->queues[qid]) {
 		spin_unlock(&fc->lock);
 		kfree(queue->fpq.processing);
-		if (use_bufring)
+		if (use_bufring) {
+			fuse_uring_bufring_unpin(queue);
 			kfree(queue->bufring);
+		}
 		kfree(queue);
 
 		queue = ring->queues[qid];
@@ -605,6 +781,25 @@ static void fuse_uring_async_stop_queues(struct work_struct *work)
 	}
 }
 
+static void fuse_uring_unpin_queues(struct fuse_ring *ring)
+{
+	int qid;
+
+	for (qid = 0; qid < ring->nr_queues; qid++) {
+		struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]);
+		struct fuse_bufring *br;
+
+		if (!queue)
+			continue;
+
+		br = queue->bufring;
+		if (!br)
+			continue;
+
+		fuse_uring_bufring_unpin(queue);
+	}
+}
+
 /*
  * Stop the ring queues
  */
@@ -643,6 +838,9 @@ void fuse_uring_abort(struct fuse_conn *fc)
 		fuse_uring_abort_end_requests(ring);
 		fuse_uring_stop_queues(ring);
 	}
+
+	/* unpin while in process context - can't do this in softirq */
+	fuse_uring_unpin_queues(ring);
 }
 
 /*
@@ -758,6 +956,11 @@ static int copy_header_to_ring(struct fuse_ring_ent *ent,
 		int buf_offset = offset +
 			sizeof(struct fuse_uring_req_header) * ent->id;
 
+		if (bufring_pinned_headers(ent->queue)) {
+			memcpy(ent->queue->bufring->pinned_headers.addr + buf_offset,
+			       header, header_size);
+			return 0;
+		}
 		ring = ent->queue->bufring->headers + buf_offset;
 	} else {
 		ring = (void __user *)ent->headers + offset;
@@ -785,6 +988,11 @@ static int copy_header_from_ring(struct fuse_ring_ent *ent,
 		int buf_offset = offset +
 			sizeof(struct fuse_uring_req_header) * ent->id;
 
+		if (bufring_pinned_headers(ent->queue)) {
+			memcpy(header, ent->queue->bufring->pinned_headers.addr + buf_offset,
+			       header_size);
+			return 0;
+		}
 		ring = ent->queue->bufring->headers + buf_offset;
 	} else {
 		ring = (void __user *)ent->headers + offset;
@@ -1399,7 +1607,13 @@ fuse_uring_create_ring_ent(struct io_uring_cmd *cmd,
 
 static bool init_flags_valid(u64 init_flags)
 {
-	u64 valid_flags = FUSE_URING_BUFRING;
+	u64 valid_flags =
+		FUSE_URING_BUFRING | FUSE_URING_PINNED_HEADERS;
+	bool bufring = init_flags & FUSE_URING_BUFRING;
+	bool pinned_headers = init_flags & FUSE_URING_PINNED_HEADERS;
+
+	if (pinned_headers && !bufring)
+		return false;
 
 	return !(init_flags & ~valid_flags);
 }
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 66d5d5f8dc3f..05c0f061a882 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -42,12 +42,29 @@ struct fuse_bufring_buf {
 	unsigned int id;
 };
 
-struct fuse_bufring {
-	/* pointer to the headers buffer */
-	void __user *headers;
+struct fuse_bufring_pinned {
+	void *addr;
+	struct page **pages;
+	unsigned int nr_pages;
+
+	/*
+	 * need to track this so we can unpin / unaccount pages during teardown
+	 * when not running in the server's task context
+	 */
+	struct user_struct *user;
+	struct mm_struct *mm_account;
+};
 
+struct fuse_bufring {
+	bool use_pinned_headers: 1;
 	unsigned int queue_depth;
 
+	union {
+		/* pointer to the headers buffer */
+		void __user *headers;
+		struct fuse_bufring_pinned pinned_headers;
+	};
+
 	/* metadata tracking state of the bufring */
 	unsigned int nbufs;
 	unsigned int head;
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 8753de7eb189..e57244c03d42 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -244,6 +244,7 @@
  *  7.46
  *  - add FUSE_URING_BUFRING flag
  *  - add fuse_uring_cmd_req init struct
+ *  - add FUSE_URING_PINNED_HEADERS flag
  */
 
 #ifndef _LINUX_FUSE_H
@@ -1306,6 +1307,7 @@ enum fuse_uring_cmd {
 
 /* fuse_uring_cmd_req flags */
 #define FUSE_URING_BUFRING		(1 << 0)
+#define FUSE_URING_PINNED_HEADERS	(1 << 1)
 
 /**
  * In the 80B command area of the SQE.
-- 
2.52.0


  parent reply	other threads:[~2026-04-02 16:30 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-02 16:28 [PATCH v2 00/14] fuse: add io-uring buffer rings and zero-copy Joanne Koong
2026-04-02 16:28 ` [PATCH v2 01/14] fuse: separate next request fetching from sending logic Joanne Koong
2026-04-29 11:52   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 02/14] fuse: refactor io-uring header copying to ring Joanne Koong
2026-04-29 12:05   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 03/14] fuse: refactor io-uring header copying from ring Joanne Koong
2026-04-29 12:06   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 04/14] fuse: use enum types for header copying Joanne Koong
2026-04-30  8:04   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 05/14] fuse: refactor setting up copy state for payload copying Joanne Koong
2026-04-30  8:06   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 06/14] fuse: support buffer copying for kernel addresses Joanne Koong
2026-04-30  8:19   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 07/14] fuse: use named constants for io-uring iovec indices Joanne Koong
2026-04-15  9:36   ` Bernd Schubert
2026-04-30  8:20   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 08/14] fuse: move fuse_uring_abort() from header to dev_uring.c Joanne Koong
2026-04-15  9:40   ` Bernd Schubert
2026-04-30  8:21   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 09/14] fuse: rearrange io-uring iovec and ent allocation logic Joanne Koong
2026-04-15  9:45   ` Bernd Schubert
2026-04-30  8:24   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 10/14] fuse: add io-uring buffer rings Joanne Koong
2026-04-15  9:48   ` Bernd Schubert
2026-04-15 21:40     ` Joanne Koong
2026-04-30 11:08   ` Jeff Layton
2026-04-30 12:44     ` Joanne Koong
2026-05-05 22:47   ` Bernd Schubert
2026-04-02 16:28 ` Joanne Koong [this message]
2026-04-14 12:47   ` [PATCH v2 11/14] fuse: add pinned headers capability for " Bernd Schubert
2026-04-15  0:48     ` Joanne Koong
2026-05-05 22:51       ` Bernd Schubert
2026-04-30 11:22   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 12/14] fuse: add pinned payload buffers " Joanne Koong
2026-04-30 11:29   ` Jeff Layton
2026-04-02 16:28 ` [PATCH v2 13/14] fuse: add zero-copy over io-uring Joanne Koong
2026-04-30 11:42   ` Jeff Layton
2026-04-30 12:35     ` Joanne Koong
2026-04-30 12:55       ` Jeff Layton
2026-05-05 22:55         ` Bernd Schubert
2026-04-30 12:56   ` Jeff Layton
2026-05-05 23:45   ` Bernd Schubert
2026-04-02 16:28 ` [PATCH v2 14/14] docs: fuse: add io-uring bufring and zero-copy documentation Joanne Koong
2026-04-14 21:05   ` Bernd Schubert
2026-04-15  1:10     ` Joanne Koong
2026-04-15 10:55       ` Bernd Schubert
2026-04-15 22:40         ` Joanne Koong
2026-04-30 12:57   ` Jeff Layton
2026-04-30 12:59 ` [PATCH v2 00/14] fuse: add io-uring buffer rings and zero-copy Jeff Layton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260402162840.2989717-12-joannelkoong@gmail.com \
    --to=joannelkoong@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=bernd@bsbernd.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=miklos@szeredi.hu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox