All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
@ 2026-06-08  6:29 sw.prabhu6
  2026-06-08 15:57 ` Jens Axboe
  2026-06-09 18:36 ` David Hildenbrand (Arm)
  0 siblings, 2 replies; 10+ messages in thread
From: sw.prabhu6 @ 2026-06-08  6:29 UTC (permalink / raw)
  To: axboe, io-uring
  Cc: linux-kernel, dave, dongjoo.seo1, Swarna Prabhu, Swarna Prabhu

From: Swarna Prabhu <sw.prabhu6@gmail.com>

The io_uring sqe buffer registration path pins user pages at 4k
granularity. If the first pinned page is in a hugetlb folio and
pages[nr_pages - 1] is in the same folio, store a single page entry and
report *npages = 1, dropping nr_pages - 1 of the pin references taken
earlier.

io_uring can already identify and coalesce multi-hugepage-backed fixed
buffers via io_check_coalesce_buffer(). However, that function has to
iterate over the entire page array; this patch bypasses those
additional checks for the single-folio case. The fast path reduces the
overall registration time for sqe buffers that are backed by huge pages.

Measured with fio on bare metal backed by 1024 boot-allocated 2MB hugetlb
pages (hugepages=1024,hugepage_size=2M), with the CPU frequency governor
of all cores set for maximum performance:
  fio --ioengine=io_uring --rw=randwrite --bs=1M --size=2G --iodepth=256
  --direct=1 --numjobs=5 --fixedbufs=1 --registerfiles=1 --iomem=mmaphuge
  --hugepage-size=2M.

Avg across 3 runs:
Metric                            Upstream(7.1-rc1)  Patched    Delta
Reg time(io_sqe_buffer_register): 3797ns             2970ns     -21.8%
Total reg for workload:           14.35ms            11.34ms    -21.9%
fio write bandwidth:              1416MiB/s          1416MiB/s  No regression

Signed-off-by: Swarna Prabhu <s.prabhu@samsung.com>
---
 io_uring/memmap.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--
 io_uring/memmap.h |  3 +++
 io_uring/rsrc.c   |  9 +++++--
 3 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 4f9b439319c4..957e67d2d8e8 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -37,11 +37,11 @@ static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
 	return true;
 }
 
-struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+struct page **io_pin_pages_alloc(unsigned long uaddr, unsigned long len,
+					unsigned long *nr_pages_out)
 {
 	unsigned long start, end, nr_pages;
 	struct page **pages;
-	int ret;
 
 	if (check_add_overflow(uaddr, len, &end))
 		return ERR_PTR(-EOVERFLOW);
@@ -60,6 +60,20 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 	if (!pages)
 		return ERR_PTR(-ENOMEM);
 
+	*nr_pages_out = nr_pages;
+	return pages;
+}
+
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+{
+	unsigned long nr_pages;
+	struct page **pages;
+	int ret;
+
+	pages = io_pin_pages_alloc(uaddr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return pages;
+
 	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
 					pages);
 	/* success, mapped all pages */
@@ -79,6 +93,54 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 	return ERR_PTR(ret);
 }
 
+/*
+ * Like io_pin_pages(), but a range lying inside one hugetlb folio is returned
+ * as a single entry holding a single pin. The end pages sharing a folio is not
+ * enough (the folio may be mapped twice in a row), so compare their indices.
+ */
+struct page **io_pin_pages_fast_path(unsigned long uaddr, unsigned long len, int *npages)
+{
+	unsigned long nr_pages;
+	struct page **pages, **huge_pages;
+	struct folio *folio;
+	int ret;
+
+	pages = io_pin_pages_alloc(uaddr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return pages;
+
+	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
+	if (ret != nr_pages) {
+		/* partial map, or didn't map anything */
+		if (ret >= 0) {
+			/* if we did partial map, release any pages we did get */
+			if (ret)
+				unpin_user_pages(pages, ret);
+			ret = -EFAULT;
+		}
+		kvfree(pages);
+		return ERR_PTR(ret);
+	}
+
+	*npages = nr_pages;
+	folio = page_folio(pages[0]);
+	if (nr_pages < 2 || !folio_test_hugetlb(folio) ||
+	    page_folio(pages[nr_pages - 1]) != folio ||
+	    folio_page_idx(folio, pages[nr_pages - 1]) -
+	    folio_page_idx(folio, pages[0]) != nr_pages - 1)
+		return pages;
+
+	huge_pages = kvmalloc_objs(struct page *, 1, GFP_KERNEL_ACCOUNT);
+	if (!huge_pages)
+		return pages;
+	/* drop all but one pin, the entire folio stays pinned */
+	unpin_user_folio(folio, nr_pages - 1);
+	huge_pages[0] = pages[0];
+	kvfree(pages);
+	*npages = 1;
+	return huge_pages;
+}
+
 enum {
 	/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
 	IO_REGION_F_VMAP			= 1,
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index f4cfbb6b9a1f..cc41af3fae61 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -7,7 +7,10 @@
 
 #define IORING_OFF_ZCRX_SHIFT		16
 
+struct page **io_pin_pages_alloc(unsigned long uaddr, unsigned long len,
+					unsigned long *nr_pages_out);
 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages);
+struct page **io_pin_pages_fast_path(unsigned long uaddr, unsigned long len, int *npages);
 
 #ifndef CONFIG_MMU
 unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 650303626be6..e117b10bef0b 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -771,7 +771,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	struct io_rsrc_node *node;
 	unsigned long off;
 	size_t size;
-	int ret, nr_pages, i;
+	int ret, nr_pages, i, orig_nr_pages;
 	struct io_imu_folio_data data;
 	bool coalesced = false;
 
@@ -792,7 +792,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
+	orig_nr_pages = ((unsigned long)iov->iov_base + iov->iov_len
+			 + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	orig_nr_pages -= (unsigned long)iov->iov_base >> PAGE_SHIFT;
+	pages = io_pin_pages_fast_path((unsigned long) iov->iov_base, iov->iov_len,
 				&nr_pages);
 	if (IS_ERR(pages)) {
 		ret = PTR_ERR(pages);
@@ -826,6 +829,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
 	if (coalesced)
 		imu->folio_shift = data.folio_shift;
+	else if (nr_pages == 1 && orig_nr_pages > 1)
+		imu->folio_shift = folio_shift(page_folio(pages[0]));
 	refcount_set(&imu->refs, 1);
 
 	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-06-10 18:45 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-08  6:29 [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration sw.prabhu6
2026-06-08 15:57 ` Jens Axboe
2026-06-09  2:18   ` Swarna Prabhu
2026-06-09 18:36 ` David Hildenbrand (Arm)
2026-06-10  6:16   ` Christoph Hellwig
2026-06-10  9:54     ` David Hildenbrand (Arm)
2026-06-10 11:34       ` Christoph Hellwig
2026-06-10 13:18         ` David Hildenbrand (Arm)
2026-06-10 18:10           ` Matthew Wilcox
2026-06-10 18:45             ` David Hildenbrand (Arm)

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.