[RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration

Linux io-uring development
 help / color / mirror / Atom feed

* [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
@ 2026-06-08  6:29 sw.prabhu6
  2026-06-08 15:57 ` Jens Axboe
  2026-06-09 18:36 ` David Hildenbrand (Arm)
  0 siblings, 2 replies; 10+ messages in thread
From: sw.prabhu6 @ 2026-06-08  6:29 UTC (permalink / raw)
  To: axboe, io-uring
  Cc: linux-kernel, dave, dongjoo.seo1, Swarna Prabhu, Swarna Prabhu

From: Swarna Prabhu <sw.prabhu6@gmail.com>

io_uring sqe buffer registration path returns pinned user pages in 4k
granularity. If the first pinned page is in a hugetlb folio and
pages[nr_pages - 1] is also in the same folio then store a single page
entry and report *npages = 1 while dropping nr_pages - 1 of the pin
references it took earlier.

io_uring has support to identify and coalesce multi-hugepage-backed
fixed buffers from the function 'io_check_coalesce_buffer()'. However
we need to iterate over the entire page array and this patch bypasses
the additional checks for this case. The fast path reduces the overall
registration time for sqe buffers that are backed by huge pages.

Measured with fio on bare metal backed by 1024 boot-allocated 2MB hugetlb
pages, with the CPU frequency governor set for maximum performance.
(hugepages=1024,hugepage_size=2M):
  fio --ioengine=io_uring --rw=randwrite --bs=1M --size=2G --iodepth=256
  --direct=1 --numjobs=5 --fixedbufs=1 --registerfiles=1 --iomem=mmaphuge
  --hugepage-size=2M.

Avg across 3 runs:
Metric                          Upstream(7.1-rc1)  Patched    Delta
Reg time(io_sqe_buffer_register): 3797ns            2970ns   -21.8%
Total reg for workload:           14.35ms           11.34ms  -21.9%
fio write bandwidth:              1416MiB/s   1416MiB/s    No regression

Signed-off-by: Swarna Prabhu <s.prabhu@samsung.com>
---
 io_uring/memmap.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--
 io_uring/memmap.h |  3 +++
 io_uring/rsrc.c   |  9 +++++--
 3 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 4f9b439319c4..957e67d2d8e8 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -37,11 +37,11 @@ static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
 	return true;
 }
 
-struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+struct page **io_pin_pages_alloc(unsigned long uaddr, unsigned long len,
+					unsigned long *nr_pages_out)
 {
 	unsigned long start, end, nr_pages;
 	struct page **pages;
-	int ret;
 
 	if (check_add_overflow(uaddr, len, &end))
 		return ERR_PTR(-EOVERFLOW);
@@ -60,6 +60,20 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 	if (!pages)
 		return ERR_PTR(-ENOMEM);
 
+	*nr_pages_out = nr_pages;
+	return pages;
+}
+
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
+{
+	unsigned long nr_pages;
+	struct page **pages;
+	int ret;
+
+	pages = io_pin_pages_alloc(uaddr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return pages;
+
 	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
 					pages);
 	/* success, mapped all pages */
@@ -79,6 +93,54 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 	return ERR_PTR(ret);
 }
 
+struct page **io_pin_pages_fast_path(unsigned long uaddr, unsigned long len, int *npages)
+{
+	unsigned long nr_pages;
+	struct page **pages;
+	int ret;
+
+	pages = io_pin_pages_alloc(uaddr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return pages;
+
+	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+					pages);
+	/* success, mapped all pages */
+	if (ret == nr_pages) {
+		struct folio *folio = page_folio(pages[0]);
+
+		if (nr_pages > 1 && folio_test_hugetlb(folio) &&
+		    page_folio(pages[nr_pages - 1]) == folio) {
+			struct page **huge_pages;
+
+			huge_pages = kvmalloc_objs(struct page *, 1, GFP_KERNEL_ACCOUNT);
+			if (!huge_pages) {
+				*npages = nr_pages;
+				return pages;
+			}
+			unpin_user_folio(folio, nr_pages - 1);
+
+			huge_pages[0] = pages[0];
+			kvfree(pages);
+			pages = huge_pages;
+			*npages = 1;
+		} else {
+			*npages = nr_pages;
+		}
+		return pages;
+	}
+
+	/* partial map, or didn't map anything */
+	if (ret >= 0) {
+		/* if we did partial map, release any pages we did get */
+		if (ret)
+			unpin_user_pages(pages, ret);
+		ret = -EFAULT;
+	}
+	kvfree(pages);
+	return ERR_PTR(ret);
+}
+
 enum {
 	/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
 	IO_REGION_F_VMAP			= 1,
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index f4cfbb6b9a1f..cc41af3fae61 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -7,7 +7,10 @@
 
 #define IORING_OFF_ZCRX_SHIFT		16
 
+struct page **io_pin_pages_alloc(unsigned long uaddr, unsigned long len,
+					unsigned long *nr_pages_out);
 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages);
+struct page **io_pin_pages_fast_path(unsigned long uaddr, unsigned long len, int *npages);
 
 #ifndef CONFIG_MMU
 unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 650303626be6..e117b10bef0b 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -771,7 +771,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	struct io_rsrc_node *node;
 	unsigned long off;
 	size_t size;
-	int ret, nr_pages, i;
+	int ret, nr_pages, i, orig_nr_pages;
 	struct io_imu_folio_data data;
 	bool coalesced = false;
 
@@ -792,7 +792,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 		return ERR_PTR(-ENOMEM);
 
 	ret = -ENOMEM;
-	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
+	orig_nr_pages = ((unsigned long)iov->iov_base + iov->iov_len
+			 + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	orig_nr_pages -= (unsigned long)iov->iov_base >> PAGE_SHIFT;
+	pages = io_pin_pages_fast_path((unsigned long) iov->iov_base, iov->iov_len,
 				&nr_pages);
 	if (IS_ERR(pages)) {
 		ret = PTR_ERR(pages);
@@ -826,6 +829,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
 	if (coalesced)
 		imu->folio_shift = data.folio_shift;
+	else if (nr_pages == 1 && orig_nr_pages > 1)
+		imu->folio_shift = folio_shift(page_folio(pages[0]));
 	refcount_set(&imu->refs, 1);
 
 	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-08  6:29 [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration sw.prabhu6
@ 2026-06-08 15:57 ` Jens Axboe
  2026-06-09  2:18   ` Swarna Prabhu
  2026-06-09 18:36 ` David Hildenbrand (Arm)
  1 sibling, 1 reply; 10+ messages in thread
From: Jens Axboe @ 2026-06-08 15:57 UTC (permalink / raw)
  To: sw.prabhu6, io-uring; +Cc: linux-kernel, dave, dongjoo.seo1, Swarna Prabhu

On 6/8/26 12:29 AM, sw.prabhu6@gmail.com wrote:
> From: Swarna Prabhu <sw.prabhu6@gmail.com>
> 
> io_uring sqe buffer registration path returns pinned user pages in 4k
> granularity. If the first pinned page is in a hugetlb folio and
> pages[nr_pages - 1] is also in the same folio then store a single page
> entry and report *npages = 1 while dropping nr_pages - 1 of the pin
> references it took earlier.
> 
> io_uring has support to identify and coalesce multi-hugepage-backed
> fixed buffers from the function 'io_check_coalesce_buffer()'. However
> we need to iterate over the entire page array and this patch bypasses
> the additional checks for this case. The fast path reduces the overall
> sqe buffer registration time that are backed by huge pages.
> 
> Measured with fio on bare metal backed by 1024 boot-allocated 2MB hugetlb
> pages and setting the cpu cores to governor for max performance.
> (hugepages=1024,hugepage_size=2M):
>   fio --ioengine=io_uring --rw=randwrite --bs=1M --size=2G --iodepth=256
>   --direct=1 --numjobs=5 --fixedbufs=1 --registerfiles=1 --iomem=mmaphuge
>   --hugepage-size=2M.
> 
> Avg across 3 runs:
> Metric                          Upstream(7.1-rc1)  Patched    Delta
> Reg time(io_sqe_buffer_register): 3797ns            2970ns   -21.8%
> Total reg for workload:           14.35ms           11.34ms  -21.9%
> fio write bandwidth:              1416MiB/s   1416MiB/s    No regression

This looks pretty reasonable. Curious what inspired this change though?
Workloads that register and unregister huge page backed buffers at
a rapid pace? The registration path should obviously not be slower than
it needs to on purpose, but it should also not be part of the application
fast path in general. I'd expect most users to register their IO memory
pool upfront and then never really touch it.

Can you expand on the background that led to this?

> Signed-off-by: Swarna Prabhu <s.prabhu@samsung.com>

This doesn't match your From: in the patch, that would need to be
corrected.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-08 15:57 ` Jens Axboe
@ 2026-06-09  2:18   ` Swarna Prabhu
  0 siblings, 0 replies; 10+ messages in thread
From: Swarna Prabhu @ 2026-06-09  2:18 UTC (permalink / raw)
  To: Jens Axboe; +Cc: io-uring, linux-kernel, dave, dongjoo.seo1, Swarna Prabhu

On Mon, Jun 08, 2026 at 09:57:03AM -0600, Jens Axboe wrote:
> On 6/8/26 12:29 AM, sw.prabhu6@gmail.com wrote:
> > From: Swarna Prabhu <sw.prabhu6@gmail.com>
> > 
> > io_uring sqe buffer registration path returns pinned user pages in 4k
> > granularity. If the first pinned page is in a hugetlb folio and
> > pages[nr_pages - 1] is also in the same folio then store a single page
> > entry and report *npages = 1 while dropping nr_pages - 1 of the pin
> > references it took earlier.
> > 
> > io_uring has support to identify and coalesce multi-hugepage-backed
> > fixed buffers from the function 'io_check_coalesce_buffer()'. However
> > we need to iterate over the entire page array and this patch bypasses
> > the additional checks for this case. The fast path reduces the overall
> > sqe buffer registration time that are backed by huge pages.
> > 
> > Measured with fio on bare metal backed by 1024 boot-allocated 2MB hugetlb
> > pages and setting the cpu cores to governor for max performance.
> > (hugepages=1024,hugepage_size=2M):
> >   fio --ioengine=io_uring --rw=randwrite --bs=1M --size=2G --iodepth=256
> >   --direct=1 --numjobs=5 --fixedbufs=1 --registerfiles=1 --iomem=mmaphuge
> >   --hugepage-size=2M.
> > 
> > Avg across 3 runs:
> > Metric                          Upstream(7.1-rc1)  Patched    Delta
> > Reg time(io_sqe_buffer_register): 3797ns            2970ns   -21.8%
> > Total reg for workload:           14.35ms           11.34ms  -21.9%
> > fio write bandwidth:              1416MiB/s   1416MiB/s    No regression
> 
> This looks pretty reasonable. Curious what inspired this change though?
> Workloads that register and unregister huge page backed buffers at
> a rapid pace? The registration path should obviously not be slower than
> it needs to on purpose, but it should also not be part of the application
> fast path in general. I'd expect most users to register their IO memory
> pool upfront and then never really touch it.
> 
> Can you expand on the background that led to this?

We started out looking at whether io_uring could get a bandwidth
improvement from hugetlb/THP-backed fixed buffers, i.e., having the kernel
take better advantage of huge-page backing for the registered IO memory.
This attempt was encouraged by an RFC on the VFIO side [1], which 
introduces optimization while pinning pages backed by huge pages to 
avoid the latencies of pinning at 4k granularity.

io_uring has already implemented the post processing of pinned pages
from the coalesce check. So the bandwidth angle didn't pan out.
However we found registration-time savings from short circuiting
the page array walks in 'io_check_coalesce_buffer' when the whole buffer
lives in a single hugetlb folio.

We don't have a workload that registers and unregisters huge page backed
buffers at a rapid pace. Hence it is a one-time registration cost saving
that seemed worth sending for feedback.

[1] https://lore.kernel.org/all/20251223230044.2617028-2-aaronlewis@google.com/

> > Signed-off-by: Swarna Prabhu <s.prabhu@samsung.com>
> 
> This doesn't match your From: in the patch, that would need to be
> corrected.

Noted. 


Thank you
Swarna

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-08  6:29 [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration sw.prabhu6
  2026-06-08 15:57 ` Jens Axboe
@ 2026-06-09 18:36 ` David Hildenbrand (Arm)
  2026-06-10  6:16   ` Christoph Hellwig
  1 sibling, 1 reply; 10+ messages in thread
From: David Hildenbrand (Arm) @ 2026-06-09 18:36 UTC (permalink / raw)
  To: sw.prabhu6, axboe, io-uring
  Cc: linux-kernel, dave, dongjoo.seo1, Swarna Prabhu

> +struct page **io_pin_pages_fast_path(unsigned long uaddr, unsigned long len, int *npages)
> +{
> +	unsigned long nr_pages;
> +	struct page **pages;
> +	int ret;
> +
> +	pages = io_pin_pages_alloc(uaddr, len, &nr_pages);
> +	if (IS_ERR(pages))
> +		return pages;
> +
> +	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
> +					pages);
> +	/* success, mapped all pages */
> +	if (ret == nr_pages) {
> +		struct folio *folio = page_folio(pages[0]);
> +
> +		if (nr_pages > 1 && folio_test_hugetlb(folio) &&
> +		    page_folio(pages[nr_pages - 1]) == folio) {

I really don't like arbitrary GUP users starting to special-case hugetlb
folios, and making assumptions about what other pages they pinned look like (IOW,
what the page table mappings actually looked like).

Ideally, we'd have a pin_user_pages_fast() variant that would give you a list of
folio ranges instead of individual pages.

Seeing GUP users open-code that (and special-casing on hugetlb) is a warning sign.

Assume we have a PMD-mapped (or even pte-mapped) THP, we would want the exact
same performance speedup. GUP knows that stuff belongs to the same folio, just
needs to communicate that information back in a better way.

So a no from my side.

-- 
Cheers,

David

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-09 18:36 ` David Hildenbrand (Arm)
@ 2026-06-10  6:16   ` Christoph Hellwig
  2026-06-10  9:54     ` David Hildenbrand (Arm)
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2026-06-10  6:16 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: sw.prabhu6, axboe, io-uring, linux-kernel, dave, dongjoo.seo1,
	Swarna Prabhu

On Tue, Jun 09, 2026 at 08:36:43PM +0200, David Hildenbrand (Arm) wrote:
> I really don't like arbitrary GUP users to starting to special case hugetlb
> folios, and making assumptions of how other pages they pinned look like (IOW,
> how the page table mappings actually looked like).

Me neither, but the current interfaces are kind of forcing them :P

> 
> Ideally, we'd have a pin_user_pages_fast() variant that would give you a list of
> folio ranges instead of individual pages.

Yes.  iov_iter_extract_bvecs and thus the block direct I/O fast path
would instantly benefit from that.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-10  6:16   ` Christoph Hellwig
@ 2026-06-10  9:54     ` David Hildenbrand (Arm)
  2026-06-10 11:34       ` Christoph Hellwig
  0 siblings, 1 reply; 10+ messages in thread
From: David Hildenbrand (Arm) @ 2026-06-10  9:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: sw.prabhu6, axboe, io-uring, linux-kernel, dave, dongjoo.seo1,
	Swarna Prabhu, linux-mm@kvack.org, Matthew Wilcox, Zi Yan

On 6/10/26 08:16, Christoph Hellwig wrote:
> On Tue, Jun 09, 2026 at 08:36:43PM +0200, David Hildenbrand (Arm) wrote:
>> I really don't like arbitrary GUP users to starting to special case hugetlb
>> folios, and making assumptions of how other pages they pinned look like (IOW,
>> how the page table mappings actually looked like).
> 
> Me neither, but the current interfaces are kind forcing them :P

Yeah :)

But general rule: if you're outside of MM core and test for hugetlb folios, you
are doing something very wrong.

> 
>>
>> Ideally, we'd have a pin_user_pages_fast() variant that would give you a list of
>> folio ranges instead of individual pages.
> 
> Yes.  iov_iter_extract_bvecs and thus the block direct I/O fast path
> would instantly benefit from that.
The tricky bit for such an interface is that, soon, some pages won't be folios,
but we could still end up with non-folio pages in the address space (e.g.,
vm_insert_page()) and have to pin+return them. So using folios is not future-proof.

There are some long-term plans on providing an interface that would abstract how
you refcount something you GUP'ed. (because, some pages we GUP in the future
might not even have a dedicated refcount, all still fairly unclear). But it's
all not really finalized I think.

For now, we could expose a folio+page/offset+nr_pages interface, where we,
long-term, would not be able to return non-folio pages (e.g., vm_insert_page())
and would instead, in the future, fail the request if we stumble over a
non-folio thing in the page tables. That sounds reasonable for now.

Another solution would be, exposing page-ranges (e.g., page + nr_pages), whereby
we'd say, that all pages in a range belong to the same compound page, and that
we took a single reference for all pages in the range. IOW, page_folio() would
for now be the same for all pages in a range.

As soon as some mapped pages are no longer folios, we'll likely have to modify
plenty of drivers either way, that blindly cast pages to folios ...

So maybe a folio-range based interface is good enough for now.

-- 
Cheers,

David

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-10  9:54     ` David Hildenbrand (Arm)
@ 2026-06-10 11:34       ` Christoph Hellwig
  2026-06-10 13:18         ` David Hildenbrand (Arm)
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2026-06-10 11:34 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Christoph Hellwig, sw.prabhu6, axboe, io-uring, linux-kernel,
	dave, dongjoo.seo1, Swarna Prabhu, linux-mm@kvack.org,
	Matthew Wilcox, Zi Yan

On Wed, Jun 10, 2026 at 11:54:01AM +0200, David Hildenbrand (Arm) wrote:
> > Yes.  iov_iter_extract_bvecs and thus the block direct I/O fast path
> > would instantly benefit from that.
> The tricky bit for such an interface is that, soon, some pages won't be folios,
> but we could still end up with non-folio pages in the address space (e.g.,
> vm_insert_page()) and have to pin+return them. So using folios is not future-proof.

I'm still doubtful on the "soon" because of all the issues like this
in the I/O path.

> There are some long-term plans on providing an interface that would abstract how
> you refcount something you GUP'ed. (because, some pages we GUP in the future
> might not even have a dedicated refcount, all still fairly unclear). But it's
> all not really finalized I think.
> 
> For now, we could expose a folio+page/offset+nr_pages interface, where we,
> long-term, would not be able to return non-folio pages (e.g., vm_insert_page())
> and would instead, in the future, fail the request if we stumble over a
> non-folio thing in the page tables. That sounds reasonable for now.

I think whatever we're going to use for direct I/O has to also support
non-folio pages, especially PCI P2P memory.  So coming up with an
interface that support this ASAP would be helpful.

> Another solution would be, exposing page-ranges (e.g., page + nr_pages), whereby
> we'd say, that all pages in a range belong to the same compound page, and that
> we took a single reference for all pages in the range. IOW, page_folio() would
> for now be the same for all pages in a range.

This does sound like a reasonable short-term improvement.  One annoying
issue with returning only order 0 page in the current interfaces is
that it fills up the pages array in the caller for no good reason.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-10 11:34       ` Christoph Hellwig
@ 2026-06-10 13:18         ` David Hildenbrand (Arm)
  2026-06-10 18:10           ` Matthew Wilcox
  0 siblings, 1 reply; 10+ messages in thread
From: David Hildenbrand (Arm) @ 2026-06-10 13:18 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: sw.prabhu6, axboe, io-uring, linux-kernel, dave, dongjoo.seo1,
	Swarna Prabhu, linux-mm@kvack.org, Matthew Wilcox, Zi Yan

On 6/10/26 13:34, Christoph Hellwig wrote:
> On Wed, Jun 10, 2026 at 11:54:01AM +0200, David Hildenbrand (Arm) wrote:
>>> Yes.  iov_iter_extract_bvecs and thus the block direct I/O fast path
>>> would instantly benefit from that.
>> The tricky bit for such an interface is that, soon, some pages won't be folios,
>> but we could still end up with non-folio pages in the address space (e.g.,
>> vm_insert_page()) and have to pin+return them. So using folios is not future-proof.
> 
> I'm still doubtful on the "soon" beause of all the issues like this
> in the I/O path.

Yeah, there are a bunch of very hairy things.

> 
>> There are some long-term plans on providing an interface that would abstract how
>> you refcount something you GUP'ed. (because, some pages we GUP in the future
>> might not even have a dedicated refcount, all still fairly unclear). But it's
>> all not really finalized I think.
>>
>> For now, we could expose a folio+page/offset+nr_pages interface, where we,
>> long-term, would not be able to return non-folio pages (e.g., vm_insert_page())
>> and would instead, in the future, fail the request if we stumble over a
>> non-folio thing in the page tables. That sounds reasonable for now.
> 
> I think whatever we're going to use for direct I/O has to also support
> non-folio pages, especially PCI P2P memory.  So coming up with an
> interface that support this ASAP would be helpful.

Yes.

I think we can keep returning pages as long as the unpin interface knows the
right thing to do to unpin them.

> 
>> Another solution would be, exposing page-ranges (e.g., page + nr_pages), whereby
>> we'd say, that all pages in a range belong to the same compound page, and that
>> we took a single reference for all pages in the range. IOW, page_folio() would
>> for now be the same for all pages in a range.
> 
> This does sound like a reasonable short-term improvement.
Right, and as long as callers don't cast the returned thing to a folio, it would
be future proof. But I guess quite some GUP users cast to folios.

Would there be users for a new interface that returns page ranges as described
above, that would want to still unpin stuff partially? E.g., we give them a page
range that belongs to the same folio with only a single pin/reference, but they
would want to logically split that range and unpin pages individually?

-- 
Cheers,

David

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-10 13:18         ` David Hildenbrand (Arm)
@ 2026-06-10 18:10           ` Matthew Wilcox
  2026-06-10 18:45             ` David Hildenbrand (Arm)
  0 siblings, 1 reply; 10+ messages in thread
From: Matthew Wilcox @ 2026-06-10 18:10 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Christoph Hellwig, sw.prabhu6, axboe, io-uring, linux-kernel,
	dave, dongjoo.seo1, Swarna Prabhu, linux-mm@kvack.org, Zi Yan

On Wed, Jun 10, 2026 at 03:18:52PM +0200, David Hildenbrand (Arm) wrote:
> On 6/10/26 13:34, Christoph Hellwig wrote:
> > On Wed, Jun 10, 2026 at 11:54:01AM +0200, David Hildenbrand (Arm) wrote:
> >> There are some long-term plans on providing an interface that would abstract how
> >> you refcount something you GUP'ed. (because, some pages we GUP in the future
> >> might not even have a dedicated refcount, all still fairly unclear). But it's
> >> all not really finalized I think.
> >>
> >> For now, we could expose a folio+page/offset+nr_pages interface, where we,
> >> long-term, would not be able to return non-folio pages (e.g., vm_insert_page())
> >> and would instead, in the future, fail the request if we stumble over a
> >> non-folio thing in the page tables. That sounds reasonable for now.
> > 
> > I think whatever we're going to use for direct I/O has to also support
> > non-folio pages, especially PCI P2P memory.  So coming up with an
> > interface that support this ASAP would be helpful.
> 
> Yes.
> 
> I think we can keep returning pages as long a the unpin interface knows the
> right thing to do to unpin them.

This would be the get_user_phyrs() interface I've talked about before.

https://lore.kernel.org/all/ZbVO2RKhw-dLUMvf@casper.infradead.org/
and the long thread:
https://lore.kernel.org/all/YdyKWeU0HTv8m7wD@casper.infradead.org/

> Would there be users for a new interface that returns page ranges as described
> above, that would want to still unpin stuff partially? E.g., we give them a page
> range that belongs to the same folio with only a single pin/reference, but they
> would want to logically split that range and unpin pages individually?

Urgh, no, we shouldn't do that.  ranges should be pinned / unpinned
as a whole.  I'm sympathetic to "for this special operation we need to
create a new range from this existing range and adjust the refcount(s)
appropriately so each of the two ranges can be put separately", but
I'm not sympathetic to "we need to allow each page to be individually
refcounted".

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration
  2026-06-10 18:10           ` Matthew Wilcox
@ 2026-06-10 18:45             ` David Hildenbrand (Arm)
  0 siblings, 0 replies; 10+ messages in thread
From: David Hildenbrand (Arm) @ 2026-06-10 18:45 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Christoph Hellwig, sw.prabhu6, axboe, io-uring, linux-kernel,
	dave, dongjoo.seo1, Swarna Prabhu, linux-mm@kvack.org, Zi Yan

On 6/10/26 20:10, Matthew Wilcox wrote:
> On Wed, Jun 10, 2026 at 03:18:52PM +0200, David Hildenbrand (Arm) wrote:
>> On 6/10/26 13:34, Christoph Hellwig wrote:
>>>
>>> I think whatever we're going to use for direct I/O has to also support
>>> non-folio pages, especially PCI P2P memory.  So coming up with an
>>> interface that support this ASAP would be helpful.
>>
>> Yes.
>>
>> I think we can keep returning pages as long a the unpin interface knows the
>> right thing to do to unpin them.
> 
> This would be the get_user_phyrs() interface I've talked about before.
> 
> https://lore.kernel.org/all/ZbVO2RKhw-dLUMvf@casper.infradead.org/
> and the long thread:
> https://lore.kernel.org/all/YdyKWeU0HTv8m7wD@casper.infradead.org/
> 
>> Would there be users for a new interface that returns page ranges as described
>> above, that would want to still unpin stuff partially? E.g., we give them a page
>> range that belongs to the same folio with only a single pin/reference, but they
>> would want to logically split that range and unpin pages individually?
> 
> Urgh, no, we shouldn't do that.  ranges should be pinned / unpinned
> as a whole.  I'm sympathetic to "for this special operation we need to
> create a new range from this existing range and adjust the refcount(s)
> appropriately so each of the two rangees can be put separately", but
> I'm not sympathetic to "we need to allow each page to be individually
> refcounted".

Yes, me too. I wanted to understand if that is a common thing to happen for users,
such that we would have to worry about it right from the start.

-- 
Cheers,

David

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-06-10 18:45 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-08  6:29 [RFC v1] io_uring/rsrc: add fast path huge page handling in buffer registration sw.prabhu6
2026-06-08 15:57 ` Jens Axboe
2026-06-09  2:18   ` Swarna Prabhu
2026-06-09 18:36 ` David Hildenbrand (Arm)
2026-06-10  6:16   ` Christoph Hellwig
2026-06-10  9:54     ` David Hildenbrand (Arm)
2026-06-10 11:34       ` Christoph Hellwig
2026-06-10 13:18         ` David Hildenbrand (Arm)
2026-06-10 18:10           ` Matthew Wilcox
2026-06-10 18:45             ` David Hildenbrand (Arm)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox