[PATCH 1/8] io_uring: don't allow discontig pages for IORING_SETUP_NO

public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/8] io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP
       [not found] <20231130194633.649319-1-axboe@kernel.dk>
@ 2023-11-30 19:45 ` Jens Axboe
  2023-11-30 19:45 ` [PATCH 2/8] io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP Jens Axboe
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2023-11-30 19:45 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, stable, Jann Horn

io_sqes_map() is used rather than io_mem_alloc(), if the application
passes in memory for mapping rather than have the kernel allocate it and
then mmap(2) the ranges. This then calls __io_uaddr_map() to perform the
page mapping and pinning, which checks if we end up with the same pages,
if more than one page is mapped. But this check is incorrect and only
checks if the first and last pages are the same, where it really should
be checking if the mapped pages are contigous. This allows mapping a
single normal page, or a huge page range.

Down the line we can add support for remapping pages to be virtually
contigous, which is really all that io_uring cares about.

Cc: stable@vger.kernel.org
Fixes: 03d89a2de25b ("io_uring: support for user allocated memory for rings/sqes")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ed254076c723..b45abfd75415 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2697,6 +2697,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 {
 	struct page **page_array;
 	unsigned int nr_pages;
+	void *page_addr;
 	int ret, i;
 
 	*npages = 0;
@@ -2718,27 +2719,29 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 		io_pages_free(&page_array, ret > 0 ? ret : 0);
 		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
 	}
-	/*
-	 * Should be a single page. If the ring is small enough that we can
-	 * use a normal page, that is fine. If we need multiple pages, then
-	 * userspace should use a huge page. That's the only way to guarantee
-	 * that we get contigious memory, outside of just being lucky or
-	 * (currently) having low memory fragmentation.
-	 */
-	if (page_array[0] != page_array[ret - 1])
-		goto err;
 
-	/*
-	 * Can't support mapping user allocated ring memory on 32-bit archs
-	 * where it could potentially reside in highmem. Just fail those with
-	 * -EINVAL, just like we did on kernels that didn't support this
-	 * feature.
-	 */
+	page_addr = page_address(page_array[0]);
 	for (i = 0; i < nr_pages; i++) {
-		if (PageHighMem(page_array[i])) {
-			ret = -EINVAL;
+		ret = -EINVAL;
+
+		/*
+		 * Can't support mapping user allocated ring memory on 32-bit
+		 * archs where it could potentially reside in highmem. Just
+		 * fail those with -EINVAL, just like we did on kernels that
+		 * didn't support this feature.
+		 */
+		if (PageHighMem(page_array[i]))
 			goto err;
-		}
+
+		/*
+		 * No support for discontig pages for now, should either be a
+		 * single normal page, or a huge page. Later on we can add
+		 * support for remapping discontig pages, for now we will
+		 * just fail them with EINVAL.
+		 */
+		if (page_address(page_array[i]) != page_addr)
+			goto err;
+		page_addr += PAGE_SIZE;
 	}
 
 	*pages = page_array;
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/8] io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP
       [not found] <20231130194633.649319-1-axboe@kernel.dk>
  2023-11-30 19:45 ` [PATCH 1/8] io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP Jens Axboe
@ 2023-11-30 19:45 ` Jens Axboe
  2023-11-30 19:45 ` [PATCH 4/8] io_uring/kbuf: defer release of mapped buffer rings Jens Axboe
  2023-11-30 19:45 ` [PATCH 7/8] io_uring: free io_buffer_list entries via RCU Jens Axboe
  3 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2023-11-30 19:45 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, stable

This flag only applies to the SQ and CQ rings, it's perfectly valid
to use a mmap approach for the provided ring buffers. Move the
check into where it belongs.

Cc: stable@vger.kernel.org
Fixes: 03d89a2de25b ("io_uring: support for user allocated memory for rings/sqes")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b45abfd75415..52e4b14ad8aa 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3478,16 +3478,18 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	struct page *page;
 	void *ptr;
 
-	/* Don't allow mmap if the ring was setup without it */
-	if (ctx->flags & IORING_SETUP_NO_MMAP)
-		return ERR_PTR(-EINVAL);
-
 	switch (offset & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
+		/* Don't allow mmap if the ring was setup without it */
+		if (ctx->flags & IORING_SETUP_NO_MMAP)
+			return ERR_PTR(-EINVAL);
 		ptr = ctx->rings;
 		break;
 	case IORING_OFF_SQES:
+		/* Don't allow mmap if the ring was setup without it */
+		if (ctx->flags & IORING_SETUP_NO_MMAP)
+			return ERR_PTR(-EINVAL);
 		ptr = ctx->sq_sqes;
 		break;
 	case IORING_OFF_PBUF_RING: {
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 4/8] io_uring/kbuf: defer release of mapped buffer rings
       [not found] <20231130194633.649319-1-axboe@kernel.dk>
  2023-11-30 19:45 ` [PATCH 1/8] io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP Jens Axboe
  2023-11-30 19:45 ` [PATCH 2/8] io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP Jens Axboe
@ 2023-11-30 19:45 ` Jens Axboe
  2023-11-30 19:45 ` [PATCH 7/8] io_uring: free io_buffer_list entries via RCU Jens Axboe
  3 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2023-11-30 19:45 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, stable, Jann Horn

If a provided buffer ring is setup with IOU_PBUF_RING_MMAP, then the
kernel allocates the memory for it and the application is expected to
mmap(2) this memory. However, io_uring uses remap_pfn_range() for this
operation, so we cannot rely on normal munmap/release on freeing them
for us.

Stash an io_buf_free entry away for each of these, if any, and provide
a helper to free them post ->release().

Cc: stable@vger.kernel.org
Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 io_uring/io_uring.c            |  2 ++
 io_uring/kbuf.c                | 44 ++++++++++++++++++++++++++++++----
 io_uring/kbuf.h                |  2 ++
 4 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d3009d56af0b..805bb635cdf5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -340,6 +340,9 @@ struct io_ring_ctx {
 
 	struct list_head	io_buffers_cache;
 
+	/* deferred free list, protected by ->uring_lock */
+	struct hlist_head	io_buf_list;
+
 	/* Keep this last, we don't need it for the fast path */
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e40b11438210..3a216f0744dd 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
+	INIT_HLIST_HEAD(&ctx->io_buf_list);
 	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
 			    sizeof(struct io_rsrc_node));
 	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2950,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 	io_rings_free(ctx);
+	io_kbuf_mmap_list_free(ctx);
 
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index a1e4239c7d75..85e680fc74ce 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -33,6 +33,11 @@ struct io_provide_buf {
 	__u16				bid;
 };
 
+struct io_buf_free {
+	struct hlist_node		list;
+	void				*mem;
+};
+
 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 							unsigned int bgid)
 {
@@ -223,7 +228,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	if (bl->is_mapped) {
 		i = bl->buf_ring->tail - bl->head;
 		if (bl->is_mmap) {
-			folio_put(virt_to_folio(bl->buf_ring));
+			/*
+			 * io_kbuf_list_free() will free the page(s) at
+			 * ->release() time.
+			 */
 			bl->buf_ring = NULL;
 			bl->is_mmap = 0;
 		} else if (bl->buf_nr_pages) {
@@ -531,18 +539,28 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	return -EINVAL;
 }
 
-static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
+			      struct io_uring_buf_reg *reg,
 			      struct io_buffer_list *bl)
 {
-	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	struct io_buf_free *ibf;
 	size_t ring_size;
 	void *ptr;
 
 	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
+	ptr = io_mem_alloc(ring_size);
 	if (!ptr)
 		return -ENOMEM;
 
+	/* Allocate and store deferred free entry */
+	ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
+	if (!ibf) {
+		io_mem_free(ptr);
+		return -ENOMEM;
+	}
+	ibf->mem = ptr;
+	hlist_add_head(&ibf->list, &ctx->io_buf_list);
+
 	bl->buf_ring = ptr;
 	bl->is_mapped = 1;
 	bl->is_mmap = 1;
@@ -599,7 +617,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (!(reg.flags & IOU_PBUF_RING_MMAP))
 		ret = io_pin_pbuf_ring(&reg, bl);
 	else
-		ret = io_alloc_pbuf_ring(&reg, bl);
+		ret = io_alloc_pbuf_ring(ctx, &reg, bl);
 
 	if (!ret) {
 		bl->nr_entries = reg.ring_entries;
@@ -649,3 +667,19 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 
 	return bl->buf_ring;
 }
+
+/*
+ * Called at or after ->release(), free the mmap'ed buffers that we used
+ * for memory mapped provided buffer rings.
+ */
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+{
+	struct io_buf_free *ibf;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
+		hlist_del(&ibf->list);
+		io_mem_free(ibf->mem);
+		kfree(ibf);
+	}
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index f2d615236b2c..6c7646e6057c 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -51,6 +51,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
+
 unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 7/8] io_uring: free io_buffer_list entries via RCU
       [not found] <20231130194633.649319-1-axboe@kernel.dk>
                   ` (2 preceding siblings ...)
  2023-11-30 19:45 ` [PATCH 4/8] io_uring/kbuf: defer release of mapped buffer rings Jens Axboe
@ 2023-11-30 19:45 ` Jens Axboe
  3 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2023-11-30 19:45 UTC (permalink / raw)
  To: io-uring; +Cc: Jens Axboe, stable

mmap_lock nests under uring_lock out of necessity, as we may be doing
user copies with uring_lock held. However, for mmap of provided buffer
rings, we attempt to grab uring_lock with mmap_lock already held from
do_mmap(). This makes lockdep, rightfully, complain:

WARNING: possible circular locking dependency detected
6.7.0-rc1-00009-gff3337ebaf94-dirty #4438 Not tainted
------------------------------------------------------
buf-ring.t/442 is trying to acquire lock:
ffff00020e1480a8 (&ctx->uring_lock){+.+.}-{3:3}, at: io_uring_validate_mmap_request.isra.0+0x4c/0x140

but task is already holding lock:
ffff0000dc226190 (&mm->mmap_lock){++++}-{3:3}, at: vm_mmap_pgoff+0x124/0x264

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&mm->mmap_lock){++++}-{3:3}:
       __might_fault+0x90/0xbc
       io_register_pbuf_ring+0x94/0x488
       __arm64_sys_io_uring_register+0x8dc/0x1318
       invoke_syscall+0x5c/0x17c
       el0_svc_common.constprop.0+0x108/0x130
       do_el0_svc+0x2c/0x38
       el0_svc+0x4c/0x94
       el0t_64_sync_handler+0x118/0x124
       el0t_64_sync+0x168/0x16c

-> #0 (&ctx->uring_lock){+.+.}-{3:3}:
       __lock_acquire+0x19a0/0x2d14
       lock_acquire+0x2e0/0x44c
       __mutex_lock+0x118/0x564
       mutex_lock_nested+0x20/0x28
       io_uring_validate_mmap_request.isra.0+0x4c/0x140
       io_uring_mmu_get_unmapped_area+0x3c/0x98
       get_unmapped_area+0xa4/0x158
       do_mmap+0xec/0x5b4
       vm_mmap_pgoff+0x158/0x264
       ksys_mmap_pgoff+0x1d4/0x254
       __arm64_sys_mmap+0x80/0x9c
       invoke_syscall+0x5c/0x17c
       el0_svc_common.constprop.0+0x108/0x130
       do_el0_svc+0x2c/0x38
       el0_svc+0x4c/0x94
       el0t_64_sync_handler+0x118/0x124
       el0t_64_sync+0x168/0x16c

From that mmap(2) path, we really just need to ensure that the buffer
list doesn't go away from underneath us. For the lower indexed entries,
they never go away until the ring is freed and we can always sanely
reference those as long as the caller has a file reference. For the
higher indexed ones in our xarray, we just need to ensure that the
buffer list remains valid while we return the address of it.

Free the higher indexed io_buffer_list entries via RCU. With that we can
avoid needing ->uring_lock inside mmap(2), and simply hold the RCU read
lock around the buffer list lookup and address check.

To ensure that the arrayed lookup either returns a valid fully formulated
entry via RCU lookup, add an 'is_ready' flag that we access with store
and release memory ordering. This isn't needed for the xarray lookups,
but doesn't hurt either. Since this isn't a fast path, retain it across
both types. Similarly, for the allocated array inside the ctx, ensure
we use the proper load/acquire as setup could in theory be running in
parallel with mmap.

While in there, add a few lockdep checks for documentation purposes.

Cc: stable@vger.kernel.org
Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c |  4 +--
 io_uring/kbuf.c     | 64 ++++++++++++++++++++++++++++++++++++---------
 io_uring/kbuf.h     |  3 +++
 3 files changed, 56 insertions(+), 15 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3a216f0744dd..05f933dddfde 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3498,9 +3498,9 @@ static void *io_uring_validate_mmap_request(struct file *file,
 		unsigned int bgid;
 
 		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-		mutex_lock(&ctx->uring_lock);
+		rcu_read_lock();
 		ptr = io_pbuf_get_address(ctx, bgid);
-		mutex_unlock(&ctx->uring_lock);
+		rcu_read_unlock();
 		if (!ptr)
 			return ERR_PTR(-EINVAL);
 		break;
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 39d15a27eb92..268788305b61 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -40,19 +40,35 @@ struct io_buf_free {
 	int				inuse;
 };
 
+static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+						   struct io_buffer_list *bl,
+						   unsigned int bgid)
+{
+	if (bl && bgid < BGID_ARRAY)
+		return &bl[bgid];
+
+	return xa_load(&ctx->io_bl_xa, bgid);
+}
+
 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 							unsigned int bgid)
 {
-	if (ctx->io_bl && bgid < BGID_ARRAY)
-		return &ctx->io_bl[bgid];
+	lockdep_assert_held(&ctx->uring_lock);
 
-	return xa_load(&ctx->io_bl_xa, bgid);
+	return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
 }
 
 static int io_buffer_add_list(struct io_ring_ctx *ctx,
 			      struct io_buffer_list *bl, unsigned int bgid)
 {
+	/*
+	 * Store buffer group ID and finally mark the list as visible.
+	 * The normal lookup doesn't care about the visibility as we're
+	 * always under the ->uring_lock, but the RCU lookup from mmap does.
+	 */
 	bl->bgid = bgid;
+	smp_store_release(&bl->is_ready, 1);
+
 	if (bgid < BGID_ARRAY)
 		return 0;
 
@@ -203,18 +219,19 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 
 static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
 {
+	struct io_buffer_list *bl;
 	int i;
 
-	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
-				GFP_KERNEL);
-	if (!ctx->io_bl)
+	bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
+	if (!bl)
 		return -ENOMEM;
 
 	for (i = 0; i < BGID_ARRAY; i++) {
-		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
-		ctx->io_bl[i].bgid = i;
+		INIT_LIST_HEAD(&bl[i].buf_list);
+		bl[i].bgid = i;
 	}
 
+	smp_store_release(&ctx->io_bl, bl);
 	return 0;
 }
 
@@ -303,7 +320,7 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 	xa_for_each(&ctx->io_bl_xa, index, bl) {
 		xa_erase(&ctx->io_bl_xa, bl->bgid);
 		__io_remove_buffers(ctx, bl, -1U);
-		kfree(bl);
+		kfree_rcu(bl, rcu);
 	}
 
 	/*
@@ -497,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 		INIT_LIST_HEAD(&bl->buf_list);
 		ret = io_buffer_add_list(ctx, bl, p->bgid);
 		if (ret) {
-			kfree(bl);
+			/*
+			 * Doesn't need rcu free as it was never visible, but
+			 * let's keep it consistent throughout. Also can't
+			 * be a lower indexed array group, as adding one
+			 * where lookup failed cannot happen.
+			 */
+			if (p->bgid >= BGID_ARRAY)
+				kfree_rcu(bl, rcu);
+			else
+				WARN_ON_ONCE(1);
 			goto err;
 		}
 	}
@@ -636,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	struct io_buffer_list *bl, *free_bl = NULL;
 	int ret;
 
+	lockdep_assert_held(&ctx->uring_lock);
+
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
 
@@ -690,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 		return 0;
 	}
 
-	kfree(free_bl);
+	kfree_rcu(free_bl, rcu);
 	return ret;
 }
 
@@ -699,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	struct io_uring_buf_reg reg;
 	struct io_buffer_list *bl;
 
+	lockdep_assert_held(&ctx->uring_lock);
+
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
 	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
@@ -715,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	__io_remove_buffers(ctx, bl, -1U);
 	if (bl->bgid >= BGID_ARRAY) {
 		xa_erase(&ctx->io_bl_xa, bl->bgid);
-		kfree(bl);
+		kfree_rcu(bl, rcu);
 	}
 	return 0;
 }
@@ -724,7 +754,15 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 {
 	struct io_buffer_list *bl;
 
-	bl = io_buffer_get_list(ctx, bgid);
+	bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
+
+	/*
+	 * Ensure the list is fully setup. Only strictly needed for RCU lookup
+	 * via mmap, and in that case only for the array indexed groups. For
+	 * the xarray lookups, it's either visible and ready, or not at all.
+	 */
+	if (!smp_load_acquire(&bl->is_ready))
+		return NULL;
 	if (!bl || !bl->is_mmap)
 		return NULL;
 
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 6c7646e6057c..9be5960817ea 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -15,6 +15,7 @@ struct io_buffer_list {
 			struct page **buf_pages;
 			struct io_uring_buf_ring *buf_ring;
 		};
+		struct rcu_head rcu;
 	};
 	__u16 bgid;
 
@@ -28,6 +29,8 @@ struct io_buffer_list {
 	__u8 is_mapped;
 	/* ring mapped provided buffers, but mmap'ed by application */
 	__u8 is_mmap;
+	/* bl is visible from an RCU point of view for lookup */
+	__u8 is_ready;
 };
 
 struct io_buffer {
-- 
2.42.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-11-30 19:46 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <20231130194633.649319-1-axboe@kernel.dk>
2023-11-30 19:45 ` [PATCH 1/8] io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP Jens Axboe
2023-11-30 19:45 ` [PATCH 2/8] io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP Jens Axboe
2023-11-30 19:45 ` [PATCH 4/8] io_uring/kbuf: defer release of mapped buffer rings Jens Axboe
2023-11-30 19:45 ` [PATCH 7/8] io_uring: free io_buffer_list entries via RCU Jens Axboe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox