* [PATCHSET v3 0/4] Add support for ring resizing
@ 2024-10-24 17:07 Jens Axboe
2024-10-24 17:07 ` [PATCH 1/4] io_uring: move max entry definition and ring sizing into header Jens Axboe
` (3 more replies)
0 siblings, 4 replies; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 17:07 UTC (permalink / raw)
To: io-uring; +Cc: jannh
Hi,
Here's v3 of the ring resizing support. For the v1 posting and details,
look here:
https://lore.kernel.org/io-uring/20241022021159.820925-1-axboe@kernel.dk/T/#md3a2f049b0527592cc6d8ea25b46bde9fa8e5c68
include/uapi/linux/io_uring.h | 3 +
io_uring/io_uring.c | 84 +++++++------
io_uring/io_uring.h | 6 +
io_uring/memmap.c | 4 +
io_uring/register.c | 214 ++++++++++++++++++++++++++++++++++
5 files changed, 273 insertions(+), 38 deletions(-)
Since v2:
- Add a patch explicitly checking for valid rings at mmap time. More
of a documentation patch than anything; it doesn't fix any current or
future issues, but it makes it explicit that the rings have to be valid.
- Fix an issue with resizing when the sizes were identical, which
caused a cleared io_uring_params to be returned that liburing would
then use to re-mmap the rings. Ensure io_uring_params is filled in
before getting copied back, even if nothing was done.
- Fix an issue with SQPOLL: it needs to be quiesced before we can
proceed with the resize.
- Hold the mmap_sem and completion lock across the final part of
the operation, where state is copied and the rings swapped. This
prevents concurrent IO from accessing rings that are going away,
and mmap from seeing any NULL or going-away mappings until the
swap has been completed.
- Add some more comments.
--
Jens Axboe
* [PATCH 1/4] io_uring: move max entry definition and ring sizing into header
2024-10-24 17:07 [PATCHSET v3 0/4] Add support for ring resizing Jens Axboe
@ 2024-10-24 17:07 ` Jens Axboe
2024-10-24 17:07 ` [PATCH 2/4] io_uring: abstract out a bit of the ring filling logic Jens Axboe
` (2 subsequent siblings)
3 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 17:07 UTC (permalink / raw)
To: io-uring; +Cc: jannh, Jens Axboe
In preparation for needing this somewhere else, move the definitions
for the maximum CQ and SQ ring sizes into io_uring.h. Make the
rings_size() helper available as well, and have it take just the setup
flags argument rather than the full ring context pointer. That's all
that is needed.
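As a sketch of the new calling convention (hypothetical caller, given a
struct io_uring_params *p and the ring setup flags):

	size_t sq_array_offset;
	unsigned long size;

	/* combined rings allocation size; SIZE_MAX signals overflow */
	size = rings_size(p->flags, p->sq_entries, p->cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	/* with IORING_SETUP_NO_SQARRAY, sq_array_offset is set to SIZE_MAX */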
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/io_uring.c | 14 ++++++--------
io_uring/io_uring.h | 5 +++++
2 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 58b401900b41..6dea5242d666 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -105,9 +105,6 @@
#include "alloc_cache.h"
#include "eventfd.h"
-#define IORING_MAX_ENTRIES 32768
-#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -2667,8 +2664,8 @@ static void io_rings_free(struct io_ring_ctx *ctx)
ctx->sq_sqes = NULL;
}
-static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
- unsigned int cq_entries, size_t *sq_offset)
+unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
+ unsigned int cq_entries, size_t *sq_offset)
{
struct io_rings *rings;
size_t off, sq_array_size;
@@ -2676,7 +2673,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
off = struct_size(rings, cqes, cq_entries);
if (off == SIZE_MAX)
return SIZE_MAX;
- if (ctx->flags & IORING_SETUP_CQE32) {
+ if (flags & IORING_SETUP_CQE32) {
if (check_shl_overflow(off, 1, &off))
return SIZE_MAX;
}
@@ -2687,7 +2684,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return SIZE_MAX;
#endif
- if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
+ if (flags & IORING_SETUP_NO_SQARRAY) {
*sq_offset = SIZE_MAX;
return off;
}
@@ -3434,7 +3431,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
- size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
+ size = rings_size(ctx->flags, p->sq_entries, p->cq_entries,
+ &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 9cd9a127e9ed..4a471a810f02 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -65,6 +65,11 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
+#define IORING_MAX_ENTRIES 32768
+#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
+
+unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
+ unsigned int cq_entries, size_t *sq_offset);
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
--
2.45.2
* [PATCH 2/4] io_uring: abstract out a bit of the ring filling logic
2024-10-24 17:07 [PATCHSET v3 0/4] Add support for ring resizing Jens Axboe
2024-10-24 17:07 ` [PATCH 1/4] io_uring: move max entry definition and ring sizing into header Jens Axboe
@ 2024-10-24 17:07 ` Jens Axboe
2024-10-24 17:07 ` [PATCH 3/4] io_uring/memmap: explicitly return -EFAULT for mmap on NULL rings Jens Axboe
2024-10-24 17:07 ` [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS Jens Axboe
3 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 17:07 UTC (permalink / raw)
To: io-uring; +Cc: jannh, Jens Axboe
Abstract out an io_uring_fill_params() helper, which fills out the
necessary bits of struct io_uring_params. Add it to io_uring.h as well,
in preparation for having another internal user of it.
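As a rough sketch, the upcoming second user would call it the same way
the setup path now does (illustrative fragment only; it assumes a void
__user *arg from a register opcode, as the real caller added in a later
patch does):

	struct io_uring_params p;
	int ret;

	/* copy the request in from userspace first */
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;

	/* validates entry counts, rounds/clamps them, fills sq/cq offsets */
	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;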
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/io_uring.c | 70 ++++++++++++++++++++++++++-------------------
io_uring/io_uring.h | 1 +
2 files changed, 41 insertions(+), 30 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6dea5242d666..b5974bdad48b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3498,14 +3498,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
O_RDWR | O_CLOEXEC, NULL);
}
-static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
+int io_uring_fill_params(unsigned entries, struct io_uring_params *p)
{
- struct io_ring_ctx *ctx;
- struct io_uring_task *tctx;
- struct file *file;
- int ret;
-
if (!entries)
return -EINVAL;
if (entries > IORING_MAX_ENTRIES) {
@@ -3547,6 +3541,42 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
p->cq_entries = 2 * p->sq_entries;
}
+ p->sq_off.head = offsetof(struct io_rings, sq.head);
+ p->sq_off.tail = offsetof(struct io_rings, sq.tail);
+ p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
+ p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
+ p->sq_off.flags = offsetof(struct io_rings, sq_flags);
+ p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
+ p->sq_off.resv1 = 0;
+ if (!(p->flags & IORING_SETUP_NO_MMAP))
+ p->sq_off.user_addr = 0;
+
+ p->cq_off.head = offsetof(struct io_rings, cq.head);
+ p->cq_off.tail = offsetof(struct io_rings, cq.tail);
+ p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
+ p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
+ p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
+ p->cq_off.cqes = offsetof(struct io_rings, cqes);
+ p->cq_off.flags = offsetof(struct io_rings, cq_flags);
+ p->cq_off.resv1 = 0;
+ if (!(p->flags & IORING_SETUP_NO_MMAP))
+ p->cq_off.user_addr = 0;
+
+ return 0;
+}
+
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
+{
+ struct io_ring_ctx *ctx;
+ struct io_uring_task *tctx;
+ struct file *file;
+ int ret;
+
+ ret = io_uring_fill_params(entries, p);
+ if (unlikely(ret))
+ return ret;
+
ctx = io_ring_ctx_alloc(p);
if (!ctx)
return -ENOMEM;
@@ -3630,6 +3660,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
+ if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+ p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
+
ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
@@ -3638,29 +3671,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
- p->sq_off.head = offsetof(struct io_rings, sq.head);
- p->sq_off.tail = offsetof(struct io_rings, sq.tail);
- p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
- p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
- p->sq_off.flags = offsetof(struct io_rings, sq_flags);
- p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
- if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
- p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
- p->sq_off.resv1 = 0;
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- p->sq_off.user_addr = 0;
-
- p->cq_off.head = offsetof(struct io_rings, cq.head);
- p->cq_off.tail = offsetof(struct io_rings, cq.tail);
- p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
- p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
- p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
- p->cq_off.cqes = offsetof(struct io_rings, cqes);
- p->cq_off.flags = offsetof(struct io_rings, cq_flags);
- p->cq_off.resv1 = 0;
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- p->cq_off.user_addr = 0;
-
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 4a471a810f02..e3e6cb14de5d 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -70,6 +70,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset);
+int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
--
2.45.2
* [PATCH 3/4] io_uring/memmap: explicitly return -EFAULT for mmap on NULL rings
2024-10-24 17:07 [PATCHSET v3 0/4] Add support for ring resizing Jens Axboe
2024-10-24 17:07 ` [PATCH 1/4] io_uring: move max entry definition and ring sizing into header Jens Axboe
2024-10-24 17:07 ` [PATCH 2/4] io_uring: abstract out a bit of the ring filling logic Jens Axboe
@ 2024-10-24 17:07 ` Jens Axboe
2024-10-24 17:07 ` [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS Jens Axboe
3 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 17:07 UTC (permalink / raw)
To: io-uring; +Cc: jannh, Jens Axboe
The later mapping will actually check this too, but in terms of code
clarity, explicitly check whether or not the rings and sqes are
valid during validation. That makes it explicit that if they are
non-NULL, they are valid and can get mapped.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/memmap.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index a0f32a255fd1..d614824e17bd 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -204,11 +204,15 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
+ if (!ctx->rings)
+ return ERR_PTR(-EFAULT);
return ctx->rings;
case IORING_OFF_SQES:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
+ if (!ctx->sq_sqes)
+ return ERR_PTR(-EFAULT);
return ctx->sq_sqes;
case IORING_OFF_PBUF_RING: {
struct io_buffer_list *bl;
--
2.45.2
* [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 17:07 [PATCHSET v3 0/4] Add support for ring resizing Jens Axboe
` (2 preceding siblings ...)
2024-10-24 17:07 ` [PATCH 3/4] io_uring/memmap: explicitly return -EFAULT for mmap on NULL rings Jens Axboe
@ 2024-10-24 17:07 ` Jens Axboe
2024-10-24 18:13 ` Jann Horn
3 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 17:07 UTC (permalink / raw)
To: io-uring; +Cc: jannh, Jens Axboe
Once a ring has been created, the sizes of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the number of requests that can be submitted in a single system call,
and there's rarely a need to change that.
For the CQ ring, it's a different story. For the most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst-case scenario, which can
be wasteful.
Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.
Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flags associated with how the CQ ring is sized and clamped.
Existing unconsumed SQE and CQE entries are copied as part of the
process. If either resized destination ring cannot hold the entries
already present in the source ring, the operation fails with
-EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as
well across moving CQ ring state.
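From userspace, invoking the new opcode could look like the following
minimal sketch (raw syscall against the uapi addition below; error
handling elided, entry counts illustrative):

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* grow (or shrink) the rings of an existing ring fd */
static int resize_rings(int ring_fd, unsigned sq_entries, unsigned cq_entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.sq_entries = sq_entries;
	p.cq_entries = cq_entries;
	p.flags = IORING_SETUP_CQSIZE;	/* only sizing/clamp flags allowed */

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_RESIZE_RINGS, &p, 1);
}

On success, the updated io_uring_params is copied back to userspace, and
the application (or liburing) uses the returned offsets to re-mmap the
rings.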
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
include/uapi/linux/io_uring.h | 3 +
io_uring/register.c | 214 ++++++++++++++++++++++++++++++++++
2 files changed, 217 insertions(+)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 86cb385fe0b5..c4737892c7cd 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -615,6 +615,9 @@ enum io_uring_register_op {
/* send MSG_RING without having a ring */
IORING_REGISTER_SEND_MSG_RING = 31,
+ /* resize CQ ring */
+ IORING_REGISTER_RESIZE_RINGS = 33,
+
/* this goes last */
IORING_REGISTER_LAST,
diff --git a/io_uring/register.c b/io_uring/register.c
index 52b2f9b74af8..911242f63704 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -29,6 +29,7 @@
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
+#include "memmap.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -361,6 +362,213 @@ static int io_register_clock(struct io_ring_ctx *ctx,
return 0;
}
+/*
+ * State to maintain until we can swap. Both new and old state, used for
+ * either mapping or freeing.
+ */
+struct io_ring_ctx_rings {
+ unsigned short n_ring_pages;
+ unsigned short n_sqe_pages;
+ struct page **ring_pages;
+ struct page **sqe_pages;
+ struct io_uring_sqe *sq_sqes;
+ struct io_rings *rings;
+};
+
+static void io_register_free_rings(struct io_uring_params *p,
+ struct io_ring_ctx_rings *r)
+{
+ if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+ io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
+ true);
+ io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
+ true);
+ } else {
+ io_pages_free(&r->ring_pages, r->n_ring_pages);
+ io_pages_free(&r->sqe_pages, r->n_sqe_pages);
+ vunmap(r->rings);
+ vunmap(r->sq_sqes);
+ }
+}
+
+#define swap_old(ctx, o, n, field) \
+ do { \
+ (o).field = (ctx)->field; \
+ (ctx)->field = (n).field; \
+ } while (0)
+
+#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
+#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
+ IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+
+static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
+ size_t size, sq_array_offset;
+ struct io_uring_params p;
+ unsigned i, tail;
+ void *ptr;
+ int ret;
+
+ /* for single issuer, must be owner resizing */
+ if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+ current != ctx->submitter_task)
+ return -EEXIST;
+ if (copy_from_user(&p, arg, sizeof(p)))
+ return -EFAULT;
+ if (p.flags & ~RESIZE_FLAGS)
+ return -EINVAL;
+
+ /* properties that are always inherited */
+ p.flags |= (ctx->flags & COPY_FLAGS);
+
+ ret = io_uring_fill_params(p.sq_entries, &p);
+ if (unlikely(ret))
+ return ret;
+
+ /* nothing to do, but copy params back */
+ if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
+ if (copy_to_user(arg, &p, sizeof(p)))
+ return -EFAULT;
+ return 0;
+ }
+
+ size = rings_size(p.flags, p.sq_entries, p.cq_entries,
+ &sq_array_offset);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ if (!(p.flags & IORING_SETUP_NO_MMAP))
+ n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
+ else
+ n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
+ p.cq_off.user_addr, size);
+ if (IS_ERR(n.rings))
+ return PTR_ERR(n.rings);
+
+ n.rings->sq_ring_mask = p.sq_entries - 1;
+ n.rings->cq_ring_mask = p.cq_entries - 1;
+ n.rings->sq_ring_entries = p.sq_entries;
+ n.rings->cq_ring_entries = p.cq_entries;
+
+ if (copy_to_user(arg, &p, sizeof(p))) {
+ io_register_free_rings(&p, &n);
+ return -EFAULT;
+ }
+
+ if (p.flags & IORING_SETUP_SQE128)
+ size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
+ else
+ size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
+ if (size == SIZE_MAX) {
+ io_register_free_rings(&p, &n);
+ return -EOVERFLOW;
+ }
+
+ if (!(p.flags & IORING_SETUP_NO_MMAP))
+ ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
+ else
+ ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
+ p.sq_off.user_addr,
+ size);
+ if (IS_ERR(ptr)) {
+ io_register_free_rings(&p, &n);
+ return PTR_ERR(ptr);
+ }
+
+ /*
+ * If using SQPOLL, park the thread
+ */
+ if (ctx->sq_data) {
+ mutex_unlock(&ctx->uring_lock);
+ io_sq_thread_park(ctx->sq_data);
+ mutex_lock(&ctx->uring_lock);
+ }
+
+ /*
+ * We'll do the swap. Clear out existing mappings to prevent mmap
+ * from seeing them, as we'll unmap them. Any attempt to mmap existing
+ * rings beyond this point will fail. Not that it could proceed at this
+ * point anyway, as we'll hold the mmap_sem until we've done the swap.
+ * Likewise, hold the completion lock over the duration of the actual
+ * swap.
+ */
+ mmap_write_lock(current->mm);
+ spin_lock(&ctx->completion_lock);
+ o.rings = ctx->rings;
+ ctx->rings = NULL;
+ o.sq_sqes = ctx->sq_sqes;
+ ctx->sq_sqes = NULL;
+
+ /*
+ * Now copy SQ and CQ entries, if any. If either of the destination
+ * rings can't hold what is already there, then fail the operation.
+ */
+ n.sq_sqes = ptr;
+ tail = o.rings->sq.tail;
+ if (tail - o.rings->sq.head > p.sq_entries)
+ goto overflow;
+ for (i = o.rings->sq.head; i < tail; i++) {
+ unsigned src_head = i & (ctx->sq_entries - 1);
+ unsigned dst_head = i & n.rings->sq_ring_mask;
+
+ n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+ }
+ n.rings->sq.head = o.rings->sq.head;
+ n.rings->sq.tail = o.rings->sq.tail;
+
+ tail = o.rings->cq.tail;
+ if (tail - o.rings->cq.head > p.cq_entries) {
+overflow:
+ /* restore old rings, and return -EOVERFLOW via cleanup path */
+ ctx->rings = o.rings;
+ ctx->sq_sqes = o.sq_sqes;
+ to_free = &n;
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ for (i = o.rings->cq.head; i < tail; i++) {
+ unsigned src_head = i & (ctx->cq_entries - 1);
+ unsigned dst_head = i & n.rings->cq_ring_mask;
+
+ n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+ }
+ n.rings->cq.head = o.rings->cq.head;
+ n.rings->cq.tail = o.rings->cq.tail;
+ /* invalidate cached cqe refill */
+ ctx->cqe_cached = ctx->cqe_sentinel = NULL;
+
+ n.rings->sq_dropped = o.rings->sq_dropped;
+ n.rings->sq_flags = o.rings->sq_flags;
+ n.rings->cq_flags = o.rings->cq_flags;
+ n.rings->cq_overflow = o.rings->cq_overflow;
+
+ /* all done, store old pointers and assign new ones */
+ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+ ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
+
+ ctx->sq_entries = p.sq_entries;
+ ctx->cq_entries = p.cq_entries;
+
+ ctx->rings = n.rings;
+ ctx->sq_sqes = n.sq_sqes;
+ swap_old(ctx, o, n, n_ring_pages);
+ swap_old(ctx, o, n, n_sqe_pages);
+ swap_old(ctx, o, n, ring_pages);
+ swap_old(ctx, o, n, sqe_pages);
+ to_free = &o;
+ ret = 0;
+out:
+ spin_unlock(&ctx->completion_lock);
+ mmap_write_unlock(current->mm);
+ io_register_free_rings(&p, to_free);
+
+ if (ctx->sq_data)
+ io_sq_thread_unpark(ctx->sq_data);
+
+ return ret;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -549,6 +757,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_clone_buffers(ctx, arg);
break;
+ case IORING_REGISTER_RESIZE_RINGS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_resize_rings(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
--
2.45.2
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 17:07 ` [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS Jens Axboe
@ 2024-10-24 18:13 ` Jann Horn
2024-10-24 19:50 ` Jens Axboe
0 siblings, 1 reply; 15+ messages in thread
From: Jann Horn @ 2024-10-24 18:13 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring
On Thu, Oct 24, 2024 at 7:08 PM Jens Axboe <axboe@kernel.dk> wrote:
> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
> the existing rings. It takes a struct io_uring_params argument, the same
> one which is used to setup the ring initially, and resizes rings
> according to the sizes given.
[...]
> + * We'll do the swap. Clear out existing mappings to prevent mmap
> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
> + * rings beyond this point will fail. Not that it could proceed at this
> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
> + * Likewise, hold the completion lock over the duration of the actual
> + * swap.
> + */
> + mmap_write_lock(current->mm);
Why does the mmap lock for current->mm suffice here? I see nothing in
io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 18:13 ` Jann Horn
@ 2024-10-24 19:50 ` Jens Axboe
2024-10-24 19:53 ` Jann Horn
0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 19:50 UTC (permalink / raw)
To: Jann Horn; +Cc: io-uring
On 10/24/24 12:13 PM, Jann Horn wrote:
> On Thu, Oct 24, 2024 at 7:08 PM Jens Axboe <axboe@kernel.dk> wrote:
>> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
>> the existing rings. It takes a struct io_uring_params argument, the same
>> one which is used to setup the ring initially, and resizes rings
>> according to the sizes given.
> [...]
>> + * We'll do the swap. Clear out existing mappings to prevent mmap
>> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
>> + * rings beyond this point will fail. Not that it could proceed at this
>> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
>> + * Likewise, hold the completion lock over the duration of the actual
>> + * swap.
>> + */
>> + mmap_write_lock(current->mm);
>
> Why does the mmap lock for current->mm suffice here? I see nothing in
> io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
Ehm does ->mmap() not hold ->mmap_sem already? I was under that
understanding. Obviously if it doesn't, then yeah this won't be enough.
Checked, and it does.
Ah I see what you mean now, task with different mm. But how would that
come about? The io_uring fd is CLOEXEC, and it can't get passed.
--
Jens Axboe
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 19:50 ` Jens Axboe
@ 2024-10-24 19:53 ` Jann Horn
2024-10-24 19:59 ` Jens Axboe
0 siblings, 1 reply; 15+ messages in thread
From: Jann Horn @ 2024-10-24 19:53 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring
On Thu, Oct 24, 2024 at 9:50 PM Jens Axboe <axboe@kernel.dk> wrote:
> On 10/24/24 12:13 PM, Jann Horn wrote:
> > On Thu, Oct 24, 2024 at 7:08?PM Jens Axboe <axboe@kernel.dk> wrote:
> >> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
> >> the existing rings. It takes a struct io_uring_params argument, the same
> >> one which is used to setup the ring initially, and resizes rings
> >> according to the sizes given.
> > [...]
> >> + * We'll do the swap. Clear out existing mappings to prevent mmap
> >> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
> >> + * rings beyond this point will fail. Not that it could proceed at this
> >> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
> >> + * Likewise, hold the completion lock over the duration of the actual
> >> + * swap.
> >> + */
> >> + mmap_write_lock(current->mm);
> >
> > Why does the mmap lock for current->mm suffice here? I see nothing in
> > io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
>
> Ehm does ->mmap() not hold ->mmap_sem already? I was under that
> understanding. Obviously if it doesn't, then yeah this won't be enough.
> Checked, and it does.
>
> Ah I see what you mean now, task with different mm. But how would that
> come about? The io_uring fd is CLOEXEC, and it can't get passed.
Yeah, that's what I meant, tasks with different mm. I think there are
a few ways to get the io_uring fd into a different task, the ones I
can immediately think of:
- O_CLOEXEC only applies on execve(), fork() should still inherit the fd
- O_CLOEXEC can be cleared via fcntl()
- you can use clone() to create two tasks that share FD tables
without sharing an mm
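As a minimal sketch of that last clone() case (illustrative and
untested):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <sys/wait.h>

static char child_stack[64 * 1024];

static int child_fn(void *arg)
{
	/* same fd table as the parent (the io_uring fd is visible here),
	 * but a separate mm_struct */
	return 0;
}

int main(void)
{
	/* CLONE_FILES shares the fd table; without CLONE_VM the child
	 * gets its own copy of the address space */
	pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
			  CLONE_FILES | SIGCHLD, NULL);

	if (pid > 0)
		waitpid(pid, NULL, 0);
	return 0;
}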
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 19:53 ` Jann Horn
@ 2024-10-24 19:59 ` Jens Axboe
2024-10-24 20:08 ` Jann Horn
0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 19:59 UTC (permalink / raw)
To: Jann Horn; +Cc: io-uring
On 10/24/24 1:53 PM, Jann Horn wrote:
> On Thu, Oct 24, 2024 at 9:50 PM Jens Axboe <axboe@kernel.dk> wrote:
>> On 10/24/24 12:13 PM, Jann Horn wrote:
>>> On Thu, Oct 24, 2024 at 7:08 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
>>>> the existing rings. It takes a struct io_uring_params argument, the same
>>>> one which is used to setup the ring initially, and resizes rings
>>>> according to the sizes given.
>>> [...]
>>>> + * We'll do the swap. Clear out existing mappings to prevent mmap
>>>> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
>>>> + * rings beyond this point will fail. Not that it could proceed at this
>>>> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
>>>> + * Likewise, hold the completion lock over the duration of the actual
>>>> + * swap.
>>>> + */
>>>> + mmap_write_lock(current->mm);
>>>
>>> Why does the mmap lock for current->mm suffice here? I see nothing in
>>> io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
>>
>> Ehm does ->mmap() not hold ->mmap_sem already? I was under that
>> understanding. Obviously if it doesn't, then yeah this won't be enough.
>> Checked, and it does.
>>
>> Ah I see what you mean now, task with different mm. But how would that
>> come about? The io_uring fd is CLOEXEC, and it can't get passed.
>
> Yeah, that's what I meant, tasks with different mm. I think there are
> a few ways to get the io_uring fd into a different task, the ones I
> can immediately think of:
>
> - O_CLOEXEC only applies on execve(), fork() should still inherit the fd
> - O_CLOEXEC can be cleared via fcntl()
> - you can use clone() to create two tasks that share FD tables
> without sharing an mm
OK good catch, yes then it won't be enough. Might just make sense to
exclude mmap separately, then. Thanks, I'll work on that for v4!
Probably trivial enough with just a separate mutex that's held over
mmap, and that register grabs too. Basically replacing mmap_write_lock()
on the resize side with ctx->mmap_lock, and grabbing that over
validate/mmap on the io_uring mmap side. Need to ponder the implications
of that in terms of how the locks nest, as you'd have mmap_sem ->
ctx->mmap_lock, but the other way around for resizing. But that should
just be for the pinning parts, which is why it does the mmap_sem
grabbing where it does right now.
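In lock ordering terms, that plan would look roughly like this (a sketch
of the intended nesting, with the pinning done before the new lock is
taken):

	/*
	 * mmap path:   mmap_sem (taken by the mm core) -> ctx->resize_lock
	 * resize path: ctx->uring_lock -> ctx->resize_lock -> completion_lock
	 *              (any mmap_sem work for pinning happens before this)
	 */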
--
Jens Axboe
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 19:59 ` Jens Axboe
@ 2024-10-24 20:08 ` Jann Horn
2024-10-24 20:25 ` Jens Axboe
0 siblings, 1 reply; 15+ messages in thread
From: Jann Horn @ 2024-10-24 20:08 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring
On Thu, Oct 24, 2024 at 9:59 PM Jens Axboe <axboe@kernel.dk> wrote:
> On 10/24/24 1:53 PM, Jann Horn wrote:
> > On Thu, Oct 24, 2024 at 9:50?PM Jens Axboe <axboe@kernel.dk> wrote:
> >> On 10/24/24 12:13 PM, Jann Horn wrote:
> >>> On Thu, Oct 24, 2024 at 7:08?PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
> >>>> the existing rings. It takes a struct io_uring_params argument, the same
> >>>> one which is used to setup the ring initially, and resizes rings
> >>>> according to the sizes given.
> >>> [...]
> >>>> + * We'll do the swap. Clear out existing mappings to prevent mmap
> >>>> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
> >>>> + * rings beyond this point will fail. Not that it could proceed at this
> >>>> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
> >>>> + * Likewise, hold the completion lock over the duration of the actual
> >>>> + * swap.
> >>>> + */
> >>>> + mmap_write_lock(current->mm);
> >>>
> >>> Why does the mmap lock for current->mm suffice here? I see nothing in
> >>> io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
> >>
> >> Ehm does ->mmap() not hold ->mmap_sem already? I was under that
> >> understanding. Obviously if it doesn't, then yeah this won't be enough.
> >> Checked, and it does.
> >>
> >> Ah I see what you mean now, task with different mm. But how would that
> >> come about? The io_uring fd is CLOEXEC, and it can't get passed.
> >
> > Yeah, that's what I meant, tasks with different mm. I think there are
> > a few ways to get the io_uring fd into a different task, the ones I
> > can immediately think of:
> >
> > - O_CLOEXEC only applies on execve(), fork() should still inherit the fd
> > - O_CLOEXEC can be cleared via fcntl()
> > - you can use clone() to create two tasks that share FD tables
> > without sharing an mm
>
> OK good catch, yes then it won't be enough. Might just make sense to
> exclude mmap separately, then. Thanks, I'll work on that for v4!
Yeah, that sounds reasonable to me.
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 20:08 ` Jann Horn
@ 2024-10-24 20:25 ` Jens Axboe
2024-10-24 20:32 ` Jann Horn
0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 20:25 UTC (permalink / raw)
To: Jann Horn; +Cc: io-uring
On 10/24/24 2:08 PM, Jann Horn wrote:
> On Thu, Oct 24, 2024 at 9:59 PM Jens Axboe <axboe@kernel.dk> wrote:
>> On 10/24/24 1:53 PM, Jann Horn wrote:
>>> On Thu, Oct 24, 2024 at 9:50 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>> On 10/24/24 12:13 PM, Jann Horn wrote:
>>>>> On Thu, Oct 24, 2024 at 7:08 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
>>>>>> the existing rings. It takes a struct io_uring_params argument, the same
>>>>>> one which is used to setup the ring initially, and resizes rings
>>>>>> according to the sizes given.
>>>>> [...]
>>>>>> + * We'll do the swap. Clear out existing mappings to prevent mmap
>>>>>> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
>>>>>> + * rings beyond this point will fail. Not that it could proceed at this
>>>>>> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
>>>>>> + * Likewise, hold the completion lock over the duration of the actual
>>>>>> + * swap.
>>>>>> + */
>>>>>> + mmap_write_lock(current->mm);
>>>>>
>>>>> Why does the mmap lock for current->mm suffice here? I see nothing in
>>>>> io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
>>>>
>>>> Ehm does ->mmap() not hold ->mmap_sem already? I was under that
>>>> understanding. Obviously if it doesn't, then yeah this won't be enough.
>>>> Checked, and it does.
>>>>
>>>> Ah I see what you mean now, task with different mm. But how would that
>>>> come about? The io_uring fd is CLOEXEC, and it can't get passed.
>>>
>>> Yeah, that's what I meant, tasks with different mm. I think there are
>>> a few ways to get the io_uring fd into a different task, the ones I
>>> can immediately think of:
>>>
>>> - O_CLOEXEC only applies on execve(), fork() should still inherit the fd
>>> - O_CLOEXEC can be cleared via fcntl()
>>> - you can use clone() to create two tasks that share FD tables
>>> without sharing an mm
>>
>> OK good catch, yes then it won't be enough. Might just make sense to
>> exclude mmap separately, then. Thanks, I'll work on that for v4!
>
> Yeah, that sounds reasonable to me.
Something like this should do it; it's really just replacing mmap_sem
with a ring-private lock. And since the ordering already had to deal
with uring_lock vs mmap_sem ABBA issues, this should slot straight in as
well.
Totally untested, will write a fork() test case as well for the
resize-rings test cases.
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 9746f4bb5182..2f12828b22a4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -423,6 +423,13 @@ struct io_ring_ctx {
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
+ /*
+ * Protection for resize vs mmap races - both the mmap and resize
+ * side will need to grab this lock, to prevent either side from
+ * being run concurrently with the other.
+ */
+ struct mutex resize_lock;
+
/*
* If IORING_SETUP_NO_MMAP is used, then the below holds
* the gup'ed pages for the two rings, and the sqes.
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d0b6ec8039cb..2863b957e373 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -353,6 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
io_napi_init(ctx);
+ mutex_init(&ctx->resize_lock);
return ctx;
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index d614824e17bd..fc777de2c229 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -251,6 +251,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
unsigned int npages;
void *ptr;
+ guard(mutex)(&ctx->resize_lock);
+
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
@@ -274,6 +276,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
+ struct io_ring_ctx *ctx = filp->private_data;
void *ptr;
/*
@@ -284,6 +287,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr)
return -EINVAL;
+ guard(mutex)(&ctx->resize_lock);
+
ptr = io_uring_validate_mmap_request(filp, pgoff, len);
if (IS_ERR(ptr))
return -ENOMEM;
@@ -329,8 +334,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
+	struct io_ring_ctx *ctx = file->private_data;
void *ptr;
+ guard(mutex)(&ctx->resize_lock);
+
ptr = io_uring_validate_mmap_request(file, pgoff, len);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
diff --git a/io_uring/register.c b/io_uring/register.c
index 0f6b18b3d3d1..1eb686eaa310 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -486,14 +486,15 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
}
/*
- * We'll do the swap. Clear out existing mappings to prevent mmap
- * from seeing them, as we'll unmap them. Any attempt to mmap existing
- * rings beyond this point will fail. Not that it could proceed at this
- * point anyway, as we'll hold the mmap_sem until we've done the swap.
- * Likewise, hold the completion lock over the duration of the actual
- * swap.
+ * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+ * any new mmaps on the ring fd. Clear out existing mappings to prevent
+ * mmap from seeing them, as we'll unmap them. Any attempt to mmap
+ * existing rings beyond this point will fail. Not that it could proceed
+ * at this point anyway, as the io_uring mmap side needs go grab the
+ * ctx->resize_lock as well. Likewise, hold the completion lock over the
+ * duration of the actual swap.
*/
- mmap_write_lock(current->mm);
+ mutex_lock(&ctx->resize_lock);
spin_lock(&ctx->completion_lock);
o.rings = ctx->rings;
ctx->rings = NULL;
@@ -560,7 +561,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
ret = 0;
out:
spin_unlock(&ctx->completion_lock);
- mmap_write_unlock(current->mm);
+ mutex_unlock(&ctx->resize_lock);
io_register_free_rings(&p, to_free);
if (ctx->sq_data)
--
Jens Axboe
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 20:25 ` Jens Axboe
@ 2024-10-24 20:32 ` Jann Horn
2024-10-24 21:08 ` Jens Axboe
0 siblings, 1 reply; 15+ messages in thread
From: Jann Horn @ 2024-10-24 20:32 UTC (permalink / raw)
To: Jens Axboe; +Cc: io-uring
On Thu, Oct 24, 2024 at 10:25 PM Jens Axboe <axboe@kernel.dk> wrote:
> On 10/24/24 2:08 PM, Jann Horn wrote:
> > On Thu, Oct 24, 2024 at 9:59 PM Jens Axboe <axboe@kernel.dk> wrote:
> >> On 10/24/24 1:53 PM, Jann Horn wrote:
> >>> On Thu, Oct 24, 2024 at 9:50 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>> On 10/24/24 12:13 PM, Jann Horn wrote:
> >>>>> On Thu, Oct 24, 2024 at 7:08 PM Jens Axboe <axboe@kernel.dk> wrote:
> >>>>>> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
> >>>>>> the existing rings. It takes a struct io_uring_params argument, the same
> >>>>>> one which is used to setup the ring initially, and resizes rings
> >>>>>> according to the sizes given.
> >>>>> [...]
> >>>>>> + * We'll do the swap. Clear out existing mappings to prevent mmap
> >>>>>> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
> >>>>>> + * rings beyond this point will fail. Not that it could proceed at this
> >>>>>> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
> >>>>>> + * Likewise, hold the completion lock over the duration of the actual
> >>>>>> + * swap.
> >>>>>> + */
> >>>>>> + mmap_write_lock(current->mm);
> >>>>>
> >>>>> Why does the mmap lock for current->mm suffice here? I see nothing in
> >>>>> io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
> >>>>
> >>>> Ehm does ->mmap() not hold ->mmap_sem already? I was under that
> >>>> understanding. Obviously if it doesn't, then yeah this won't be enough.
> >>>> Checked, and it does.
> >>>>
> >>>> Ah I see what you mean now, task with different mm. But how would that
> >>>> come about? The io_uring fd is CLOEXEC, and it can't get passed.
> >>>
> >>> Yeah, that's what I meant, tasks with different mm. I think there are
> >>> a few ways to get the io_uring fd into a different task, the ones I
> >>> can immediately think of:
> >>>
> >>> - O_CLOEXEC only applies on execve(), fork() should still inherit the fd
> >>> - O_CLOEXEC can be cleared via fcntl()
> >>> - you can use clone() to create two tasks that share FD tables
> >>> without sharing an mm
> >>
> >> OK good catch, yes then it won't be enough. Might just make sense to
> >> exclude mmap separately, then. Thanks, I'll work on that for v4!
> >
> > Yeah, that sounds reasonable to me.
>
> Something like this should do it, it's really just replacing mmap_sem
> with a ring private lock. And since the ordering already had to deal
> with uring_lock vs mmap_sem ABBA issues, this should slot straight in as
> well.
Looks good to me at a glance.
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 20:32 ` Jann Horn
@ 2024-10-24 21:08 ` Jens Axboe
2024-10-24 22:00 ` Jens Axboe
0 siblings, 1 reply; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 21:08 UTC (permalink / raw)
To: Jann Horn; +Cc: io-uring
On 10/24/24 2:32 PM, Jann Horn wrote:
> On Thu, Oct 24, 2024 at 10:25 PM Jens Axboe <axboe@kernel.dk> wrote:
>> On 10/24/24 2:08 PM, Jann Horn wrote:
>>> On Thu, Oct 24, 2024 at 9:59 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>> On 10/24/24 1:53 PM, Jann Horn wrote:
>>>>> On Thu, Oct 24, 2024 at 9:50 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>> On 10/24/24 12:13 PM, Jann Horn wrote:
>>>>>>> On Thu, Oct 24, 2024 at 7:08 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
>>>>>>>> the existing rings. It takes a struct io_uring_params argument, the same
>>>>>>>> one which is used to setup the ring initially, and resizes rings
>>>>>>>> according to the sizes given.
>>>>>>> [...]
>>>>>>>> + * We'll do the swap. Clear out existing mappings to prevent mmap
>>>>>>>> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
>>>>>>>> + * rings beyond this point will fail. Not that it could proceed at this
>>>>>>>> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
>>>>>>>> + * Likewise, hold the completion lock over the duration of the actual
>>>>>>>> + * swap.
>>>>>>>> + */
>>>>>>>> + mmap_write_lock(current->mm);
>>>>>>>
>>>>>>> Why does the mmap lock for current->mm suffice here? I see nothing in
>>>>>>> io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
>>>>>>
>>>>>> Ehm does ->mmap() not hold ->mmap_sem already? I was under that
>>>>>> understanding. Obviously if it doesn't, then yeah this won't be enough.
>>>>>> Checked, and it does.
>>>>>>
>>>>>> Ah I see what you mean now, task with different mm. But how would that
>>>>>> come about? The io_uring fd is CLOEXEC, and it can't get passed.
>>>>>
>>>>> Yeah, that's what I meant, tasks with different mm. I think there are
>>>>> a few ways to get the io_uring fd into a different task, the ones I
>>>>> can immediately think of:
>>>>>
>>>>> - O_CLOEXEC only applies on execve(), fork() should still inherit the fd
>>>>> - O_CLOEXEC can be cleared via fcntl()
>>>>> - you can use clone() to create two tasks that share FD tables
>>>>> without sharing an mm
>>>>
>>>> OK good catch, yes then it won't be enough. Might just make sense to
>>>> exclude mmap separately, then. Thanks, I'll work on that for v4!
>>>
>>> Yeah, that sounds reasonable to me.
>>
>> Something like this should do it, it's really just replacing mmap_sem
>> with a ring private lock. And since the ordering already had to deal
>> with uring_lock vs mmap_sem ABBA issues, this should slot straight in as
>> well.
>
> Looks good to me at a glance.
Great, thanks for checking, Jann. And for catching this in the first
place as well, appreciate it.
FWIW, compiled and ran through the testing, looks fine so far here.
--
Jens Axboe
* Re: [PATCH 4/4] io_uring/register: add IORING_REGISTER_RESIZE_RINGS
2024-10-24 21:08 ` Jens Axboe
@ 2024-10-24 22:00 ` Jens Axboe
0 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2024-10-24 22:00 UTC (permalink / raw)
To: Jann Horn; +Cc: io-uring
On 10/24/24 3:08 PM, Jens Axboe wrote:
> On 10/24/24 2:32 PM, Jann Horn wrote:
>> On Thu, Oct 24, 2024 at 10:25 PM Jens Axboe <axboe@kernel.dk> wrote:
>>> On 10/24/24 2:08 PM, Jann Horn wrote:
>>>> On Thu, Oct 24, 2024 at 9:59 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>> On 10/24/24 1:53 PM, Jann Horn wrote:
>>>>>> On Thu, Oct 24, 2024 at 9:50 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>> On 10/24/24 12:13 PM, Jann Horn wrote:
>>>>>>>> On Thu, Oct 24, 2024 at 7:08 PM Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>> Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
>>>>>>>>> the existing rings. It takes a struct io_uring_params argument, the same
>>>>>>>>> one which is used to setup the ring initially, and resizes rings
>>>>>>>>> according to the sizes given.
>>>>>>>> [...]
>>>>>>>>> + * We'll do the swap. Clear out existing mappings to prevent mmap
>>>>>>>>> + * from seeing them, as we'll unmap them. Any attempt to mmap existing
>>>>>>>>> + * rings beyond this point will fail. Not that it could proceed at this
>>>>>>>>> + * point anyway, as we'll hold the mmap_sem until we've done the swap.
>>>>>>>>> + * Likewise, hold the completion lock over the duration of the actual
>>>>>>>>> + * swap.
>>>>>>>>> + */
>>>>>>>>> + mmap_write_lock(current->mm);
>>>>>>>>
>>>>>>>> Why does the mmap lock for current->mm suffice here? I see nothing in
>>>>>>>> io_uring_mmap() that limits mmap() to tasks with the same mm_struct.
>>>>>>>
>>>>>>> Ehm does ->mmap() not hold ->mmap_sem already? I was under that
>>>>>>> understanding. Obviously if it doesn't, then yeah this won't be enough.
>>>>>>> Checked, and it does.
>>>>>>>
>>>>>>> Ah I see what you mean now, task with different mm. But how would that
>>>>>>> come about? The io_uring fd is CLOEXEC, and it can't get passed.
>>>>>>
>>>>>> Yeah, that's what I meant, tasks with different mm. I think there are
>>>>>> a few ways to get the io_uring fd into a different task, the ones I
>>>>>> can immediately think of:
>>>>>>
>>>>>> - O_CLOEXEC only applies on execve(), fork() should still inherit the fd
>>>>>> - O_CLOEXEC can be cleared via fcntl()
>>>>>> - you can use clone() to create two tasks that share FD tables
>>>>>> without sharing an mm
>>>>>
>>>>> OK good catch, yes then it won't be enough. Might just make sense to
>>>>> exclude mmap separately, then. Thanks, I'll work on that for v4!
>>>>
>>>> Yeah, that sounds reasonable to me.
>>>
>>> Something like this should do it, it's really just replacing mmap_sem
>>> with a ring private lock. And since the ordering already had to deal
>>> with uring_lock vs mmap_sem ABBA issues, this should slot straight in as
>>> well.
>>
>> Looks good to me at a glance.
>
> Great, thanks for checking Jann. In the first place as well, appreciate
> it.
>
> FWIW, compiled and ran through the testing, looks fine so far here.
And also fwiw, I did write a test case for this, and it goes boom pretty
quickly without the patch, with no issues once it's applied. Sample output:
==================================================================
BUG: KASAN: slab-use-after-free in vm_insert_pages+0x634/0x73c
Read of size 8 at addr ffff0000d8a264e0 by task resize-rings.t/741
CPU: 5 UID: 1000 PID: 741 Comm: resize-rings.t Not tainted 6.12.0-rc4-00082-g0935537ea92a #7661
Hardware name: linux,dummy-virt (DT)
Call trace:
dump_backtrace.part.0+0xd0/0xe0
show_stack+0x14/0x1c
dump_stack_lvl+0x68/0x8c
print_report+0x16c/0x4c8
kasan_report+0xa0/0xe0
__asan_report_load8_noabort+0x1c/0x24
vm_insert_pages+0x634/0x73c
io_uring_mmap_pages+0x1d4/0x2d8
io_uring_mmap+0x19c/0x1c0
mmap_region+0x844/0x19e0
do_mmap+0x5f4/0xb00
vm_mmap_pgoff+0x164/0x2a0
ksys_mmap_pgoff+0x2a8/0x3c0
__arm64_sys_mmap+0xc8/0x140
invoke_syscall+0x6c/0x260
el0_svc_common.constprop.0+0x158/0x224
do_el0_svc+0x3c/0x5c
el0_svc+0x44/0xb4
el0t_64_sync_handler+0x118/0x124
el0t_64_sync+0x168/0x16c
Allocated by task 733:
kasan_save_stack+0x28/0x4c
kasan_save_track+0x1c/0x40
kasan_save_alloc_info+0x3c/0x4c
__kasan_kmalloc+0xac/0xb0
__kmalloc_node_noprof+0x1b4/0x3f0
__kvmalloc_node_noprof+0x68/0x134
io_pages_map+0x50/0x448
io_register_resize_rings+0x484/0x1498
__arm64_sys_io_uring_register+0x780/0x1f3c
invoke_syscall+0x6c/0x260
el0_svc_common.constprop.0+0x158/0x224
do_el0_svc+0x3c/0x5c
el0_svc+0x44/0xb4
el0t_64_sync_handler+0x118/0x124
el0t_64_sync+0x168/0x16c
Freed by task 733:
kasan_save_stack+0x28/0x4c
kasan_save_track+0x1c/0x40
kasan_save_free_info+0x48/0x94
__kasan_slab_free+0x48/0x60
kfree+0x120/0x494
kvfree+0x34/0x40
io_pages_unmap+0x1a4/0x308
io_register_free_rings.isra.0+0x6c/0x168
io_register_resize_rings+0xce4/0x1498
__arm64_sys_io_uring_register+0x780/0x1f3c
invoke_syscall+0x6c/0x260
el0_svc_common.constprop.0+0x158/0x224
do_el0_svc+0x3c/0x5c
el0_svc+0x44/0xb4
el0t_64_sync_handler+0x118/0x124
el0t_64_sync+0x168/0x16c
The buggy address belongs to the object at ffff0000d8a264e0
which belongs to the cache kmalloc-cg-8 of size 8
The buggy address is located 0 bytes inside of
freed 8-byte region [ffff0000d8a264e0, ffff0000d8a264e8)
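For reference, the reproducer is roughly along these lines (a condensed,
illustrative sketch of the fork case, not the actual test case; raw
syscalls, iteration counts and sizes arbitrary):

#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

int main(void)
{
	struct io_uring_params p;
	int fd, i;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, 8, &p);

	if (fork() == 0) {
		/* child: mmap/munmap the SQ ring from a different mm */
		for (i = 0; i < 10000; i++) {
			void *ptr = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
					 MAP_SHARED, fd, IORING_OFF_SQ_RING);
			if (ptr != MAP_FAILED)
				munmap(ptr, 4096);
		}
		_exit(0);
	}

	/* parent: resize the rings in a loop, racing with the child */
	for (i = 0; i < 10000; i++) {
		memset(&p, 0, sizeof(p));
		p.sq_entries = (i & 1) ? 32 : 8;
		syscall(__NR_io_uring_register, fd,
			IORING_REGISTER_RESIZE_RINGS, &p, 1);
	}
	return 0;
}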
--
Jens Axboe
* [PATCH 3/4] io_uring/memmap: explicitly return -EFAULT for mmap on NULL rings
2024-10-25 14:02 [PATCHSET v4 0/4] Add support for ring resizing Jens Axboe
@ 2024-10-25 14:02 ` Jens Axboe
0 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2024-10-25 14:02 UTC (permalink / raw)
To: io-uring; +Cc: jannh, Jens Axboe
The later mapping will actually check this too, but in terms of code
clarity, explicitly check whether or not the rings and sqes are
valid during validation. That makes it explicit that if they are
non-NULL, they are valid and can get mapped.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
io_uring/memmap.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index a0f32a255fd1..d614824e17bd 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -204,11 +204,15 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
+ if (!ctx->rings)
+ return ERR_PTR(-EFAULT);
return ctx->rings;
case IORING_OFF_SQES:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
+ if (!ctx->sq_sqes)
+ return ERR_PTR(-EFAULT);
return ctx->sq_sqes;
case IORING_OFF_PBUF_RING: {
struct io_buffer_list *bl;
--
2.45.2