Linux io-uring development
* [PATCH] io_uring: protect remaining lockless ctx->rings accesses with RCU
@ 2026-03-30 17:23 Junxi Qian
  2026-03-30 18:08 ` Jens Axboe
From: Junxi Qian @ 2026-03-30 17:23 UTC (permalink / raw)
  To: io-uring; +Cc: axboe

io_register_resize_rings() briefly sets ctx->rings to NULL under
completion_lock before assigning the new rings and publishing them
via rcu_assign_pointer(ctx->rings_rcu, ...).  Several code paths
read ctx->rings without holding any of those locks, leading to a
NULL pointer dereference if they race with a resize:

  - io_uring_poll()              (VFS poll callback)
  - io_should_wake()             (waitqueue wake callback)
  - io_cqring_min_timer_wakeup() (hrtimer callback)
  - io_cqring_wait()             (called from io_uring_enter)

Commit 96189080265e only addressed io_ctx_mark_taskrun() in tw.c.
Protect the remaining sites by reading ctx->rings_rcu under
rcu_read_lock() (via guard(rcu)/scoped_guard(rcu)) and treating a
NULL rings as "no data available / force re-evaluation".

Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS")
Cc: stable@vger.kernel.org
Signed-off-by: Junxi Qian <qjx1298677004@gmail.com>
---
I'm not entirely sure this is the best approach for all the affected
call sites -- I'd appreciate any feedback or suggestions on whether
this looks reasonable.
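
To make the window concrete, the ordering is roughly the following
(simplified sketch only, not the actual resize code; locking details
and the CQE copy are left out):

        /* io_register_resize_rings(), heavily simplified: */
        ctx->rings = NULL;                       /* old rings about to go away */
        /* ... allocate and set up the new rings ... */
        ctx->rings = new_rings;
        rcu_assign_pointer(ctx->rings_rcu, new_rings);  /* what RCU readers see */

        /* concurrent lockless reader, e.g. io_should_wake(): */
        dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
        /* ^ NULL pointer dereference if it lands in the window above */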
---
 io_uring/io_uring.c | 17 +++++++++---
 io_uring/io_uring.h |  9 ++++++-
 io_uring/wait.c     | 63 +++++++++++++++++++++++++++++++++------------
 3 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9a37035e7..98029b039 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2240,6 +2240,7 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
 static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 {
 	struct io_ring_ctx *ctx = file->private_data;
+	struct io_rings *rings;
 	__poll_t mask = 0;
 
 	if (unlikely(!ctx->poll_activated))
@@ -2250,7 +2251,17 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	 */
 	poll_wait(file, &ctx->poll_wq, wait);
 
-	if (!io_sqring_full(ctx))
+	/*
+	 * Use the RCU-protected rings pointer to be safe against
+	 * concurrent ring resizing, which briefly NULLs ctx->rings.
+	 */
+	guard(rcu)();
+	rings = rcu_dereference(ctx->rings_rcu);
+	if (unlikely(!rings))
+		return 0;
+
+	if (READ_ONCE(rings->sq.tail) - READ_ONCE(rings->sq.head) !=
+							ctx->sq_entries)
 		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	/*
@@ -2266,8 +2277,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
 	 * pushes them to do the flush.
 	 */
-
-	if (__io_cqring_events_user(ctx) || io_has_work(ctx))
+	if (READ_ONCE(rings->cq.tail) != READ_ONCE(rings->cq.head) ||
+	    io_has_work(ctx))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
 	return mask;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 0fa844faf..ea953f2c7 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -145,7 +145,14 @@ struct io_wait_queue {
 static inline bool io_should_wake(struct io_wait_queue *iowq)
 {
 	struct io_ring_ctx *ctx = iowq->ctx;
-	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
+	struct io_rings *rings;
+	int dist;
+
+	guard(rcu)();
+	rings = rcu_dereference(ctx->rings_rcu);
+	if (unlikely(!rings))
+		return true;
+	dist = READ_ONCE(rings->cq.tail) - (int) iowq->cq_tail;
 
 	/*
 	 * Wake up if we have enough events, or if a timeout occurred since we
diff --git a/io_uring/wait.c b/io_uring/wait.c
index 0581cadf2..af25f8f16 100644
--- a/io_uring/wait.c
+++ b/io_uring/wait.c
@@ -78,12 +78,20 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
 	/* work we may need to run, wake function will see if we need to wake */
 	if (io_has_work(ctx))
 		goto out_wake;
-	/* got events since we started waiting, min timeout is done */
-	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
-		goto out_wake;
-	/* if we have any events and min timeout expired, we're done */
-	if (io_cqring_events(ctx))
-		goto out_wake;
+
+	scoped_guard(rcu) {
+		struct io_rings *rings = rcu_dereference(ctx->rings_rcu);
+
+		if (!rings)
+			goto out_wake;
+		/* got events since we started waiting, min timeout is done */
+		if (iowq->cq_min_tail != READ_ONCE(rings->cq.tail))
+			goto out_wake;
+		/* if we have any events and min timeout expired, we're done */
+		smp_rmb();
+		if (ctx->cached_cq_tail != READ_ONCE(rings->cq.head))
+			goto out_wake;
+	}
 
 	/*
 	 * If using deferred task_work running and application is waiting on
@@ -186,7 +194,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		   struct ext_arg *ext_arg)
 {
 	struct io_wait_queue iowq;
-	struct io_rings *rings = ctx->rings;
+	struct io_rings *rings;
 	ktime_t start_time;
 	int ret;
 
@@ -201,15 +209,27 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
 		io_cqring_do_overflow_flush(ctx);
-	if (__io_cqring_events_user(ctx) >= min_events)
-		return 0;
 
 	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
 	iowq.wq.private = current;
 	INIT_LIST_HEAD(&iowq.wq.entry);
 	iowq.ctx = ctx;
-	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
-	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
+
+	scoped_guard(rcu) {
+		rings = rcu_dereference(ctx->rings_rcu);
+		if (rings) {
+			if (READ_ONCE(rings->cq.tail) -
+			    READ_ONCE(rings->cq.head) >=
+					(unsigned int)min_events)
+				return 0;
+			iowq.cq_tail = READ_ONCE(rings->cq.head) +
+							min_events;
+			iowq.cq_min_tail = READ_ONCE(rings->cq.tail);
+		} else {
+			iowq.cq_tail = min_events;
+			iowq.cq_min_tail = 0;
+		}
+	}
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	iowq.hit_timeout = 0;
 	iowq.min_timeout = ext_arg->min_time;
@@ -243,11 +263,16 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		int nr_wait;
 
 		/* if min timeout has been hit, don't reset wait count */
-		if (!iowq.hit_timeout)
-			nr_wait = (int) iowq.cq_tail -
-					READ_ONCE(ctx->rings->cq.tail);
-		else
+		if (!iowq.hit_timeout) {
+			scoped_guard(rcu) {
+				rings = rcu_dereference(ctx->rings_rcu);
+				nr_wait = rings ?
+					(int) iowq.cq_tail -
+					  READ_ONCE(rings->cq.tail) : 1;
+			}
+		} else {
 			nr_wait = 1;
+		}
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
 			atomic_set(&ctx->cq_wait_nr, nr_wait);
@@ -304,5 +329,11 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		finish_wait(&ctx->cq_wait, &iowq.wq);
 	restore_saved_sigmask_unless(ret == -EINTR);
 
-	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+	scoped_guard(rcu) {
+		rings = rcu_dereference(ctx->rings_rcu);
+		if (rings &&
+		    READ_ONCE(rings->cq.head) != READ_ONCE(rings->cq.tail))
+			ret = 0;
+	}
+	return ret;
 }
-- 
2.34.1


* Re: [PATCH] io_uring: protect remaining lockless ctx->rings accesses with RCU
  2026-03-30 17:23 [PATCH] io_uring: protect remaining lockless ctx->rings accesses with RCU Junxi Qian
@ 2026-03-30 18:08 ` Jens Axboe
  2026-03-31  4:03   ` junxi qian
From: Jens Axboe @ 2026-03-30 18:08 UTC (permalink / raw)
  To: Junxi Qian, io-uring

On 3/30/26 11:23 AM, Junxi Qian wrote:
> io_register_resize_rings() briefly sets ctx->rings to NULL under
> completion_lock before assigning the new rings and publishing them
> via rcu_assign_pointer(ctx->rings_rcu, ...).  Several code paths
> read ctx->rings without holding any of those locks, leading to a
> NULL pointer dereference if they race with a resize:
> 
>   - io_uring_poll()              (VFS poll callback)
>   - io_should_wake()             (waitqueue wake callback)
>   - io_cqring_min_timer_wakeup() (hrtimer callback)
>   - io_cqring_wait()             (called from io_uring_enter)
> 
> Commit 96189080265e only addressed io_ctx_mark_taskrun() in tw.c.
> Protect the remaining sites by reading ctx->rings_rcu under
> rcu_read_lock() (via guard(rcu)/scoped_guard(rcu)) and treating a
> NULL rings as "no data available / force re-evaluation".

First of all, thanks for the patch!

I took a look at this, but I'm not a huge fan of the scoped guard in
most spots; it just makes the code harder to read. And I think that
building on top of this for later kernels will make sense, so it's
cleaner to add some helpers. Outside of that, the wait side can be a
bit smarter rather than wrapping everything in RCU multiple times
(eg the nr_wait part).

There also should be no need to check 'rings' for NULL; it'll always
be a valid value.

How about something like this instead?


diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 16122f877aed..079b37835833 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2017,7 +2017,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	if (ctx->flags & IORING_SETUP_SQ_REWIND)
 		entries = ctx->sq_entries;
 	else
-		entries = io_sqring_entries(ctx);
+		entries = __io_sqring_entries(ctx);
 
 	entries = min(nr, entries);
 	if (unlikely(!entries))
@@ -2253,7 +2253,9 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	 */
 	poll_wait(file, &ctx->poll_wq, wait);
 
-	if (!io_sqring_full(ctx))
+	rcu_read_lock();
+
+	if (!__io_sqring_full(ctx))
 		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	/*
@@ -2273,6 +2275,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
 	if (__io_cqring_events_user(ctx) || io_has_work(ctx))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
+	rcu_read_unlock();
 	return mask;
 }
 
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 91cf67b5d85b..5c47ed0b4276 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -142,16 +142,28 @@ struct io_wait_queue {
 #endif
 };
 
+static inline struct io_rings *io_get_rings(struct io_ring_ctx *ctx)
+{
+	return rcu_dereference_check(ctx->rings_rcu,
+			lockdep_is_held(&ctx->uring_lock) ||
+			lockdep_is_held(&ctx->completion_lock));
+}
+
 static inline bool io_should_wake(struct io_wait_queue *iowq)
 {
 	struct io_ring_ctx *ctx = iowq->ctx;
-	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
+	struct io_rings *rings;
+	int dist;
+
+	guard(rcu)();
+	rings = io_get_rings(ctx);
 
 	/*
 	 * Wake up if we have enough events, or if a timeout occurred since we
 	 * started waiting. For timeouts, we always want to return to userspace,
 	 * regardless of event count.
 	 */
+	dist = READ_ONCE(rings->cq.tail) - (int) iowq->cq_tail;
 	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 }
 
@@ -431,9 +443,9 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx)
 	__io_wq_wake(&ctx->cq_wait);
 }
 
-static inline bool io_sqring_full(struct io_ring_ctx *ctx)
+static inline bool __io_sqring_full(struct io_ring_ctx *ctx)
 {
-	struct io_rings *r = ctx->rings;
+	struct io_rings *r = io_get_rings(ctx);
 
 	/*
 	 * SQPOLL must use the actual sqring head, as using the cached_sq_head
@@ -445,9 +457,15 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 	return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries;
 }
 
-static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 {
-	struct io_rings *rings = ctx->rings;
+	guard(rcu)();
+	return __io_sqring_full(ctx);
+}
+
+static inline unsigned int __io_sqring_entries(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = io_get_rings(ctx);
 	unsigned int entries;
 
 	/* make sure SQ entry isn't read before tail */
@@ -455,6 +473,12 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 	return min(entries, ctx->sq_entries);
 }
 
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+	guard(rcu)();
+	return __io_sqring_entries(ctx);
+}
+
 /*
  * Don't complete immediately but use deferred completion infrastructure.
  * Protected by ->uring_lock and can only be used either with
diff --git a/io_uring/wait.c b/io_uring/wait.c
index 0581cadf20ee..c24d018d53ab 100644
--- a/io_uring/wait.c
+++ b/io_uring/wait.c
@@ -79,12 +79,15 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
 	if (io_has_work(ctx))
 		goto out_wake;
 	/* got events since we started waiting, min timeout is done */
-	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
-		goto out_wake;
-	/* if we have any events and min timeout expired, we're done */
-	if (io_cqring_events(ctx))
-		goto out_wake;
+	scoped_guard(rcu) {
+		struct io_rings *rings = io_get_rings(ctx);
 
+		if (iowq->cq_min_tail != READ_ONCE(rings->cq.tail))
+			goto out_wake;
+		/* if we have any events and min timeout expired, we're done */
+		if (io_cqring_events(ctx))
+			goto out_wake;
+	}
 	/*
 	 * If using deferred task_work running and application is waiting on
 	 * more than one request, ensure we reset it now where we are switching
@@ -186,9 +189,9 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		   struct ext_arg *ext_arg)
 {
 	struct io_wait_queue iowq;
-	struct io_rings *rings = ctx->rings;
+	struct io_rings *rings;
 	ktime_t start_time;
-	int ret;
+	int ret, nr_wait;
 
 	min_events = min_t(int, min_events, ctx->cq_entries);
 
@@ -201,15 +204,23 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
 		io_cqring_do_overflow_flush(ctx);
-	if (__io_cqring_events_user(ctx) >= min_events)
+
+	rcu_read_lock();
+	rings = io_get_rings(ctx);
+	if (__io_cqring_events_user(ctx) >= min_events) {
+		rcu_read_unlock();
 		return 0;
+	}
 
 	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
 	iowq.wq.private = current;
 	INIT_LIST_HEAD(&iowq.wq.entry);
 	iowq.ctx = ctx;
-	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
-	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
+	iowq.cq_tail = READ_ONCE(rings->cq.head) + min_events;
+	iowq.cq_min_tail = READ_ONCE(rings->cq.tail);
+	nr_wait = (int) iowq.cq_tail - READ_ONCE(rings->cq.tail);
+	rcu_read_unlock();
+	rings = NULL;
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	iowq.hit_timeout = 0;
 	iowq.min_timeout = ext_arg->min_time;
@@ -240,14 +251,6 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
 		unsigned long check_cq;
-		int nr_wait;
-
-		/* if min timeout has been hit, don't reset wait count */
-		if (!iowq.hit_timeout)
-			nr_wait = (int) iowq.cq_tail -
-					READ_ONCE(ctx->rings->cq.tail);
-		else
-			nr_wait = 1;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
 			atomic_set(&ctx->cq_wait_nr, nr_wait);
@@ -298,11 +301,20 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 			break;
 		}
 		cond_resched();
+
+		/* if min timeout has been hit, don't reset wait count */
+		if (!iowq.hit_timeout)
+			scoped_guard(rcu)
+				nr_wait = (int) iowq.cq_tail -
+						READ_ONCE(ctx->rings_rcu->cq.tail);
+		else
+			nr_wait = 1;
 	} while (1);
 
 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
 		finish_wait(&ctx->cq_wait, &iowq.wq);
 	restore_saved_sigmask_unless(ret == -EINTR);
 
-	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+	guard(rcu)();
+	return READ_ONCE(ctx->rings_rcu->cq.head) == READ_ONCE(ctx->rings_rcu->cq.tail) ? ret : 0;
 }
diff --git a/io_uring/wait.h b/io_uring/wait.h
index 037e512dd80c..a4274b137f81 100644
--- a/io_uring/wait.h
+++ b/io_uring/wait.h
@@ -29,12 +29,15 @@ void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx);
 
 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 {
-	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
+	struct io_rings *rings = io_get_rings(ctx);
+	return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
 }
 
 static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
 {
-	return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
+	struct io_rings *rings = io_get_rings(ctx);
+
+	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
 }
 
 /*

-- 
Jens Axboe

* Re: [PATCH] io_uring: protect remaining lockless ctx->rings accesses with RCU
  2026-03-30 18:08 ` Jens Axboe
@ 2026-03-31  4:03   ` junxi qian
From: junxi qian @ 2026-03-31  4:03 UTC (permalink / raw)
  To: Jens Axboe, io-uring

Looks good, thanks for the rework!

I applied this on top of v7.0-rc5, compiled with KASAN enabled, and
ran the resize + poll() race reproducer - no KASAN splat triggered.

Reviewed-by: Junxi Qian <qjx1298677004@gmail.com>
Tested-by: Junxi Qian <qjx1298677004@gmail.com>
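
In case it's useful, the reproducer is essentially a poll() loop
racing against repeated IORING_REGISTER_RESIZE_RINGS calls. A rough
sketch of its shape follows (not the exact program I ran; error
handling, the setup flags required for resize, and the register
arg/nr_args convention are simplified/assumed here):

#include <linux/io_uring.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int ring_fd;

static void *resize_loop(void *unused)
{
        struct io_uring_params p;

        for (;;) {
                memset(&p, 0, sizeof(p));
                p.sq_entries = 16;
                p.cq_entries = 32;
                /* arg/nr_args convention assumed from the kernel side */
                syscall(__NR_io_uring_register, ring_fd,
                        IORING_REGISTER_RESIZE_RINGS, &p, 1);
        }
        return NULL;
}

int main(void)
{
        struct io_uring_params p;
        struct pollfd pfd;
        pthread_t thr;

        memset(&p, 0, sizeof(p));
        ring_fd = syscall(__NR_io_uring_setup, 8, &p);
        if (ring_fd < 0) {
                perror("io_uring_setup");
                return 1;
        }

        pthread_create(&thr, NULL, resize_loop, NULL);

        /* hammer the ->poll() path while the other thread resizes */
        pfd.fd = ring_fd;
        pfd.events = POLLIN | POLLOUT;
        for (;;)
                poll(&pfd, 1, 0);
}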

On Tue, Mar 31, 2026 at 2:08 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> [ full quote of the above message snipped ]
