* [PATCH] io_uring: protect remaining lockless ctx->rings accesses with RCU
From: Junxi Qian @ 2026-03-30 17:23 UTC
To: io-uring; +Cc: axboe

io_register_resize_rings() briefly sets ctx->rings to NULL under
completion_lock before assigning the new rings and publishing them
via rcu_assign_pointer(ctx->rings_rcu, ...). Several code paths
read ctx->rings without holding any of those locks, leading to a
NULL pointer dereference if they race with a resize:

- io_uring_poll() (VFS poll callback)
- io_should_wake() (waitqueue wake callback)
- io_cqring_min_timer_wakeup() (hrtimer callback)
- io_cqring_wait() (called from io_uring_enter)

Commit 96189080265e only addressed io_ctx_mark_taskrun() in tw.c.
Protect the remaining sites by reading ctx->rings_rcu under
rcu_read_lock() (via guard(rcu)/scoped_guard(rcu)) and treating a
NULL rings as "no data available / force re-evaluation".
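
A simplified view of the race (interleaving illustrative):

  io_register_resize_rings()                 lockless reader
  --------------------------                 ---------------
  spin_lock(&ctx->completion_lock);
  ctx->rings = NULL;
                                             rings = ctx->rings;        /* NULL */
                                             READ_ONCE(rings->cq.tail); /* oops */
  ctx->rings = new_rings;
  rcu_assign_pointer(ctx->rings_rcu, new_rings);
  spin_unlock(&ctx->completion_lock);
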
Fixes: 79cfe9e59c2a ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS")
Cc: stable@vger.kernel.org
Signed-off-by: Junxi Qian <qjx1298677004@gmail.com>
---
I'm not entirely sure this is the best approach for all the affected
call sites -- I'd appreciate any feedback or suggestions on whether
this looks reasonable.
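
For reference, a reproducer along these lines should hit the window
(sketch only: it assumes a liburing recent enough to have
io_uring_resize_rings(), glosses over the exact resize constraints,
and omits error handling):

#include <liburing.h>
#include <poll.h>
#include <pthread.h>

static struct io_uring ring;

/* hammer io_uring_poll() through the ring fd */
static void *poller(void *unused)
{
	struct pollfd pfd = { .fd = ring.ring_fd, .events = POLLIN | POLLOUT };

	for (;;)
		poll(&pfd, 1, 0);
	return NULL;
}

int main(void)
{
	struct io_uring_params p = { };
	pthread_t thr;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;
	pthread_create(&thr, NULL, poller, NULL);

	/* alternate SQ sizes so every call really resizes, repeatedly
	 * opening the window where ctx->rings is NULL */
	for (;;) {
		p.sq_entries = 8;
		io_uring_resize_rings(&ring, &p);
		p.sq_entries = 16;
		io_uring_resize_rings(&ring, &p);
	}
}

Built with -luring -lpthread; with KASAN enabled the poll() side should
trip over the NULL rings pointer fairly quickly on an affected kernel.
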
---
 io_uring/io_uring.c | 17 +++++++++---
 io_uring/io_uring.h |  9 ++++++-
 io_uring/wait.c     | 63 +++++++++++++++++++++++++++++++++------------
 3 files changed, 69 insertions(+), 20 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9a37035e7..98029b039 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2240,6 +2240,7 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
struct io_ring_ctx *ctx = file->private_data;
+ struct io_rings *rings;
__poll_t mask = 0;
if (unlikely(!ctx->poll_activated))
@@ -2250,7 +2251,17 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
*/
poll_wait(file, &ctx->poll_wq, wait);
- if (!io_sqring_full(ctx))
+ /*
+ * Use the RCU-protected rings pointer to be safe against
+ * concurrent ring resizing, which briefly NULLs ctx->rings.
+ */
+ guard(rcu)();
+ rings = rcu_dereference(ctx->rings_rcu);
+ if (unlikely(!rings))
+ return 0;
+
+ if (READ_ONCE(rings->sq.tail) - READ_ONCE(rings->sq.head) !=
+ ctx->sq_entries)
mask |= EPOLLOUT | EPOLLWRNORM;
/*
@@ -2266,8 +2277,8 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
* Users may get EPOLLIN meanwhile seeing nothing in cqring, this
* pushes them to do the flush.
*/
-
- if (__io_cqring_events_user(ctx) || io_has_work(ctx))
+ if (READ_ONCE(rings->cq.tail) != READ_ONCE(rings->cq.head) ||
+ io_has_work(ctx))
mask |= EPOLLIN | EPOLLRDNORM;
return mask;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 0fa844faf..ea953f2c7 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -145,7 +145,14 @@ struct io_wait_queue {
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
struct io_ring_ctx *ctx = iowq->ctx;
- int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
+ struct io_rings *rings;
+ int dist;
+
+ guard(rcu)();
+ rings = rcu_dereference(ctx->rings_rcu);
+ if (unlikely(!rings))
+ return true;
+ dist = READ_ONCE(rings->cq.tail) - (int) iowq->cq_tail;
/*
* Wake up if we have enough events, or if a timeout occurred since we
diff --git a/io_uring/wait.c b/io_uring/wait.c
index 0581cadf2..af25f8f16 100644
--- a/io_uring/wait.c
+++ b/io_uring/wait.c
@@ -78,12 +78,20 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
/* work we may need to run, wake function will see if we need to wake */
if (io_has_work(ctx))
goto out_wake;
- /* got events since we started waiting, min timeout is done */
- if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
- goto out_wake;
- /* if we have any events and min timeout expired, we're done */
- if (io_cqring_events(ctx))
- goto out_wake;
+
+ scoped_guard(rcu) {
+ struct io_rings *rings = rcu_dereference(ctx->rings_rcu);
+
+ if (!rings)
+ goto out_wake;
+ /* got events since we started waiting, min timeout is done */
+ if (iowq->cq_min_tail != READ_ONCE(rings->cq.tail))
+ goto out_wake;
+ /* if we have any events and min timeout expired, we're done */
+ smp_rmb();
+ if (ctx->cached_cq_tail != READ_ONCE(rings->cq.head))
+ goto out_wake;
+ }
/*
* If using deferred task_work running and application is waiting on
@@ -186,7 +194,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
struct ext_arg *ext_arg)
{
struct io_wait_queue iowq;
- struct io_rings *rings = ctx->rings;
+ struct io_rings *rings;
ktime_t start_time;
int ret;
@@ -201,15 +209,27 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
io_cqring_do_overflow_flush(ctx);
- if (__io_cqring_events_user(ctx) >= min_events)
- return 0;
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
iowq.ctx = ctx;
- iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
- iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
+
+ scoped_guard(rcu) {
+ rings = rcu_dereference(ctx->rings_rcu);
+ if (rings) {
+ if (READ_ONCE(rings->cq.tail) -
+ READ_ONCE(rings->cq.head) >=
+ (unsigned int)min_events)
+ return 0;
+ iowq.cq_tail = READ_ONCE(rings->cq.head) +
+ min_events;
+ iowq.cq_min_tail = READ_ONCE(rings->cq.tail);
+ } else {
+ iowq.cq_tail = min_events;
+ iowq.cq_min_tail = 0;
+ }
+ }
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
iowq.hit_timeout = 0;
iowq.min_timeout = ext_arg->min_time;
@@ -243,11 +263,16 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
int nr_wait;
/* if min timeout has been hit, don't reset wait count */
- if (!iowq.hit_timeout)
- nr_wait = (int) iowq.cq_tail -
- READ_ONCE(ctx->rings->cq.tail);
- else
+ if (!iowq.hit_timeout) {
+ scoped_guard(rcu) {
+ rings = rcu_dereference(ctx->rings_rcu);
+ nr_wait = rings ?
+ (int) iowq.cq_tail -
+ READ_ONCE(rings->cq.tail) : 1;
+ }
+ } else {
nr_wait = 1;
+ }
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, nr_wait);
@@ -304,5 +329,11 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
finish_wait(&ctx->cq_wait, &iowq.wq);
restore_saved_sigmask_unless(ret == -EINTR);
- return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+ scoped_guard(rcu) {
+ rings = rcu_dereference(ctx->rings_rcu);
+ if (rings &&
+ READ_ONCE(rings->cq.head) != READ_ONCE(rings->cq.tail))
+ ret = 0;
+ }
+ return ret;
}
--
2.34.1

* Re: [PATCH] io_uring: protect remaining lockless ctx->rings accesses with RCU
From: Jens Axboe @ 2026-03-30 18:08 UTC
To: Junxi Qian, io-uring

On 3/30/26 11:23 AM, Junxi Qian wrote:
> io_register_resize_rings() briefly sets ctx->rings to NULL under
> completion_lock before assigning the new rings and publishing them
> via rcu_assign_pointer(ctx->rings_rcu, ...). Several code paths
> read ctx->rings without holding any of those locks, leading to a
> NULL pointer dereference if they race with a resize:
>
> - io_uring_poll() (VFS poll callback)
> - io_should_wake() (waitqueue wake callback)
> - io_cqring_min_timer_wakeup() (hrtimer callback)
> - io_cqring_wait() (called from io_uring_enter)
>
> Commit 96189080265e only addressed io_ctx_mark_taskrun() in tw.c.
> Protect the remaining sites by reading ctx->rings_rcu under
> rcu_read_lock() (via guard(rcu)/scoped_guard(rcu)) and treating a
> NULL rings as "no data available / force re-evaluation".

First of all, thanks for the patch!

I took a look at this, but I'm not a huge fan of the scoped guard in
most spots, it just makes it harder to read. And I think that building
on top of this for later kernels will make sense, so cleaner to add
some helpers. Outside of that, the wait side can be a bit smarter
rather than just wrap everything in rcu multiple times (eg the nr_wait
part).

There also should be no need to check 'rings' for NULL, it'll always be
a valid value.

How about something like this instead?

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 16122f877aed..079b37835833 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2017,7 +2017,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
if (ctx->flags & IORING_SETUP_SQ_REWIND)
entries = ctx->sq_entries;
else
- entries = io_sqring_entries(ctx);
+ entries = __io_sqring_entries(ctx);
entries = min(nr, entries);
if (unlikely(!entries))
@@ -2253,7 +2253,9 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
*/
poll_wait(file, &ctx->poll_wq, wait);
- if (!io_sqring_full(ctx))
+ rcu_read_lock();
+
+ if (!__io_sqring_full(ctx))
mask |= EPOLLOUT | EPOLLWRNORM;
/*
@@ -2273,6 +2275,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
if (__io_cqring_events_user(ctx) || io_has_work(ctx))
mask |= EPOLLIN | EPOLLRDNORM;
+ rcu_read_unlock();
return mask;
}
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 91cf67b5d85b..5c47ed0b4276 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -142,16 +142,28 @@ struct io_wait_queue {
#endif
};
+static inline struct io_rings *io_get_rings(struct io_ring_ctx *ctx)
+{
+ return rcu_dereference_check(ctx->rings_rcu,
+ lockdep_is_held(&ctx->uring_lock) ||
+ lockdep_is_held(&ctx->completion_lock));
+}
+
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
struct io_ring_ctx *ctx = iowq->ctx;
- int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
+ struct io_rings *rings;
+ int dist;
+
+ guard(rcu)();
+ rings = io_get_rings(ctx);
/*
* Wake up if we have enough events, or if a timeout occurred since we
* started waiting. For timeouts, we always want to return to userspace,
* regardless of event count.
*/
+ dist = READ_ONCE(rings->cq.tail) - (int) iowq->cq_tail;
return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
@@ -431,9 +443,9 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx)
__io_wq_wake(&ctx->cq_wait);
}
-static inline bool io_sqring_full(struct io_ring_ctx *ctx)
+static inline bool __io_sqring_full(struct io_ring_ctx *ctx)
{
- struct io_rings *r = ctx->rings;
+ struct io_rings *r = io_get_rings(ctx);
/*
* SQPOLL must use the actual sqring head, as using the cached_sq_head
@@ -445,9 +457,15 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries;
}
-static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
- struct io_rings *rings = ctx->rings;
+ guard(rcu)();
+ return __io_sqring_full(ctx);
+}
+
+static inline unsigned int __io_sqring_entries(struct io_ring_ctx *ctx)
+{
+ struct io_rings *rings = io_get_rings(ctx);
unsigned int entries;
/* make sure SQ entry isn't read before tail */
@@ -455,6 +473,12 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
return min(entries, ctx->sq_entries);
}
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+ guard(rcu)();
+ return __io_sqring_entries(ctx);
+}
+
/*
* Don't complete immediately but use deferred completion infrastructure.
* Protected by ->uring_lock and can only be used either with
diff --git a/io_uring/wait.c b/io_uring/wait.c
index 0581cadf20ee..c24d018d53ab 100644
--- a/io_uring/wait.c
+++ b/io_uring/wait.c
@@ -79,12 +79,15 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
if (io_has_work(ctx))
goto out_wake;
/* got events since we started waiting, min timeout is done */
- if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
- goto out_wake;
- /* if we have any events and min timeout expired, we're done */
- if (io_cqring_events(ctx))
- goto out_wake;
+ scoped_guard(rcu) {
+ struct io_rings *rings = io_get_rings(ctx);
+
+ if (iowq->cq_min_tail != READ_ONCE(rings->cq.tail))
+ goto out_wake;
+ /* if we have any events and min timeout expired, we're done */
+ if (io_cqring_events(ctx))
+ goto out_wake;
+ }
/*
* If using deferred task_work running and application is waiting on
* more than one request, ensure we reset it now where we are switching
@@ -186,9 +189,9 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
struct ext_arg *ext_arg)
{
struct io_wait_queue iowq;
- struct io_rings *rings = ctx->rings;
+ struct io_rings *rings;
ktime_t start_time;
- int ret;
+ int ret, nr_wait;
min_events = min_t(int, min_events, ctx->cq_entries);
@@ -201,15 +204,23 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
io_cqring_do_overflow_flush(ctx);
- if (__io_cqring_events_user(ctx) >= min_events)
+
+ rcu_read_lock();
+ rings = io_get_rings(ctx);
+ if (__io_cqring_events_user(ctx) >= min_events) {
+ rcu_read_unlock();
return 0;
+ }
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
iowq.ctx = ctx;
- iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
- iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
+ iowq.cq_tail = READ_ONCE(rings->cq.head) + min_events;
+ iowq.cq_min_tail = READ_ONCE(rings->cq.tail);
+ nr_wait = (int) iowq.cq_tail - READ_ONCE(rings->cq.tail);
+ rcu_read_unlock();
+ rings = NULL;
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
iowq.hit_timeout = 0;
iowq.min_timeout = ext_arg->min_time;
@@ -240,14 +251,6 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
trace_io_uring_cqring_wait(ctx, min_events);
do {
unsigned long check_cq;
- int nr_wait;
-
- /* if min timeout has been hit, don't reset wait count */
- if (!iowq.hit_timeout)
- nr_wait = (int) iowq.cq_tail -
- READ_ONCE(ctx->rings->cq.tail);
- else
- nr_wait = 1;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, nr_wait);
@@ -298,11 +301,20 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
break;
}
cond_resched();
+
+ /* if min timeout has been hit, don't reset wait count */
+ if (!iowq.hit_timeout)
+ scoped_guard(rcu)
+ nr_wait = (int) iowq.cq_tail -
+ READ_ONCE(ctx->rings_rcu->cq.tail);
+ else
+ nr_wait = 1;
} while (1);
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
finish_wait(&ctx->cq_wait, &iowq.wq);
restore_saved_sigmask_unless(ret == -EINTR);
- return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+ guard(rcu)();
+ return READ_ONCE(ctx->rings_rcu->cq.head) == READ_ONCE(ctx->rings_rcu->cq.tail) ? ret : 0;
}
diff --git a/io_uring/wait.h b/io_uring/wait.h
index 037e512dd80c..a4274b137f81 100644
--- a/io_uring/wait.h
+++ b/io_uring/wait.h
@@ -29,12 +29,15 @@ void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx);
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
- return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
+ struct io_rings *rings = io_get_rings(ctx);
+ return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
}
static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
{
- return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
+ struct io_rings *rings = io_get_rings(ctx);
+
+ return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
}
/*

--
Jens Axboe

* Re: [PATCH] io_uring: protect remaining lockless ctx->rings accesses with RCU
From: junxi qian @ 2026-03-31 4:03 UTC
To: Jens Axboe, io-uring

Looks good, thanks for the rework!

I applied this on top of v7.0-rc5, compiled with KASAN enabled, and ran
the resize + poll() race reproducer - no KASAN splat triggered.

Reviewed-by: Junxi Qian <qjx1298677004@gmail.com>
Tested-by: Junxi Qian <qjx1298677004@gmail.com>

On Tue, Mar 31, 2026 at 2:08 AM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 3/30/26 11:23 AM, Junxi Qian wrote:
> > io_register_resize_rings() briefly sets ctx->rings to NULL under
> > completion_lock before assigning the new rings and publishing them
> > via rcu_assign_pointer(ctx->rings_rcu, ...). Several code paths
> > read ctx->rings without holding any of those locks, leading to a
> > NULL pointer dereference if they race with a resize:
> [...]
>
> First of all, thanks for the patch!
>
> I took a look at this, but I'm not a huge fan of the scoped guard in
> most spots, it just makes it harder to read. And I think that building
> on top of this for later kernels will make sense, so cleaner to add
> some helpers. Outside of that, the wait side can be a bit smarter
> rather than just wrap everything in rcu multiple times (eg the nr_wait
> part).
>
> There also should be no need to check 'rings' for NULL, it'll always be
> a valid value.
>
> How about something like this instead?
> [...]