* [PATCH 0/3] timeout fixes
@ 2020-05-02 12:07 Pavel Begunkov
2020-05-02 12:07 ` [PATCH 1/3] io_uring: trigger timeout after any sqe->off CQEs Pavel Begunkov
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Pavel Begunkov @ 2020-05-02 12:07 UTC (permalink / raw)
To: Jens Axboe, io-uring, linux-kernel
Nothing changed from the last time, just resending.
[1,2] CQ vs SQ for timeout offset accounting
[3] fixes timeout flushing with batched commits
Pavel Begunkov (3):
io_uring: trigger timeout after any sqe->off CQEs
io_uring: don't trigger timeout with another t-out
io_uring: fix timeout offset with batch CQ commit
fs/io_uring.c | 126 +++++++++++++++++++++-----------------------------
1 file changed, 52 insertions(+), 74 deletions(-)
--
2.24.0
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 1/3] io_uring: trigger timeout after any sqe->off CQEs
2020-05-02 12:07 [PATCH 0/3] timeout fixes Pavel Begunkov
@ 2020-05-02 12:07 ` Pavel Begunkov
2020-05-02 12:07 ` [PATCH 2/3] io_uring: don't trigger timeout with another t-out Pavel Begunkov
2020-05-02 12:07 ` [PATCH 3/3] io_uring: fix timeout offset with batch CQ commit Pavel Begunkov
2 siblings, 0 replies; 4+ messages in thread
From: Pavel Begunkov @ 2020-05-02 12:07 UTC (permalink / raw)
To: Jens Axboe, io-uring, linux-kernel
sequence mode timeouts wait not for sqe->off CQEs, but rather
sqe->off + number of prior inflight requests with a quirk ignoring other
timeouts completions. Wait exactly for sqe->off using completion count
(tail) for accounting.
Reported-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
fs/io_uring.c | 120 +++++++++++++++++++-------------------------------
1 file changed, 46 insertions(+), 74 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 65458eda2127..148f61734572 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -384,7 +384,8 @@ struct io_timeout {
struct file *file;
u64 addr;
int flags;
- u32 count;
+ u32 off;
+ u32 target_seq;
};
struct io_rw {
@@ -986,23 +987,6 @@ static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx)
return NULL;
}
-static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx)
-{
- struct io_kiocb *req;
-
- req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list);
- if (req) {
- if (req->flags & REQ_F_TIMEOUT_NOSEQ)
- return NULL;
- if (!__req_need_defer(req)) {
- list_del_init(&req->list);
- return req;
- }
- }
-
- return NULL;
-}
-
static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
@@ -1118,12 +1102,42 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
+static inline bool io_check_in_range(u32 pos, u32 start, u32 end)
+{
+ /* if @end < @start, check for [end, MAX_UINT] + [MAX_UINT, start] */
+ return (pos - start) <= (end - start);
+}
+
+static void __io_flush_timeouts(struct io_ring_ctx *ctx)
+{
+ u32 end, start;
+
+ start = end = ctx->cached_cq_tail;
+ do {
+ struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
+ struct io_kiocb, list);
+
+ if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+ break;
+ /*
+ * multiple timeouts may have the same target,
+ * check that @req is in [first_tail, cur_tail]
+ */
+ if (!io_check_in_range(req->timeout.target_seq, start, end))
+ break;
+
+ list_del_init(&req->list);
+ io_kill_timeout(req);
+ end = ctx->cached_cq_tail;
+ } while (!list_empty(&ctx->timeout_list));
+}
+
static void io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
- while ((req = io_get_timeout_req(ctx)) != NULL)
- io_kill_timeout(req);
+ if (!list_empty(&ctx->timeout_list))
+ __io_flush_timeouts(ctx);
__io_commit_cqring(ctx);
@@ -4582,20 +4596,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
* We could be racing with timeout deletion. If the list is empty,
* then timeout lookup already found it and will be handling it.
*/
- if (!list_empty(&req->list)) {
- struct io_kiocb *prev;
-
- /*
- * Adjust the reqs sequence before the current one because it
- * will consume a slot in the cq_ring and the cq_tail
- * pointer will be increased, otherwise other timeout reqs may
- * return in advance without waiting for enough wait_nr.
- */
- prev = req;
- list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
- prev->sequence++;
+ if (!list_empty(&req->list))
list_del_init(&req->list);
- }
io_cqring_fill_event(req, -ETIME);
io_commit_cqring(ctx);
@@ -4675,18 +4677,19 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
{
struct io_timeout_data *data;
unsigned flags;
+ u32 off = READ_ONCE(sqe->off);
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
return -EINVAL;
- if (sqe->off && is_timeout_link)
+ if (off && is_timeout_link)
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~IORING_TIMEOUT_ABS)
return -EINVAL;
- req->timeout.count = READ_ONCE(sqe->off);
+ req->timeout.off = off;
if (!req->io && io_alloc_async_ctx(req))
return -ENOMEM;
@@ -4710,68 +4713,37 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_timeout(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_timeout_data *data;
+ struct io_timeout_data *data = &req->io->timeout;
struct list_head *entry;
- unsigned span = 0;
- u32 count = req->timeout.count;
- u32 seq = req->sequence;
+ u32 tail, off = req->timeout.off;
- data = &req->io->timeout;
+ spin_lock_irq(&ctx->completion_lock);
/*
* sqe->off holds how many events that need to occur for this
* timeout event to be satisfied. If it isn't set, then this is
* a pure timeout request, sequence isn't used.
*/
- if (!count) {
+ if (!off) {
req->flags |= REQ_F_TIMEOUT_NOSEQ;
- spin_lock_irq(&ctx->completion_lock);
entry = ctx->timeout_list.prev;
goto add;
}
- req->sequence = seq + count;
+ tail = ctx->cached_cq_tail;
+ req->timeout.target_seq = tail + off;
/*
* Insertion sort, ensuring the first entry in the list is always
* the one we need first.
*/
- spin_lock_irq(&ctx->completion_lock);
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- unsigned nxt_seq;
- long long tmp, tmp_nxt;
- u32 nxt_offset = nxt->timeout.count;
-
- if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
- continue;
-
- /*
- * Since seq + count can overflow, use type long
- * long to store it.
- */
- tmp = (long long)seq + count;
- nxt_seq = nxt->sequence - nxt_offset;
- tmp_nxt = (long long)nxt_seq + nxt_offset;
+ u32 nxt_off = nxt->timeout.target_seq - tail;
- /*
- * cached_sq_head may overflow, and it will never overflow twice
- * once there is some timeout req still be valid.
- */
- if (seq < nxt_seq)
- tmp += UINT_MAX;
-
- if (tmp > tmp_nxt)
+ if (!(nxt->flags & REQ_F_TIMEOUT_NOSEQ) && (off >= nxt_off))
break;
-
- /*
- * Sequence of reqs after the insert one and itself should
- * be adjusted because each timeout req consumes a slot.
- */
- span++;
- nxt->sequence++;
}
- req->sequence -= span;
add:
list_add(&req->list, entry);
data->timer.function = io_timeout_fn;
--
2.24.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 2/3] io_uring: don't trigger timeout with another t-out
2020-05-02 12:07 [PATCH 0/3] timeout fixes Pavel Begunkov
2020-05-02 12:07 ` [PATCH 1/3] io_uring: trigger timeout after any sqe->off CQEs Pavel Begunkov
@ 2020-05-02 12:07 ` Pavel Begunkov
2020-05-02 12:07 ` [PATCH 3/3] io_uring: fix timeout offset with batch CQ commit Pavel Begunkov
2 siblings, 0 replies; 4+ messages in thread
From: Pavel Begunkov @ 2020-05-02 12:07 UTC (permalink / raw)
To: Jens Axboe, io-uring, linux-kernel
When deciding whether to fire a timeout basing on number of completions,
ignore CQEs emitted by other timeouts.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
fs/io_uring.c | 19 +++----------------
1 file changed, 3 insertions(+), 16 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 148f61734572..827b0b967dc7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1102,33 +1102,20 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
-static inline bool io_check_in_range(u32 pos, u32 start, u32 end)
-{
- /* if @end < @start, check for [end, MAX_UINT] + [MAX_UINT, start] */
- return (pos - start) <= (end - start);
-}
-
static void __io_flush_timeouts(struct io_ring_ctx *ctx)
{
- u32 end, start;
-
- start = end = ctx->cached_cq_tail;
do {
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
struct io_kiocb, list);
if (req->flags & REQ_F_TIMEOUT_NOSEQ)
break;
- /*
- * multiple timeouts may have the same target,
- * check that @req is in [first_tail, cur_tail]
- */
- if (!io_check_in_range(req->timeout.target_seq, start, end))
+ if (req->timeout.target_seq != ctx->cached_cq_tail
+ - atomic_read(&ctx->cq_timeouts))
break;
list_del_init(&req->list);
io_kill_timeout(req);
- end = ctx->cached_cq_tail;
} while (!list_empty(&ctx->timeout_list));
}
@@ -4730,7 +4717,7 @@ static int io_timeout(struct io_kiocb *req)
goto add;
}
- tail = ctx->cached_cq_tail;
+ tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
req->timeout.target_seq = tail + off;
/*
--
2.24.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
* [PATCH 3/3] io_uring: fix timeout offset with batch CQ commit
2020-05-02 12:07 [PATCH 0/3] timeout fixes Pavel Begunkov
2020-05-02 12:07 ` [PATCH 1/3] io_uring: trigger timeout after any sqe->off CQEs Pavel Begunkov
2020-05-02 12:07 ` [PATCH 2/3] io_uring: don't trigger timeout with another t-out Pavel Begunkov
@ 2020-05-02 12:07 ` Pavel Begunkov
2 siblings, 0 replies; 4+ messages in thread
From: Pavel Begunkov @ 2020-05-02 12:07 UTC (permalink / raw)
To: Jens Axboe, io-uring, linux-kernel
Completions may be done in batches, where io_commit_cqring() is called
only once at the end. It means, that timeout sequence checks are done
only once and don't consider events in between, so potentially failing
to trigger some timeouts.
Do a separate CQ sequence accounting in u64. On timeout sequence
checking look up to UINT_MAX sequence behind, which it could have
missed. It's safe to do, because sqe->off is u32 and so can't wrap
around to used [seq - UINT_MAX, seq] window.
It's also necessary to decouple CQ timeout sequences from
ctx->cq_cached_tail for implementing "single CQE per link" feature and
others.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
fs/io_uring.c | 33 ++++++++++++++++++++++++++-------
1 file changed, 26 insertions(+), 7 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 827b0b967dc7..4ed82d39540b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -298,6 +298,7 @@ struct io_ring_ctx {
unsigned cq_entries;
unsigned cq_mask;
atomic_t cq_timeouts;
+ u64 cq_seq;
unsigned long cq_check_overflow;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
@@ -385,7 +386,7 @@ struct io_timeout {
u64 addr;
int flags;
u32 off;
- u32 target_seq;
+ u64 target_seq;
};
struct io_rw {
@@ -1085,6 +1086,7 @@ static void io_kill_timeout(struct io_kiocb *req)
ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts);
+ req->ctx->cq_seq--;
list_del_init(&req->list);
req->flags |= REQ_F_COMP_LOCKED;
io_cqring_fill_event(req, 0);
@@ -1102,16 +1104,31 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->completion_lock);
}
+static inline bool io_check_in_range(u64 pos, u64 start, u64 end)
+{
+ /* if @end < @start, check for [end, MAX_U64] + [MAX_U64, start] */
+ return (pos - start) <= (end - start);
+}
+
static void __io_flush_timeouts(struct io_ring_ctx *ctx)
{
+ u64 start_seq = ctx->cq_seq;
+
+
+ /*
+ * Batched CQ commit may have left some pending timeouts sequences
+ * behind @cq_sqe. Look back to find them. Note, that sqe->off is u32,
+ * and it uses u64 to not falsely trigger timeouts with large off.
+ */
+ start_seq -= UINT_MAX;
do {
struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
struct io_kiocb, list);
if (req->flags & REQ_F_TIMEOUT_NOSEQ)
break;
- if (req->timeout.target_seq != ctx->cached_cq_tail
- - atomic_read(&ctx->cq_timeouts))
+ if (!io_check_in_range(req->timeout.target_seq, start_seq,
+ ctx->cq_seq))
break;
list_del_init(&req->list);
@@ -1147,6 +1164,7 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
return NULL;
ctx->cached_cq_tail++;
+ ctx->cq_seq++;
return &rings->cqes[tail & ctx->cq_mask];
}
@@ -4579,6 +4597,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
atomic_inc(&ctx->cq_timeouts);
spin_lock_irqsave(&ctx->completion_lock, flags);
+ ctx->cq_seq--;
+
/*
* We could be racing with timeout deletion. If the list is empty,
* then timeout lookup already found it and will be handling it.
@@ -4702,7 +4722,7 @@ static int io_timeout(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
struct io_timeout_data *data = &req->io->timeout;
struct list_head *entry;
- u32 tail, off = req->timeout.off;
+ u32 off = req->timeout.off;
spin_lock_irq(&ctx->completion_lock);
@@ -4717,8 +4737,7 @@ static int io_timeout(struct io_kiocb *req)
goto add;
}
- tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
- req->timeout.target_seq = tail + off;
+ req->timeout.target_seq = ctx->cq_seq + off;
/*
* Insertion sort, ensuring the first entry in the list is always
@@ -4726,7 +4745,7 @@ static int io_timeout(struct io_kiocb *req)
*/
list_for_each_prev(entry, &ctx->timeout_list) {
struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
- u32 nxt_off = nxt->timeout.target_seq - tail;
+ u32 nxt_off = (u32)(nxt->timeout.target_seq - ctx->cq_seq);
if (!(nxt->flags & REQ_F_TIMEOUT_NOSEQ) && (off >= nxt_off))
break;
--
2.24.0
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2020-05-02 12:09 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-05-02 12:07 [PATCH 0/3] timeout fixes Pavel Begunkov
2020-05-02 12:07 ` [PATCH 1/3] io_uring: trigger timeout after any sqe->off CQEs Pavel Begunkov
2020-05-02 12:07 ` [PATCH 2/3] io_uring: don't trigger timeout with another t-out Pavel Begunkov
2020-05-02 12:07 ` [PATCH 3/3] io_uring: fix timeout offset with batch CQ commit Pavel Begunkov
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.