From: Jens Axboe <axboe@kernel.dk>
To: Usama Arif <usama.arif@bytedance.com>,
io-uring@vger.kernel.org, asml.silence@gmail.com,
linux-kernel@vger.kernel.org
Cc: fam.zheng@bytedance.com
Subject: Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
Date: Wed, 2 Feb 2022 12:18:51 -0700 [thread overview]
Message-ID: <7df1059c-6151-29c8-9ed5-0bc0726c362d@kernel.dk> (raw)
In-Reply-To: <86ae792e-d138-112e-02bb-ab70e3c2a147@kernel.dk>
On 2/2/22 9:57 AM, Jens Axboe wrote:
> On 2/2/22 8:59 AM, Usama Arif wrote:
>> Acquire completion_lock at the start of __io_uring_register before
>> registering/unregistering eventfd and release it at the end. Hence
>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>> will finish before acquiring the spin_lock in io_uring_register, and
>> all new calls will wait till the eventfd is registered. This avoids
>> ring quiesce which is much more expensive than acquiring the
>> spin_lock.
>>
>> On the system tested with this patch, io_uring_reigster with
>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
>
> This seems like optimizing for the wrong thing, so I've got a few
> questions. Are you doing a lot of eventfd registrations (and
> unregister) in your workload? Or is it just the initial pain of
> registering one? In talking to Pavel, he suggested that RCU might be a
> good use case here, and I think so too. That would still remove the
> need to quiesce, and the posted side just needs a fairly cheap rcu
> read lock/unlock around it.
Totally untested, but perhaps can serve as a starting point or
inspiration.
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 64c055421809..195752f4823f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -329,6 +329,12 @@ struct io_submit_state {
struct blk_plug plug;
};
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ struct io_ring_ctx *ctx;
+ struct rcu_head rcu;
+};
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -412,7 +418,7 @@ struct io_ring_ctx {
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- struct eventfd_ctx *cq_ev_fd;
+ struct io_ev_fd *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -1741,13 +1747,27 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
- if (likely(!ctx->cq_ev_fd))
+ if (likely(!ctx->io_ev_fd))
return false;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return false;
return !ctx->eventfd_async || io_wq_current_is_worker();
}
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd;
+
+ if (!io_should_trigger_evfd(ctx))
+ return;
+
+ rcu_read_lock();
+ ev_fd = READ_ONCE(ctx->io_ev_fd);
+ if (ev_fd)
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+ rcu_read_unlock();
+}
+
/*
* This should only get called when at least one event has been posted.
* Some applications rely on the eventfd notification count only changing
@@ -1764,8 +1784,7 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+ io_eventfd_signal(ctx);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1777,8 +1796,7 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
if (waitqueue_active(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
}
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+ io_eventfd_signal(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -9569,31 +9587,49 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
{
+ struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
- if (ctx->cq_ev_fd)
+ if (ctx->io_ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
- ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ctx->cq_ev_fd)) {
- int ret = PTR_ERR(ctx->cq_ev_fd);
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
+
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
- ctx->cq_ev_fd = NULL;
+ kfree(ev_fd);
return ret;
}
+ ev_fd->ctx = ctx;
+ WRITE_ONCE(ctx->io_ev_fd, ev_fd);
return 0;
}
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+ struct io_ring_ctx *ctx = ev_fd->ctx;
+
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+ WRITE_ONCE(ctx->io_ev_fd, NULL);
+}
+
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
- if (ctx->cq_ev_fd) {
- eventfd_ctx_put(ctx->cq_ev_fd);
- ctx->cq_ev_fd = NULL;
+ struct io_ev_fd *ev_fd = ctx->io_ev_fd;
+
+ if (ev_fd) {
+ call_rcu(&ev_fd->rcu, io_eventfd_put);
return 0;
}
@@ -9659,7 +9695,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
mutex_unlock(&ctx->uring_lock);
- io_eventfd_unregister(ctx);
+ if (ctx->io_ev_fd) {
+ eventfd_ctx_put(ctx->io_ev_fd->cq_ev_fd);
+ kfree(ctx->io_ev_fd);
+ }
io_destroy_buffers(ctx);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
@@ -11209,6 +11248,8 @@ static bool io_register_op_must_quiesce(int op)
case IORING_UNREGISTER_IOWQ_AFF:
case IORING_REGISTER_IOWQ_MAX_WORKERS:
case IORING_REGISTER_MAP_BUFFERS:
+ case IORING_REGISTER_EVENTFD:
+ case IORING_UNREGISTER_EVENTFD:
return false;
default:
return true;
@@ -11423,7 +11464,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
- ctx->cq_ev_fd != NULL, ret);
+ ctx->io_ev_fd != NULL, ret);
out_fput:
fdput(f);
return ret;
--
Jens Axboe
next prev parent reply other threads:[~2022-02-02 19:18 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-02-02 15:59 [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd Usama Arif
2022-02-02 16:57 ` Jens Axboe
2022-02-02 18:32 ` Pavel Begunkov
2022-02-02 18:39 ` Pavel Begunkov
2022-02-02 19:18 ` Jens Axboe [this message]
2022-02-03 15:14 ` [External] " Usama Arif
2022-02-03 15:44 ` Pavel Begunkov
2022-02-03 15:55 ` Pavel Begunkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=7df1059c-6151-29c8-9ed5-0bc0726c362d@kernel.dk \
--to=axboe@kernel.dk \
--cc=asml.silence@gmail.com \
--cc=fam.zheng@bytedance.com \
--cc=io-uring@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=usama.arif@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.