Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Jens Axboe <axboe@kernel.dk>
To: Usama Arif <usama.arif@bytedance.com>,
	io-uring@vger.kernel.org, asml.silence@gmail.com,
	linux-kernel@vger.kernel.org
Cc: fam.zheng@bytedance.com
Subject: Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd
Date: Wed, 2 Feb 2022 12:18:51 -0700	[thread overview]
Message-ID: <7df1059c-6151-29c8-9ed5-0bc0726c362d@kernel.dk> (raw)
In-Reply-To: <86ae792e-d138-112e-02bb-ab70e3c2a147@kernel.dk>

On 2/2/22 9:57 AM, Jens Axboe wrote:
> On 2/2/22 8:59 AM, Usama Arif wrote:
>> Acquire completion_lock at the start of __io_uring_register before
>> registering/unregistering eventfd and release it at the end. Hence
>> all calls to io_cqring_ev_posted which adds to the eventfd counter
>> will finish before acquiring the spin_lock in io_uring_register, and
>> all new calls will wait till the eventfd is registered. This avoids
>> ring quiesce which is much more expensive than acquiring the
>> spin_lock.
>>
>> On the system tested with this patch, io_uring_reigster with
>> IORING_REGISTER_EVENTFD takes less than 1ms, compared to 15ms before.
> 
> This seems like optimizing for the wrong thing, so I've got a few
> questions. Are you doing a lot of eventfd registrations (and
> unregister) in your workload? Or is it just the initial pain of
> registering one? In talking to Pavel, he suggested that RCU might be a
> good use case here, and I think so too. That would still remove the
> need to quiesce, and the posted side just needs a fairly cheap rcu
> read lock/unlock around it.

Totally untested, but perhaps can serve as a starting point or
inspiration.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 64c055421809..195752f4823f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -329,6 +329,12 @@ struct io_submit_state {
 	struct blk_plug		plug;
 };
 
+struct io_ev_fd {
+	struct eventfd_ctx	*cq_ev_fd;
+	struct io_ring_ctx	*ctx;
+	struct rcu_head		rcu;
+};
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
@@ -412,7 +418,7 @@ struct io_ring_ctx {
 	struct {
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
-		struct eventfd_ctx	*cq_ev_fd;
+		struct io_ev_fd		*io_ev_fd;
 		struct wait_queue_head	cq_wait;
 		unsigned		cq_extra;
 		atomic_t		cq_timeouts;
@@ -1741,13 +1747,27 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 
 static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
 {
-	if (likely(!ctx->cq_ev_fd))
+	if (likely(!ctx->io_ev_fd))
 		return false;
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 		return false;
 	return !ctx->eventfd_async || io_wq_current_is_worker();
 }
 
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+	struct io_ev_fd *ev_fd;
+
+	if (!io_should_trigger_evfd(ctx))
+		return;
+
+	rcu_read_lock();
+	ev_fd = READ_ONCE(ctx->io_ev_fd);
+	if (ev_fd)
+		eventfd_signal(ev_fd->cq_ev_fd, 1);
+	rcu_read_unlock();
+}
+
 /*
  * This should only get called when at least one event has been posted.
  * Some applications rely on the eventfd notification count only changing
@@ -1764,8 +1784,7 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 	 */
 	if (wq_has_sleeper(&ctx->cq_wait))
 		wake_up_all(&ctx->cq_wait);
-	if (io_should_trigger_evfd(ctx))
-		eventfd_signal(ctx->cq_ev_fd, 1);
+	io_eventfd_signal(ctx);
 }
 
 static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
@@ -1777,8 +1796,7 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
 		if (waitqueue_active(&ctx->cq_wait))
 			wake_up_all(&ctx->cq_wait);
 	}
-	if (io_should_trigger_evfd(ctx))
-		eventfd_signal(ctx->cq_ev_fd, 1);
+	io_eventfd_signal(ctx);
 }
 
 /* Returns true if there are no backlogged entries after the flush */
@@ -9569,31 +9587,49 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 
 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
 {
+	struct io_ev_fd *ev_fd;
 	__s32 __user *fds = arg;
 	int fd;
 
-	if (ctx->cq_ev_fd)
+	if (ctx->io_ev_fd)
 		return -EBUSY;
 
 	if (copy_from_user(&fd, fds, sizeof(*fds)))
 		return -EFAULT;
 
-	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
-	if (IS_ERR(ctx->cq_ev_fd)) {
-		int ret = PTR_ERR(ctx->cq_ev_fd);
+	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+	if (!ev_fd)
+		return -ENOMEM;
+
+	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+	if (IS_ERR(ev_fd->cq_ev_fd)) {
+		int ret = PTR_ERR(ev_fd->cq_ev_fd);
 
-		ctx->cq_ev_fd = NULL;
+		kfree(ev_fd);
 		return ret;
 	}
 
+	ev_fd->ctx = ctx;
+	WRITE_ONCE(ctx->io_ev_fd, ev_fd);
 	return 0;
 }
 
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+	struct io_ring_ctx *ctx = ev_fd->ctx;
+
+	eventfd_ctx_put(ev_fd->cq_ev_fd);
+	kfree(ev_fd);
+	WRITE_ONCE(ctx->io_ev_fd, NULL);
+}
+
 static int io_eventfd_unregister(struct io_ring_ctx *ctx)
 {
-	if (ctx->cq_ev_fd) {
-		eventfd_ctx_put(ctx->cq_ev_fd);
-		ctx->cq_ev_fd = NULL;
+	struct io_ev_fd *ev_fd = ctx->io_ev_fd;
+
+	if (ev_fd) {
+		call_rcu(&ev_fd->rcu, io_eventfd_put);
 		return 0;
 	}
 
@@ -9659,7 +9695,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (ctx->rings)
 		__io_cqring_overflow_flush(ctx, true);
 	mutex_unlock(&ctx->uring_lock);
-	io_eventfd_unregister(ctx);
+	if (ctx->io_ev_fd) {
+		eventfd_ctx_put(ctx->io_ev_fd->cq_ev_fd);
+		kfree(ctx->io_ev_fd);
+	}
 	io_destroy_buffers(ctx);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -11209,6 +11248,8 @@ static bool io_register_op_must_quiesce(int op)
 	case IORING_UNREGISTER_IOWQ_AFF:
 	case IORING_REGISTER_IOWQ_MAX_WORKERS:
 	case IORING_REGISTER_MAP_BUFFERS:
+	case IORING_REGISTER_EVENTFD:
+	case IORING_UNREGISTER_EVENTFD:
 		return false;
 	default:
 		return true;
@@ -11423,7 +11464,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
 	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
-							ctx->cq_ev_fd != NULL, ret);
+							ctx->io_ev_fd != NULL, ret);
 out_fput:
 	fdput(f);
 	return ret;

-- 
Jens Axboe

next prev parent reply	other threads:[~2022-02-02 19:18 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-02-02 15:59 [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd Usama Arif
2022-02-02 16:57 ` Jens Axboe
2022-02-02 18:32   ` Pavel Begunkov
2022-02-02 18:39     ` Pavel Begunkov
2022-02-02 19:18   ` Jens Axboe [this message]
2022-02-03 15:14     ` [External] " Usama Arif
2022-02-03 15:44       ` Pavel Begunkov
2022-02-03 15:55         ` Pavel Begunkov

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:64c05542180 dfblob:195752f4823 )
 OR (
bs:"Re: [RFC] io_uring: avoid ring quiesce while registering/unregistering eventfd" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=7df1059c-6151-29c8-9ed5-0bc0726c362d@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=asml.silence@gmail.com \
    --cc=fam.zheng@bytedance.com \
    --cc=io-uring@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=usama.arif@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.