All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jens Axboe <axboe@kernel.dk>
To: io-uring@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 3/7] io_uring/msg_ring: avoid double indirection task_work for data messages
Date: Thu, 30 May 2024 09:23:40 -0600	[thread overview]
Message-ID: <20240530152822.535791-5-axboe@kernel.dk> (raw)
In-Reply-To: <20240530152822.535791-2-axboe@kernel.dk>

If IORING_SETUP_DEFER_TASKRUN is set, then we can't post CQEs remotely
to the target ring. Instead, task_work is queued for the target ring,
which is used to post the CQE. To make matters worse, once the target
CQE has been posted, task_work is then queued with the originator to
fill the completion.

This obviously adds a bunch of overhead and latency. Instead of relying
on generic kernel task_work for this, fill an overflow entry on the
target ring and flag it as such that the target ring will flush it. This
avoids both the task_work for posting the CQE, and it means that the
originator CQE can be filled inline as well.

In local testing, this reduces the latency on the sender side by 5-6x.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/msg_ring.c | 87 ++++++++++++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 32 deletions(-)

diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 15e7bda77d0d..bdb935ef7aa2 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -87,38 +87,61 @@ static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func)
 	return IOU_ISSUE_SKIP_COMPLETE;
 }
 
-static void io_msg_tw_complete(struct callback_head *head)
+static struct io_overflow_cqe *io_alloc_overflow(struct io_ring_ctx *target_ctx)
 {
-	struct io_msg *msg = container_of(head, struct io_msg, tw);
-	struct io_kiocb *req = cmd_to_io_kiocb(msg);
-	struct io_ring_ctx *target_ctx = req->file->private_data;
-	int ret = 0;
-
-	if (current->flags & PF_EXITING) {
-		ret = -EOWNERDEAD;
-	} else {
-		u32 flags = 0;
-
-		if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
-			flags = msg->cqe_flags;
-
-		/*
-		 * If the target ring is using IOPOLL mode, then we need to be
-		 * holding the uring_lock for posting completions. Other ring
-		 * types rely on the regular completion locking, which is
-		 * handled while posting.
-		 */
-		if (target_ctx->flags & IORING_SETUP_IOPOLL)
-			mutex_lock(&target_ctx->uring_lock);
-		if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
-			ret = -EOVERFLOW;
-		if (target_ctx->flags & IORING_SETUP_IOPOLL)
-			mutex_unlock(&target_ctx->uring_lock);
+	bool is_cqe32 = target_ctx->flags & IORING_SETUP_CQE32;
+	size_t cqe_size = sizeof(struct io_overflow_cqe);
+	struct io_overflow_cqe *ocqe;
+
+	if (is_cqe32)
+		cqe_size += sizeof(struct io_uring_cqe);
+
+	ocqe = kmalloc(cqe_size, GFP_ATOMIC | __GFP_ACCOUNT);
+	if (!ocqe)
+		return NULL;
+
+	if (is_cqe32)
+		ocqe->cqe.big_cqe[0] = ocqe->cqe.big_cqe[1] = 0;
+
+	return ocqe;
+}
+
+/*
+ * Entered with the target uring_lock held, and will drop it before
+ * returning. Adds a previously allocated ocqe to the overflow list on
+ * the target, and marks it appropriately for flushing.
+ */
+static void io_msg_add_overflow(struct io_msg *msg,
+				struct io_ring_ctx *target_ctx,
+				struct io_overflow_cqe *ocqe, int ret,
+				u32 flags)
+	__releases(&target_ctx->completion_lock)
+{
+	if (list_empty(&target_ctx->cq_overflow_list)) {
+		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &target_ctx->check_cq);
+		atomic_or(IORING_SQ_TASKRUN, &target_ctx->rings->sq_flags);
 	}
 
-	if (ret < 0)
-		req_set_fail(req);
-	io_req_queue_tw_complete(req, ret);
+	ocqe->cqe.user_data = msg->user_data;
+	ocqe->cqe.res = ret;
+	ocqe->cqe.flags = flags;
+	list_add_tail(&ocqe->list, &target_ctx->cq_overflow_list);
+	spin_unlock(&target_ctx->completion_lock);
+	wake_up_state(target_ctx->submitter_task, TASK_INTERRUPTIBLE);
+}
+
+static int io_msg_fill_remote(struct io_msg *msg, unsigned int issue_flags,
+			      struct io_ring_ctx *target_ctx, u32 flags)
+{
+	struct io_overflow_cqe *ocqe;
+
+	ocqe = io_alloc_overflow(target_ctx);
+	if (!ocqe)
+		return -ENOMEM;
+
+	spin_lock(&target_ctx->completion_lock);
+	io_msg_add_overflow(msg, target_ctx, ocqe, msg->len, flags);
+	return 0;
 }
 
 static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
@@ -135,12 +158,12 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 	if (target_ctx->flags & IORING_SETUP_R_DISABLED)
 		return -EBADFD;
 
-	if (io_msg_need_remote(target_ctx))
-		return io_msg_exec_remote(req, io_msg_tw_complete);
-
 	if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
 		flags = msg->cqe_flags;
 
+	if (io_msg_need_remote(target_ctx))
+		return io_msg_fill_remote(msg, issue_flags, target_ctx, flags);
+
 	ret = -EOVERFLOW;
 	if (target_ctx->flags & IORING_SETUP_IOPOLL) {
 		if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
-- 
2.43.0


  parent reply	other threads:[~2024-05-30 15:28 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-30 15:23 [PATCHSET v2 0/7] Improve MSG_RING DEFER_TASKRUN performance Jens Axboe
2024-05-30 15:23 ` [PATCH 1/7] io_uring/msg_ring: split fd installing into a helper Jens Axboe
2024-05-30 15:23 ` [PATCH 2/7] io_uring/msg_ring: tighten requirement for remote posting Jens Axboe
2024-05-30 15:23 ` Jens Axboe [this message]
2024-05-30 15:23 ` [PATCH 4/7] io_uring/msg_ring: avoid double indirection task_work for fd passing Jens Axboe
2024-05-30 15:23 ` [PATCH 5/7] io_uring/msg_ring: add an alloc cache for CQE entries Jens Axboe
2024-05-30 15:23 ` [PATCH 6/7] io_uring/msg_ring: remove callback_head from struct io_msg Jens Axboe
2024-05-30 15:23 ` [PATCH 7/7] io_uring/msg_ring: remove non-remote message passing Jens Axboe
2024-06-03 13:53 ` [PATCHSET v2 0/7] Improve MSG_RING DEFER_TASKRUN performance Pavel Begunkov
2024-06-04 18:57   ` Jens Axboe
2024-06-04 19:55     ` Jens Axboe
2024-06-05 15:50     ` Pavel Begunkov
2024-06-05 16:41       ` Jens Axboe
2024-06-05 19:20         ` Pavel Begunkov
2024-06-05 19:36           ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240530152822.535791-5-axboe@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=io-uring@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.