From: Hannes Reinecke <hare@kernel.org>
To: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>, Keith Busch <kbusch@kernel.org>,
linux-nvme@lists.infradead.org, Hannes Reinecke <hare@kernel.org>
Subject: [PATCH 8/8] nvme-tcp: align I/O cpu with blk-mq mapping
Date: Tue, 16 Jul 2024 09:36:16 +0200 [thread overview]
Message-ID: <20240716073616.84417-9-hare@kernel.org> (raw)
In-Reply-To: <20240716073616.84417-1-hare@kernel.org>
We should align the 'io_cpu' setting with the blk-mq
cpu mapping to ensure that we're not bouncing threads
when doing I/O. To avoid cpu contention this patch also
adds an atomic counter for the number of queues on each
cpu to distribute the load across all CPUs in the blk-mq cpu set.
Additionally we should always set the 'io_cpu' value, as
in the WQ_UNBOUND case it'll be treated as a hint anyway.
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
drivers/nvme/host/tcp.c | 65 +++++++++++++++++++++++++++++++----------
1 file changed, 49 insertions(+), 16 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index f3a94168b2c3..a391a3f7c4d7 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -28,6 +28,8 @@
struct nvme_tcp_queue;
+static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
+
/* Define the socket priority to use for connections were it is desirable
* that the NIC consider performing optimized packet processing or filtering.
* A non-zero value being sufficient to indicate general consideration of any
@@ -1799,20 +1801,42 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
- int qid = nvme_tcp_queue_id(queue);
- int n = 0;
-
- if (nvme_tcp_default_queue(queue))
- n = qid - 1;
- else if (nvme_tcp_read_queue(queue))
- n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
- else if (nvme_tcp_poll_queue(queue))
- n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
- ctrl->io_queues[HCTX_TYPE_READ] - 1;
- if (wq_unbound)
- queue->io_cpu = WORK_CPU_UNBOUND;
- else
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+ struct blk_mq_tag_set *set = &ctrl->tag_set;
+ int qid = nvme_tcp_queue_id(queue) - 1;
+ unsigned int *mq_map = NULL;
+ int cpu, io_cpu = WORK_CPU_UNBOUND, min_queues = INT_MAX;
+
+ /*
+ * mq_map[] entries carry the map's queue_offset, so they hold the
+ * global 0-based hw queue index and compare directly against qid.
+ */
+ if (nvme_tcp_default_queue(queue))
+ mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
+ else if (nvme_tcp_read_queue(queue))
+ mq_map = set->map[HCTX_TYPE_READ].mq_map;
+ else if (nvme_tcp_poll_queue(queue))
+ mq_map = set->map[HCTX_TYPE_POLL].mq_map;
+
+ if (WARN_ON(!mq_map))
+ return;
+
+ /* Pick the least-loaded online cpu that blk-mq maps to this queue */
+ for_each_online_cpu(cpu) {
+ int num_queues;
+
+ if (mq_map[cpu] != qid)
+ continue;
+ num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
+ if (num_queues < min_queues) {
+ min_queues = num_queues;
+ io_cpu = cpu;
+ }
+ }
+ /*
+ * If no matching cpu is online leave io_cpu as WORK_CPU_UNBOUND;
+ * only account queues that were actually pinned to a cpu, and never
+ * double-account a queue that is already pinned there.
+ */
+ if (io_cpu != WORK_CPU_UNBOUND && io_cpu != queue->io_cpu) {
+ queue->io_cpu = io_cpu;
+ atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
+ }
+ dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
+ qid, queue->io_cpu);
}
static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
@@ -1957,7 +1981,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
queue->sock->sk->sk_allocation = GFP_ATOMIC;
queue->sock->sk->sk_use_task_frag = false;
- nvme_tcp_set_queue_io_cpu(queue);
+ queue->io_cpu = WORK_CPU_UNBOUND;
queue->request = NULL;
queue->data_remaining = 0;
queue->ddgst_remaining = 0;
@@ -2088,6 +2112,10 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
nvme_tcp_restore_sock_ops(queue);
cancel_work_sync(&queue->io_work);
+ if (queue->io_cpu != WORK_CPU_UNBOUND) {
+ atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
+ queue->io_cpu = WORK_CPU_UNBOUND;
+ }
}
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
@@ -2133,9 +2161,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
nvme_tcp_init_recv_ctx(queue);
nvme_tcp_setup_sock_ops(queue);
- if (idx)
+ if (idx) {
+ nvme_tcp_set_queue_io_cpu(queue);
ret = nvmf_connect_io_queue(nctrl, idx);
- else
+ } else
ret = nvmf_connect_admin_queue(nctrl);
if (!ret) {
@@ -3179,6 +3208,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
static int __init nvme_tcp_init_module(void)
{
unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+ int cpu;
BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
@@ -3200,6 +3230,9 @@ static int __init nvme_tcp_init_module(void)
if (!nvme_tcp_debugfs)
return -ENOMEM;
+ for_each_possible_cpu(cpu)
+ atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
+
nvmf_register_transport(&nvme_tcp_transport);
return 0;
}
--
2.35.3
next prev parent reply other threads:[~2024-07-16 7:37 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-16 7:36 [PATCHv3 0/8] nvme-tcp: improve scalability Hannes Reinecke
2024-07-16 7:36 ` [PATCH 1/8] nvme-tcp: switch TX deadline to microseconds and make it configurable Hannes Reinecke
2024-07-17 21:03 ` Sagi Grimberg
2024-07-18 6:30 ` Hannes Reinecke
2024-07-16 7:36 ` [PATCH 2/8] nvme-tcp: io_work stall debugging Hannes Reinecke
2024-07-17 21:05 ` Sagi Grimberg
2024-07-16 7:36 ` [PATCH 3/8] nvme-tcp: re-init request list entries Hannes Reinecke
2024-07-17 21:23 ` Sagi Grimberg
2024-07-16 7:36 ` [PATCH 4/8] nvme-tcp: improve stall debugging Hannes Reinecke
2024-07-17 21:11 ` Sagi Grimberg
2024-07-16 7:36 ` [PATCH 5/8] nvme-tcp: debugfs entries for latency statistics Hannes Reinecke
2024-07-17 21:14 ` Sagi Grimberg
2024-07-16 7:36 ` [PATCH 6/8] nvme-tcp: reduce callback lock contention Hannes Reinecke
2024-07-17 21:19 ` Sagi Grimberg
2024-07-18 6:42 ` Hannes Reinecke
2024-07-21 11:46 ` Sagi Grimberg
2024-07-16 7:36 ` [PATCH 7/8] nvme-tcp: check for SOCK_NOSPACE before sending Hannes Reinecke
2024-07-17 21:19 ` Sagi Grimberg
2024-07-16 7:36 ` Hannes Reinecke [this message]
2024-07-17 21:34 ` [PATCH 8/8] nvme-tcp: align I/O cpu with blk-mq mapping Sagi Grimberg
2024-08-13 19:36 ` Sagi Grimberg
2024-07-17 21:01 ` [PATCHv3 0/8] nvme-tcp: improve scalability Sagi Grimberg
2024-07-18 6:20 ` Hannes Reinecke
2024-07-21 12:05 ` Sagi Grimberg
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240716073616.84417-9-hare@kernel.org \
--to=hare@kernel.org \
--cc=hch@lst.de \
--cc=kbusch@kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=sagi@grimberg.me \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox