All of lore.kernel.org
 help / color / mirror / Atom feed
From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-nvme@lists.infradead.org
Cc: kbusch@kernel.org, hch@lst.de, hare@suse.de, sagi@grimberg.me,
	chaitanyak@nvidia.com, gjoyce@linux.ibm.com,
	Nilay Shroff <nilay@linux.ibm.com>
Subject: [RFC PATCH 1/4] nvme-tcp: optionally limit I/O queue count based on NIC queues
Date: Mon, 20 Apr 2026 17:19:33 +0530	[thread overview]
Message-ID: <20260420115716.3071293-2-nilay@linux.ibm.com> (raw)
In-Reply-To: <20260420115716.3071293-1-nilay@linux.ibm.com>

NVMe-TCP currently provisions I/O queues primarily based on CPU
availability. On systems where the number of CPUs significantly exceeds
the number of NIC hardware queues, this can lead to multiple I/O queues
sharing the same NIC TX/RX queues, resulting in increased lock
contention, cacheline bouncing, and inter-processor interrupts (IPIs).

In such configurations, limiting the number of NVMe-TCP I/O queues to
the number of NIC hardware queues can improve performance by reducing
contention and improving locality. Aligning NVMe-TCP worker threads with
NIC queue topology may also help reduce tail latency.

Add a new transport option "match_hw_queues" to allow users to
optionally limit the number of NVMe-TCP I/O queues to the number of NIC
TX/RX queues. When enabled, the number of I/O queues is set to:

    min(num_online_cpus, num_nic_queues)

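This behavior is opt-in and does not change existing defaults. An explicit
nr_io_queues setting takes precedence: when it is given, match_hw_queues
does not alter the I/O queue count.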

Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
 drivers/nvme/host/fabrics.c |   4 ++
 drivers/nvme/host/fabrics.h |   3 +
 drivers/nvme/host/tcp.c     | 120 +++++++++++++++++++++++++++++++++++-
 3 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index ac3d4f400601..62ae998825e1 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -709,6 +709,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TLS,			"tls"			},
 	{ NVMF_OPT_CONCAT,		"concat"		},
 #endif
+	{ NVMF_OPT_MATCH_HW_QUEUES,	"match_hw_queues"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -1064,6 +1065,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			}
 			opts->concat = true;
 			break;
+		case NVMF_OPT_MATCH_HW_QUEUES:
+			opts->match_hw_queues = true;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index caf5503d0833..e8e3a2672832 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -67,6 +67,7 @@ enum {
 	NVMF_OPT_KEYRING	= 1 << 26,
 	NVMF_OPT_TLS_KEY	= 1 << 27,
 	NVMF_OPT_CONCAT		= 1 << 28,
+	NVMF_OPT_MATCH_HW_QUEUES = 1 << 29,
 };
 
 /**
@@ -106,6 +107,7 @@ enum {
  * @disable_sqflow: disable controller sq flow control
  * @hdr_digest: generate/verify header digest (TCP)
  * @data_digest: generate/verify data digest (TCP)
+ * @match_hw_queues: limit controller IO queue count based on NIC queues (TCP)
  * @nr_write_queues: number of queues for write I/O
  * @nr_poll_queues: number of queues for polling I/O
  * @tos: type of service
@@ -136,6 +138,7 @@ struct nvmf_ctrl_options {
 	bool			disable_sqflow;
 	bool			hdr_digest;
 	bool			data_digest;
+	bool			match_hw_queues;
 	unsigned int		nr_write_queues;
 	unsigned int		nr_poll_queues;
 	int			tos;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 243dab830dc8..7102a7a54d78 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -16,6 +16,8 @@
 #include <net/tls.h>
 #include <net/tls_prot.h>
 #include <net/handshake.h>
+#include <net/ip6_route.h>
+#include <linux/in6.h>
 #include <linux/blk-mq.h>
 #include <net/busy_poll.h>
 #include <trace/events/sock.h>
@@ -1762,6 +1764,103 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
 	return ret;
 }
 
+/*
+ * Resolve the net_device this controller's traffic is expected to use:
+ * host_iface when given, otherwise the egress device of a route lookup
+ * towards the target address (using host_traddr as source when set).
+ * Returns a held device (drop it with nvme_tcp_put_netdev()) or NULL if
+ * none could be determined.
+ * NOTE(review): lookups use init_net; confirm this matches the socket netns.
+ */
+static struct net_device *nvme_tcp_get_netdev(struct nvme_ctrl *ctrl)
+{
+	struct nvme_tcp_ctrl *tctrl = to_tcp_ctrl(ctrl);
+	struct net_device *dev = NULL;
+
+	if (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)
+		return dev_get_by_name(&init_net, ctrl->opts->host_iface);
+
+	if (tctrl->addr.ss_family == AF_INET) {
+		struct sockaddr_in *addr = (struct sockaddr_in *)&tctrl->addr;
+		struct flowi4 fl4 = {};
+		struct rtable *rt;
+
+		fl4.daddr = addr->sin_addr.s_addr;
+		if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+			addr = (struct sockaddr_in *)&tctrl->src_addr;
+			fl4.saddr = addr->sin_addr.s_addr;
+		}
+		fl4.flowi4_proto = IPPROTO_TCP;
+
+		rt = ip_route_output_key(&init_net, &fl4);
+		if (IS_ERR(rt))
+			return NULL;
+
+		/* Pin the device before dropping the route that references it. */
+		dev = dst_dev(&rt->dst);
+		dev_hold(dev);
+		ip_rt_put(rt);
+	} else if (IS_REACHABLE(CONFIG_IPV6) &&
+		   tctrl->addr.ss_family == AF_INET6) {
+		/*
+		 * IS_REACHABLE() keeps the IPv6 routing symbols out of builds
+		 * where they cannot be linked (IPv6 disabled, or modular IPv6
+		 * with this code built in); such builds return NULL here.
+		 */
+		struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&tctrl->addr;
+		struct flowi6 fl6 = {};
+		struct dst_entry *dst;
+
+		fl6.daddr = addr6->sin6_addr;
+		if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+			addr6 = (struct sockaddr_in6 *)&tctrl->src_addr;
+			fl6.saddr = addr6->sin6_addr;
+		}
+		fl6.flowi6_proto = IPPROTO_TCP;
+
+		/* Lookup failure is reported through dst->error. */
+		dst = ip6_route_output(&init_net, NULL, &fl6);
+		if (dst->error) {
+			dst_release(dst);
+			return NULL;
+		}
+
+		/* Pin the device before dropping the dst that references it. */
+		dev = dst_dev(dst);
+		dev_hold(dev);
+		dst_release(dst);
+	}
+
+	return dev;
+}
+
+static void nvme_tcp_put_netdev(struct net_device *dev)
+{
+	/* dev_put() ignores NULL, so a failed lookup can be passed straight in. */
+	dev_put(dev);
+}
+
+/*
+ * Report how many queues the controller's NIC currently has in use, taken
+ * as the smaller of its real TX and RX queue counts. Returns 0 when no
+ * net_device could be resolved for the controller.
+ */
+static int nvme_tcp_get_netdev_current_queue_count(struct nvme_ctrl *ctrl)
+{
+	struct net_device *ndev = nvme_tcp_get_netdev(ctrl);
+	int nr_tx, nr_rx;
+
+	if (!ndev)
+		return 0;
+
+	/* Sample both counts while the device reference is still held. */
+	nr_tx = ndev->real_num_tx_queues;
+	nr_rx = ndev->real_num_rx_queues;
+	nvme_tcp_put_netdev(ndev);
+
+	return nr_tx < nr_rx ? nr_tx : nr_rx;
+}
+
 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 				key_serial_t pskid)
 {
@@ -2144,6 +2243,24 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
 	unsigned int nr_io_queues;
 	int ret;
 
+	if (!(ctrl->opts->mask & NVMF_OPT_NR_IO_QUEUES) &&
+			(ctrl->opts->mask & NVMF_OPT_MATCH_HW_QUEUES)) {
+		int nr_hw_queues;
+
+		nr_hw_queues = nvme_tcp_get_netdev_current_queue_count(ctrl);
+		if (nr_hw_queues <= 0)
+			goto init_queue;
+
+		ctrl->opts->nr_io_queues = min(nr_hw_queues, num_online_cpus());
+
+		if (ctrl->opts->nr_io_queues < num_online_cpus())
+			dev_info(ctrl->device,
+				"limiting I/O queues to %u (NIC queues %d, CPUs %u)\n",
+				ctrl->opts->nr_io_queues, nr_hw_queues,
+				num_online_cpus());
+	}
+
+init_queue:
 	nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
 	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
 	if (ret)
@@ -3019,7 +3136,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
 			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
 			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
-			  NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT,
+			  NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY |
+			  NVMF_OPT_CONCAT | NVMF_OPT_MATCH_HW_QUEUES,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };
 
-- 
2.53.0



  reply	other threads:[~2026-04-20 11:57 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-20 11:49 [RFC PATCH 0/4] nvme-tcp: NIC topology aware I/O queue scaling and queue info export Nilay Shroff
2026-04-20 11:49 ` Nilay Shroff [this message]
2026-04-24 13:46   ` [RFC PATCH 1/4] nvme-tcp: optionally limit I/O queue count based on NIC queues Christoph Hellwig
2026-04-27  7:37     ` Nilay Shroff
2026-04-24 22:10   ` Sagi Grimberg
2026-04-27 11:57     ` Nilay Shroff
2026-04-20 11:49 ` [RFC PATCH 2/4] nvme-tcp: add a diagnostic message when NIC queues are underutilized Nilay Shroff
2026-04-24 22:15   ` Sagi Grimberg
2026-04-27 12:14     ` Nilay Shroff
2026-04-20 11:49 ` [RFC PATCH 3/4] nvme: add debugfs helpers for NVMe drivers Nilay Shroff
2026-04-20 11:49 ` [RFC PATCH 4/4] nvme: expose queue information via debugfs Nilay Shroff
2026-04-24 22:23   ` Sagi Grimberg
2026-04-27 12:12     ` Nilay Shroff
2026-04-22 11:10 ` [RFC PATCH 0/4] nvme-tcp: NIC topology aware I/O queue scaling and queue info export Hannes Reinecke
2026-04-24 22:30   ` Sagi Grimberg
2026-04-27 12:11     ` Nilay Shroff
2026-04-27  6:13   ` Nilay Shroff

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260420115716.3071293-2-nilay@linux.ibm.com \
    --to=nilay@linux.ibm.com \
    --cc=chaitanyak@nvidia.com \
    --cc=gjoyce@linux.ibm.com \
    --cc=hare@suse.de \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.