public inbox for linux-nvme@lists.infradead.org
 help / color / mirror / Atom feed
From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-nvme@lists.infradead.org
Cc: kbusch@kernel.org, hch@lst.de, hare@suse.de, sagi@grimberg.me,
	chaitanyak@nvidia.com, gjoyce@linux.ibm.com,
	Nilay Shroff <nilay@linux.ibm.com>
Subject: [RFC PATCH 1/4] nvme-tcp: optionally limit I/O queue count based on NIC queues
Date: Mon, 20 Apr 2026 17:19:33 +0530	[thread overview]
Message-ID: <20260420115716.3071293-2-nilay@linux.ibm.com> (raw)
In-Reply-To: <20260420115716.3071293-1-nilay@linux.ibm.com>

NVMe-TCP currently provisions I/O queues primarily based on CPU
availability. On systems where the number of CPUs significantly exceeds
the number of NIC hardware queues, this can lead to multiple I/O queues
sharing the same NIC TX/RX queues, resulting in increased lock
contention, cacheline bouncing, and inter-processor interrupts (IPIs).

In such configurations, limiting the number of NVMe-TCP I/O queues to
the number of NIC hardware queues can improve performance by reducing
contention and improving locality. Aligning NVMe-TCP worker threads with
NIC queue topology may also help reduce tail latency.

Add a new transport option "match_hw_queues" to allow users to
optionally limit the number of NVMe-TCP I/O queues to the number of NIC
TX/RX queues. When enabled, and unless the user has explicitly set
nr_io_queues, the number of I/O queues is set to:

    min(num_online_cpus, num_nic_queues)

This behavior is opt-in and does not change existing defaults. If the
underlying net device cannot be resolved, the queue count is left
unchanged.

Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
 drivers/nvme/host/fabrics.c |   4 ++
 drivers/nvme/host/fabrics.h |   3 +
 drivers/nvme/host/tcp.c     | 120 +++++++++++++++++++++++++++++++++++-
 3 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index ac3d4f400601..62ae998825e1 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -709,6 +709,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TLS,			"tls"			},
 	{ NVMF_OPT_CONCAT,		"concat"		},
 #endif
+	{ NVMF_OPT_MATCH_HW_QUEUES,	"match_hw_queues"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -1064,6 +1065,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			}
 			opts->concat = true;
 			break;
+		case NVMF_OPT_MATCH_HW_QUEUES:
+			opts->match_hw_queues = true;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index caf5503d0833..e8e3a2672832 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -67,6 +67,7 @@ enum {
 	NVMF_OPT_KEYRING	= 1 << 26,
 	NVMF_OPT_TLS_KEY	= 1 << 27,
 	NVMF_OPT_CONCAT		= 1 << 28,
+	NVMF_OPT_MATCH_HW_QUEUES = 1 << 29,
 };
 
 /**
@@ -106,6 +107,7 @@ enum {
  * @disable_sqflow: disable controller sq flow control
  * @hdr_digest: generate/verify header digest (TCP)
  * @data_digest: generate/verify data digest (TCP)
+ * @match_hw_queues: limit controller I/O queue count based on NIC queues (TCP)
  * @nr_write_queues: number of queues for write I/O
  * @nr_poll_queues: number of queues for polling I/O
  * @tos: type of service
@@ -136,6 +138,7 @@ struct nvmf_ctrl_options {
 	bool			disable_sqflow;
 	bool			hdr_digest;
 	bool			data_digest;
+	bool			match_hw_queues;
 	unsigned int		nr_write_queues;
 	unsigned int		nr_poll_queues;
 	int			tos;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 243dab830dc8..7102a7a54d78 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -16,6 +16,8 @@
 #include <net/tls.h>
 #include <net/tls_prot.h>
 #include <net/handshake.h>
+#include <net/ip6_route.h>
+#include <linux/in6.h>
 #include <linux/blk-mq.h>
 #include <net/busy_poll.h>
 #include <trace/events/sock.h>
@@ -1762,6 +1764,103 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
 	return ret;
 }
 
+static struct net_device *nvme_tcp_get_netdev(struct nvme_ctrl *ctrl)
+{
+	struct net_device *dev = NULL;
+
+	if (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)
+		dev = dev_get_by_name(&init_net, ctrl->opts->host_iface);
+	else {
+		struct nvme_tcp_ctrl *tctrl = to_tcp_ctrl(ctrl);
+
+		if (tctrl->addr.ss_family == AF_INET) {
+			struct rtable *rt;
+			struct flowi4 fl4 = {};
+			struct sockaddr_in *addr =
+					(struct sockaddr_in *)&tctrl->addr;
+
+			fl4.daddr = addr->sin_addr.s_addr;
+			if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+				addr = (struct sockaddr_in *)&tctrl->src_addr;
+				fl4.saddr = addr->sin_addr.s_addr;
+			}
+			fl4.flowi4_proto = IPPROTO_TCP;
+
+			rt = ip_route_output_key(&init_net, &fl4);
+			if (IS_ERR(rt))
+				return NULL;
+
+			dev = dst_dev(&rt->dst);
+			/*
+			 * Get reference to netdev as ip_rt_put() will
+			 * release the netdev reference.
+			 */
+			if (dev)
+				dev_hold(dev);
+
+			ip_rt_put(rt);
+
+		} else if (tctrl->addr.ss_family == AF_INET6) {
+			struct dst_entry *dst;
+			struct flowi6 fl6 = {};
+			struct sockaddr_in6 *addr6 =
+					(struct sockaddr_in6 *)&tctrl->addr;
+
+			fl6.daddr = addr6->sin6_addr;
+			if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+				addr6 = (struct sockaddr_in6 *)&tctrl->src_addr;
+				fl6.saddr = addr6->sin6_addr;
+			}
+			fl6.flowi6_proto = IPPROTO_TCP;
+
+			dst = ip6_route_output(&init_net, NULL, &fl6);
+			if (dst->error) {
+				dst_release(dst);
+				return NULL;
+			}
+
+			dev = dst_dev(dst);
+			/*
+			 * Get reference to netdev as dst_release() will
+			 * release the netdev reference.
+			 */
+			if (dev)
+				dev_hold(dev);
+
+			dst_release(dst);
+		}
+	}
+
+	return dev;
+}
+
+static void nvme_tcp_put_netdev(struct net_device *dev)
+{
+	if (dev)
+		dev_put(dev);
+}
+
+/*
+ * Returns number of active NIC queues (min of TX/RX), or 0 if device cannot
+ * be determined.
+ */
+static int nvme_tcp_get_netdev_current_queue_count(struct nvme_ctrl *ctrl)
+{
+	struct net_device *dev;
+	int tx_queues, rx_queues;
+
+	dev = nvme_tcp_get_netdev(ctrl);
+	if (!dev)
+		return 0;
+
+	tx_queues = dev->real_num_tx_queues;
+	rx_queues = dev->real_num_rx_queues;
+
+	nvme_tcp_put_netdev(dev);
+
+	return min(tx_queues, rx_queues);
+}
+
 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 				key_serial_t pskid)
 {
@@ -2144,6 +2243,24 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
 	unsigned int nr_io_queues;
 	int ret;
 
+	if (!(ctrl->opts->mask & NVMF_OPT_NR_IO_QUEUES) &&
+			(ctrl->opts->mask & NVMF_OPT_MATCH_HW_QUEUES)) {
+		int nr_hw_queues;
+
+		nr_hw_queues = nvme_tcp_get_netdev_current_queue_count(ctrl);
+		if (nr_hw_queues <= 0)
+			goto init_queue;
+
+		ctrl->opts->nr_io_queues = min(nr_hw_queues, num_online_cpus());
+
+		if (ctrl->opts->nr_io_queues < num_online_cpus())
+			dev_info(ctrl->device,
+				"limiting I/O queues to %u (NIC queues %d, CPUs %u)\n",
+				ctrl->opts->nr_io_queues, nr_hw_queues,
+				num_online_cpus());
+	}
+
+init_queue:
 	nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
 	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
 	if (ret)
@@ -3019,7 +3136,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
 			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
 			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
-			  NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT,
+			  NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY |
+			  NVMF_OPT_CONCAT | NVMF_OPT_MATCH_HW_QUEUES,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };
 
-- 
2.53.0



  reply	other threads:[~2026-04-20 11:57 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-20 11:49 [RFC PATCH 0/4] nvme-tcp: NIC topology aware I/O queue scaling and queue info export Nilay Shroff
2026-04-20 11:49 ` Nilay Shroff [this message]
2026-04-20 11:49 ` [RFC PATCH 2/4] nvme-tcp: add a diagnostic message when NIC queues are underutilized Nilay Shroff
2026-04-20 11:49 ` [RFC PATCH 3/4] nvme: add debugfs helpers for NVMe drivers Nilay Shroff
2026-04-20 11:49 ` [RFC PATCH 4/4] nvme: expose queue information via debugfs Nilay Shroff

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260420115716.3071293-2-nilay@linux.ibm.com \
    --to=nilay@linux.ibm.com \
    --cc=chaitanyak@nvidia.com \
    --cc=gjoyce@linux.ibm.com \
    --cc=hare@suse.de \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox