From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-nvme@lists.infradead.org
Cc: kbusch@kernel.org, hch@lst.de, hare@suse.de, sagi@grimberg.me,
chaitanyak@nvidia.com, gjoyce@linux.ibm.com,
Nilay Shroff <nilay@linux.ibm.com>
Subject: [RFC PATCH 1/4] nvme-tcp: optionally limit I/O queue count based on NIC queues
Date: Mon, 20 Apr 2026 17:19:33 +0530 [thread overview]
Message-ID: <20260420115716.3071293-2-nilay@linux.ibm.com> (raw)
In-Reply-To: <20260420115716.3071293-1-nilay@linux.ibm.com>
NVMe-TCP currently provisions I/O queues primarily based on CPU
availability. On systems where the number of CPUs significantly exceeds
the number of NIC hardware queues, this can lead to multiple I/O queues
sharing the same NIC TX/RX queues, resulting in increased lock
contention, cacheline bouncing, and inter-processor interrupts (IPIs).
In such configurations, limiting the number of NVMe-TCP I/O queues to
the number of NIC hardware queues can improve performance by reducing
contention and improving locality. Aligning NVMe-TCP worker threads with
NIC queue topology may also help reduce tail latency.
Add a new transport option "match_hw_queues" to allow users to
optionally limit the number of NVMe-TCP I/O queues to the number of NIC
TX/RX queues. When enabled, the number of I/O queues is set to:
min(num_online_cpus, num_nic_queues)
This behavior is opt-in and does not change existing defaults; an
explicitly configured nr_io_queues option always takes precedence.
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
drivers/nvme/host/fabrics.c | 4 ++
drivers/nvme/host/fabrics.h | 3 +
drivers/nvme/host/tcp.c | 120 +++++++++++++++++++++++++++++++++++-
3 files changed, 126 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index ac3d4f400601..62ae998825e1 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -709,6 +709,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_TLS, "tls" },
{ NVMF_OPT_CONCAT, "concat" },
#endif
+ { NVMF_OPT_MATCH_HW_QUEUES, "match_hw_queues" },
{ NVMF_OPT_ERR, NULL }
};
@@ -1064,6 +1065,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->concat = true;
break;
+ case NVMF_OPT_MATCH_HW_QUEUES:
+ opts->match_hw_queues = true;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index caf5503d0833..e8e3a2672832 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -67,6 +67,7 @@ enum {
NVMF_OPT_KEYRING = 1 << 26,
NVMF_OPT_TLS_KEY = 1 << 27,
NVMF_OPT_CONCAT = 1 << 28,
+ NVMF_OPT_MATCH_HW_QUEUES = 1 << 29,
};
/**
@@ -106,6 +107,7 @@ enum {
* @disable_sqflow: disable controller sq flow control
* @hdr_digest: generate/verify header digest (TCP)
* @data_digest: generate/verify data digest (TCP)
+ * @match_hw_queues: limit controller IO queue count based on NIC queues (TCP)
* @nr_write_queues: number of queues for write I/O
* @nr_poll_queues: number of queues for polling I/O
* @tos: type of service
@@ -136,6 +138,7 @@ struct nvmf_ctrl_options {
bool disable_sqflow;
bool hdr_digest;
bool data_digest;
+ bool match_hw_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
int tos;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 243dab830dc8..7102a7a54d78 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -16,6 +16,8 @@
#include <net/tls.h>
#include <net/tls_prot.h>
#include <net/handshake.h>
+#include <net/ip6_route.h>
+#include <linux/in6.h>
#include <linux/blk-mq.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>
@@ -1762,6 +1764,103 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
return ret;
}
+static struct net_device *nvme_tcp_get_netdev(struct nvme_ctrl *ctrl)
+{
+ struct net_device *dev = NULL;
+
+ if (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)
+ dev = dev_get_by_name(&init_net, ctrl->opts->host_iface);
+ else {
+ struct nvme_tcp_ctrl *tctrl = to_tcp_ctrl(ctrl);
+
+ if (tctrl->addr.ss_family == AF_INET) {
+ struct rtable *rt;
+ struct flowi4 fl4 = {};
+ struct sockaddr_in *addr =
+ (struct sockaddr_in *)&tctrl->addr;
+
+ fl4.daddr = addr->sin_addr.s_addr;
+ if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+ addr = (struct sockaddr_in *)&tctrl->src_addr;
+ fl4.saddr = addr->sin_addr.s_addr;
+ }
+ fl4.flowi4_proto = IPPROTO_TCP;
+
+ rt = ip_route_output_key(&init_net, &fl4);
+ if (IS_ERR(rt))
+ return NULL;
+
+ dev = dst_dev(&rt->dst);
+ /*
+ * Get reference to netdev as ip_rt_put() will
+ * release the netdev reference.
+ */
+ if (dev)
+ dev_hold(dev);
+
+ ip_rt_put(rt);
+
+ } else if (tctrl->addr.ss_family == AF_INET6) {
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {};
+ struct sockaddr_in6 *addr6 =
+ (struct sockaddr_in6 *)&tctrl->addr;
+
+ fl6.daddr = addr6->sin6_addr;
+ if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+ addr6 = (struct sockaddr_in6 *)&tctrl->src_addr;
+ fl6.saddr = addr6->sin6_addr;
+ }
+ fl6.flowi6_proto = IPPROTO_TCP;
+
+ dst = ip6_route_output(&init_net, NULL, &fl6);
+ if (dst->error) {
+ dst_release(dst);
+ return NULL;
+ }
+
+ dev = dst_dev(dst);
+ /*
+ * Get reference to netdev as dst_release() will
+ * release the netdev reference.
+ */
+ if (dev)
+ dev_hold(dev);
+
+ dst_release(dst);
+ }
+ }
+
+ return dev;
+}
+
+static void nvme_tcp_put_netdev(struct net_device *dev)
+{
+ if (dev)
+ dev_put(dev);
+}
+
+/*
+ * Returns number of active NIC queues (min of TX/RX), or 0 if device cannot
+ * be determined.
+ */
+static int nvme_tcp_get_netdev_current_queue_count(struct nvme_ctrl *ctrl)
+{
+ struct net_device *dev;
+ int tx_queues, rx_queues;
+
+ dev = nvme_tcp_get_netdev(ctrl);
+ if (!dev)
+ return 0;
+
+ tx_queues = dev->real_num_tx_queues;
+ rx_queues = dev->real_num_rx_queues;
+
+ nvme_tcp_put_netdev(dev);
+
+ return min(tx_queues, rx_queues);
+}
+
static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
key_serial_t pskid)
{
@@ -2144,6 +2243,24 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
unsigned int nr_io_queues;
int ret;
+ if (!(ctrl->opts->mask & NVMF_OPT_NR_IO_QUEUES) &&
+ (ctrl->opts->mask & NVMF_OPT_MATCH_HW_QUEUES)) {
+ int nr_hw_queues;
+
+ nr_hw_queues = nvme_tcp_get_netdev_current_queue_count(ctrl);
+ if (nr_hw_queues <= 0)
+ goto init_queue;
+
+		ctrl->opts->nr_io_queues = min_t(unsigned int, nr_hw_queues, num_online_cpus());
+
+ if (ctrl->opts->nr_io_queues < num_online_cpus())
+ dev_info(ctrl->device,
+ "limiting I/O queues to %u (NIC queues %d, CPUs %u)\n",
+ ctrl->opts->nr_io_queues, nr_hw_queues,
+ num_online_cpus());
+ }
+
+init_queue:
nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
ret = nvme_set_queue_count(ctrl, &nr_io_queues);
if (ret)
@@ -3019,7 +3136,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
- NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT,
+ NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY |
+ NVMF_OPT_CONCAT | NVMF_OPT_MATCH_HW_QUEUES,
.create_ctrl = nvme_tcp_create_ctrl,
};
--
2.53.0
next prev parent reply other threads:[~2026-04-20 11:57 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-20 11:49 [RFC PATCH 0/4] nvme-tcp: NIC topology aware I/O queue scaling and queue info export Nilay Shroff
2026-04-20 11:49 ` Nilay Shroff [this message]
2026-04-20 11:49 ` [RFC PATCH 2/4] nvme-tcp: add a diagnostic message when NIC queues are underutilized Nilay Shroff
2026-04-20 11:49 ` [RFC PATCH 3/4] nvme: add debugfs helpers for NVMe drivers Nilay Shroff
2026-04-20 11:49 ` [RFC PATCH 4/4] nvme: expose queue information via debugfs Nilay Shroff
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260420115716.3071293-2-nilay@linux.ibm.com \
--to=nilay@linux.ibm.com \
--cc=chaitanyak@nvidia.com \
--cc=gjoyce@linux.ibm.com \
--cc=hare@suse.de \
--cc=hch@lst.de \
--cc=kbusch@kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=sagi@grimberg.me \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox