public inbox for linux-nvme@lists.infradead.org
 help / color / mirror / Atom feed
From: Mohamed Khalfella <mkhalfella@purestorage.com>
To: Chaitanya Kulkarni <kch@nvidia.com>,
	Christoph Hellwig <hch@lst.de>, Jens Axboe <axboe@kernel.dk>,
	Keith Busch <kbusch@kernel.org>, Sagi Grimberg <sagi@grimberg.me>
Cc: Aaron Dailey <adailey@purestorage.com>,
	Randy Jennings <randyj@purestorage.com>,
	John Meneghini <jmeneghi@redhat.com>,
	Hannes Reinecke <hare@suse.de>,
	linux-nvme@lists.infradead.org, linux-kernel@vger.kernel.org,
	Mohamed Khalfella <mkhalfella@purestorage.com>
Subject: [RFC PATCH 08/14] nvme: Implement cross-controller reset recovery
Date: Tue, 25 Nov 2025 18:11:55 -0800	[thread overview]
Message-ID: <20251126021250.2583630-9-mkhalfella@purestorage.com> (raw)
In-Reply-To: <20251126021250.2583630-1-mkhalfella@purestorage.com>

A host that has more than one path to an NVMe subsystem typically has an
NVMe controller associated with every path. This mostly applies to
NVMe-oF. If one path goes down, inflight IOs on that path should not be
retried immediately on another path, because this could lead to data
corruption as described in TP4129. TP8028 defines a cross-controller
reset (CCR) mechanism that the host can use to terminate IOs on the
failed path through one of the remaining healthy paths. Inflight IOs
should be retried on another path only after they have been terminated,
or after enough time has passed as defined by TP4129. Implement the core
cross-controller reset logic shared by the transports.

Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
 drivers/nvme/host/constants.c |   1 +
 drivers/nvme/host/core.c      | 133 ++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h      |  10 +++
 3 files changed, 144 insertions(+)

diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index dc90df9e13a2..f679efd5110e 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -46,6 +46,7 @@ static const char * const nvme_admin_ops[] = {
 	[nvme_admin_virtual_mgmt] = "Virtual Management",
 	[nvme_admin_nvme_mi_send] = "NVMe Send MI",
 	[nvme_admin_nvme_mi_recv] = "NVMe Receive MI",
+	[nvme_admin_cross_ctrl_reset] = "Cross Controller Reset",
 	[nvme_admin_dbbuf] = "Doorbell Buffer Config",
 	[nvme_admin_format_nvm] = "Format NVM",
 	[nvme_admin_security_send] = "Security Send",
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f5b84bc327d3..f38b70ca9cee 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -554,6 +554,138 @@ void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
 
+/*
+ * Return a LIVE controller from ictrl->subsys with cntlid >= min_cntlid, or
+ * NULL. On success the caller owns a ccr_limit slot and ctrl/admin_q refs.
+ */
+static struct nvme_ctrl *nvme_find_ccr_ctrl(struct nvme_ctrl *ictrl,
+					    u32 min_cntlid)
+{
+	struct nvme_subsystem *subsys = ictrl->subsys;
+	struct nvme_ctrl *sctrl;
+	unsigned long flags;
+	bool pinned;
+
+	mutex_lock(&nvme_subsystems_lock);
+	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
+		if (sctrl->cntlid < min_cntlid ||
+		    atomic_dec_if_positive(&sctrl->ccr_limit) < 0)
+			continue;
+
+		/*
+		 * Pin sctrl and its admin_q under sctrl->lock while it is LIVE;
+		 * skip it if blk_get_queue() refuses, so that the caller's
+		 * blk_put_queue() always has a matching get.
+		 */
+		spin_lock_irqsave(&sctrl->lock, flags);
+		pinned = sctrl->state == NVME_CTRL_LIVE &&
+			 blk_get_queue(sctrl->admin_q);
+		if (pinned)
+			nvme_get_ctrl(sctrl);
+		spin_unlock_irqrestore(&sctrl->lock, flags);
+		if (pinned)
+			goto found;
+		atomic_inc(&sctrl->ccr_limit);
+	}
+	sctrl = NULL;
+found:
+	mutex_unlock(&nvme_subsystems_lock);
+	return sctrl;
+}
+
+/*
+ * Issue a Cross-Controller Reset for ictrl through sctrl and wait for it.
+ * Returns the submit status on failure, -EAGAIN if the wait times out and 0
+ * otherwise; a final ccrs value of 1 always yields 0.
+ */
+static int nvme_issue_wait_ccr(struct nvme_ctrl *sctrl, struct nvme_ctrl *ictrl)
+{
+	struct nvme_ccr_entry entry = { };
+	union nvme_result res = { 0 };
+	struct nvme_command cmd = { };
+	unsigned long flags, tmo;
+	int status;
+
+	init_completion(&entry.complete);
+	entry.ictrl = ictrl;
+
+	spin_lock_irqsave(&sctrl->lock, flags);
+	list_add_tail(&entry.list, &sctrl->ccrs);
+	spin_unlock_irqrestore(&sctrl->lock, flags);
+
+	cmd.ccr.opcode = nvme_admin_cross_ctrl_reset;
+	cmd.ccr.ciu = ictrl->ciu;
+	cmd.ccr.icid = cpu_to_le16(ictrl->cntlid);
+	cmd.ccr.cirn = cpu_to_le64(ictrl->cirn);
+	status = __nvme_submit_sync_cmd(sctrl->admin_q, &cmd, &res,
+					NULL, 0, NVME_QID_ANY, 0);
+	/* Bit 0 of the command result reports an immediate reset. */
+	if (!status && !(le32_to_cpu(res.u32) & 0x01)) {
+		tmo = msecs_to_jiffies(max(ictrl->cqt, ictrl->kato * 1000));
+		if (!wait_for_completion_timeout(&entry.complete, tmo))
+			status = -EAGAIN;
+	}
+
+	/* entry lives on this stack frame; unlink it before returning. */
+	spin_lock_irqsave(&sctrl->lock, flags);
+	list_del(&entry.list);
+	spin_unlock_irqrestore(&sctrl->lock, flags);
+	return entry.ccrs == 1 ? 0 : status;
+}
+
+/*
+ * Try to reset ictrl with CCR issued from other controllers of its subsystem.
+ * Returns 0 once CCR succeeds or the recovery timeout has elapsed; otherwise
+ * the jiffies left until the deadline, for time-based recovery.
+ */
+unsigned long nvme_recover_ctrl(struct nvme_ctrl *ictrl)
+{
+	unsigned long timeout_ms = nvme_recovery_timeout_ms(ictrl);
+	unsigned long deadline, now;
+	struct nvme_ctrl *sctrl;
+	u32 min_cntlid = 0;
+	int ret;
+
+	dev_info(ictrl->device, "attempting CCR, timeout %lums\n", timeout_ms);
+
+	now = jiffies;
+	deadline = now + msecs_to_jiffies(timeout_ms);
+	while (time_before(now, deadline)) {
+		sctrl = nvme_find_ccr_ctrl(ictrl, min_cntlid);
+		if (!sctrl)	/* no usable source: time-based recovery */
+			return deadline - now;
+
+		ret = nvme_issue_wait_ccr(sctrl, ictrl);
+		atomic_inc(&sctrl->ccr_limit);
+		if (!ret)
+			dev_info(ictrl->device, "CCR succeeded using %s\n",
+				 dev_name(sctrl->device));
+
+		/* A retry, if any, starts past this controller. */
+		min_cntlid = sctrl->cntlid + 1;
+		blk_put_queue(sctrl->admin_q);
+		nvme_put_ctrl(sctrl);
+		if (!ret)
+			return 0;
+		now = jiffies;
+	}
+
+	dev_info(ictrl->device, "CCR reached timeout, call it done\n");
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_recover_ctrl);
+
+void nvme_end_ctrl_recovery(struct nvme_ctrl *ctrl)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+	WRITE_ONCE(ctrl->state, NVME_CTRL_RESETTING);	/* unconditional: no old-state check here */
+	wake_up_all(&ctrl->state_wq);			/* wake all state_wq waiters */
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+}
+EXPORT_SYMBOL_GPL(nvme_end_ctrl_recovery);
+
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state)
 {
@@ -5108,6 +5240,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
+	INIT_LIST_HEAD(&ctrl->ccrs);
 	xa_init(&ctrl->cels);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cde427353e0a..1f8937fce9a7 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -279,6 +279,13 @@ enum nvme_ctrl_flags {
 	NVME_CTRL_RECOVERED		= 7,
 };
 
+struct nvme_ccr_entry {
+	struct list_head list;		/* linked on the issuing ctrl's ccrs list */
+	struct completion complete;	/* waited on by nvme_issue_wait_ccr() */
+	struct nvme_ctrl *ictrl;	/* controller the CCR command targets */
+	u8 ccrs;			/* 1 is treated as success by the issuer */
+};
+
 struct nvme_ctrl {
 	bool comp_seen;
 	bool identified;
@@ -296,6 +303,7 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
+	struct list_head ccrs;
 	struct mutex namespaces_lock;
 	struct srcu_struct srcu;
 	struct device ctrl_device;
@@ -805,6 +813,8 @@ blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
 void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
+unsigned long nvme_recover_ctrl(struct nvme_ctrl *ctrl);
+void nvme_end_ctrl_recovery(struct nvme_ctrl *ctrl);
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state);
 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown);
-- 
2.51.2



  parent reply	other threads:[~2025-11-26  2:13 UTC|newest]

Thread overview: 68+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-26  2:11 [RFC PATCH 00/14] TP8028 Rapid Path Failure Recovery Mohamed Khalfella
2025-11-26  2:11 ` [RFC PATCH 01/14] nvmet: Rapid Path Failure Recovery set controller identify fields Mohamed Khalfella
2025-12-16  1:35   ` Randy Jennings
2025-11-26  2:11 ` [RFC PATCH 02/14] nvmet/debugfs: Add ctrl uniquifier and random values Mohamed Khalfella
2025-12-16  1:43   ` Randy Jennings
2025-11-26  2:11 ` [RFC PATCH 03/14] nvmet: Implement CCR nvme command Mohamed Khalfella
2025-12-16  3:01   ` Randy Jennings
2025-12-31 21:14     ` Mohamed Khalfella
2025-12-25 13:14   ` Sagi Grimberg
2025-12-25 17:33     ` Mohamed Khalfella
2025-12-27  9:39       ` Sagi Grimberg
2025-12-31 21:35         ` Mohamed Khalfella
2025-11-26  2:11 ` [RFC PATCH 04/14] nvmet: Implement CCR logpage Mohamed Khalfella
2025-12-16  3:11   ` Randy Jennings
2025-11-26  2:11 ` [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion Mohamed Khalfella
2025-12-16  3:31   ` Randy Jennings
2025-12-25 13:23   ` Sagi Grimberg
2025-12-25 18:13     ` Mohamed Khalfella
2025-12-27  9:48       ` Sagi Grimberg
2025-12-31 22:00         ` Mohamed Khalfella
2026-01-04 21:09           ` Sagi Grimberg
2026-01-07  2:58             ` Randy Jennings
2026-01-30 22:31             ` Mohamed Khalfella
2025-11-26  2:11 ` [RFC PATCH 06/14] nvme: Rapid Path Failure Recovery read controller identify fields Mohamed Khalfella
2025-12-18 15:22   ` Randy Jennings
2025-12-31 22:26     ` Mohamed Khalfella
2026-01-02 19:06       ` Mohamed Khalfella
2025-11-26  2:11 ` [RFC PATCH 07/14] nvme: Add RECOVERING nvme controller state Mohamed Khalfella
2025-12-18 23:18   ` Randy Jennings
2025-12-19  1:39     ` Randy Jennings
2025-12-25 13:29   ` Sagi Grimberg
2025-12-25 17:17     ` Mohamed Khalfella
2025-12-27  9:52       ` Sagi Grimberg
2025-12-31 22:45         ` Mohamed Khalfella
2025-12-27  9:55       ` Sagi Grimberg
2025-12-31 22:36         ` Mohamed Khalfella
2025-12-31 23:04           ` Mohamed Khalfella
2025-11-26  2:11 ` Mohamed Khalfella [this message]
2025-12-19  1:21   ` [RFC PATCH 08/14] nvme: Implement cross-controller reset recovery Randy Jennings
2025-12-27 10:14   ` Sagi Grimberg
2025-12-31  0:04     ` Randy Jennings
2026-01-04 21:14       ` Sagi Grimberg
2026-01-07  3:16         ` Randy Jennings
2025-12-31 23:43     ` Mohamed Khalfella
2026-01-04 21:39       ` Sagi Grimberg
2026-01-30 22:01         ` Mohamed Khalfella
2025-11-26  2:11 ` [RFC PATCH 09/14] nvme: Implement cross-controller reset completion Mohamed Khalfella
2025-12-19  1:31   ` Randy Jennings
2025-12-27 10:24   ` Sagi Grimberg
2025-12-31 23:51     ` Mohamed Khalfella
2026-01-04 21:15       ` Sagi Grimberg
2026-01-30 22:32         ` Mohamed Khalfella
2025-11-26  2:11 ` [RFC PATCH 10/14] nvme-tcp: Use CCR to recover controller that hits an error Mohamed Khalfella
2025-12-19  2:06   ` Randy Jennings
2026-01-01  0:04     ` Mohamed Khalfella
2025-12-27 10:35   ` Sagi Grimberg
2025-12-31  0:13     ` Randy Jennings
2026-01-04 21:19       ` Sagi Grimberg
2026-01-01  0:27     ` Mohamed Khalfella
2025-11-26  2:11 ` [RFC PATCH 11/14] nvme-rdma: " Mohamed Khalfella
2025-12-19  2:16   ` Randy Jennings
2025-12-27 10:36   ` Sagi Grimberg
2025-11-26  2:11 ` [RFC PATCH 12/14] nvme-fc: Decouple error recovery from controller reset Mohamed Khalfella
2025-12-19  2:59   ` Randy Jennings
2025-11-26  2:12 ` [RFC PATCH 13/14] nvme-fc: Use CCR to recover controller that hits an error Mohamed Khalfella
2025-12-20  1:21   ` Randy Jennings
2025-11-26  2:12 ` [RFC PATCH 14/14] nvme-fc: Hold inflight requests while in RECOVERING state Mohamed Khalfella
2025-12-20  1:44   ` Randy Jennings

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251126021250.2583630-9-mkhalfella@purestorage.com \
    --to=mkhalfella@purestorage.com \
    --cc=adailey@purestorage.com \
    --cc=axboe@kernel.dk \
    --cc=hare@suse.de \
    --cc=hch@lst.de \
    --cc=jmeneghi@redhat.com \
    --cc=kbusch@kernel.org \
    --cc=kch@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=randyj@purestorage.com \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox