All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mohamed Khalfella <mkhalfella@purestorage.com>
To: Justin Tee <justin.tee@broadcom.com>,
	Naresh Gottumukkala <nareshgottumukkala83@gmail.com>,
	Paul Ely <paul.ely@broadcom.com>,
	Chaitanya Kulkarni <kch@nvidia.com>, Jens Axboe <axboe@kernel.dk>,
	Keith Busch <kbusch@kernel.org>, Sagi Grimberg <sagi@grimberg.me>,
	James Smart <jsmart833426@gmail.com>,
	Hannes Reinecke <hare@suse.de>
Cc: Aaron Dailey <adailey@purestorage.com>,
	Randy Jennings <randyj@purestorage.com>,
	Dhaval Giani <dgiani@purestorage.com>,
	linux-nvme@lists.infradead.org, linux-kernel@vger.kernel.org,
	Mohamed Khalfella <mkhalfella@purestorage.com>
Subject: [PATCH v4 10/15] nvme-tcp: Use CCR to recover controller that hits an error
Date: Fri, 27 Mar 2026 17:43:41 -0700	[thread overview]
Message-ID: <20260328004518.1729186-11-mkhalfella@purestorage.com> (raw)
In-Reply-To: <20260328004518.1729186-1-mkhalfella@purestorage.com>

A live nvme controller that hits an error now moves to the FENCING
state instead of the RESETTING state. ctrl->fencing_work attempts CCR to
terminate inflight IOs. Regardless of whether the CCR operation succeeds
or fails, the controller is then transitioned to the RESETTING state to
continue the error recovery process.

Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
 drivers/nvme/host/tcp.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 243dab830dc8..6393ec2b3b55 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -194,6 +194,7 @@ struct nvme_tcp_ctrl {
 	struct sockaddr_storage src_addr;
 	struct nvme_ctrl	ctrl;
 
+	struct work_struct	fencing_work;
 	struct work_struct	err_work;
 	struct delayed_work	connect_work;
 	struct nvme_tcp_request async_req;
@@ -612,6 +613,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
 
 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 {
+	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCING)) {
+		dev_warn(ctrl->device, "starting controller fencing\n");
+		queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->fencing_work);
+		return;
+	}
+
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 		return;
 
@@ -2471,12 +2478,29 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
 	nvme_tcp_reconnect_or_remove(ctrl, ret);
 }
 
+static void nvme_tcp_fencing_work(struct work_struct *work)
+{
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+			struct nvme_tcp_ctrl, fencing_work);
+	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+	int ret;
+
+	ret = nvme_fence_ctrl(ctrl);
+	if (ret)
+		dev_info(ctrl->device, "CCR failed with error %d\n", ret);
+	/* A CCR failure is only logged; go FENCED -> RESETTING either way. */
+	nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+		queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
+}
+
 static void nvme_tcp_error_recovery_work(struct work_struct *work)
 {
 	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
 				struct nvme_tcp_ctrl, err_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
 
+	flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
 	if (nvme_tcp_key_revoke_needed(ctrl))
 		nvme_auth_revoke_tls_key(ctrl);
 	nvme_stop_keep_alive(ctrl);
@@ -2519,6 +2543,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
 		container_of(work, struct nvme_ctrl, reset_work);
 	int ret;
 
+	flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
 	if (nvme_tcp_key_revoke_needed(ctrl))
 		nvme_auth_revoke_tls_key(ctrl);
 	nvme_stop_ctrl(ctrl);
@@ -2644,13 +2669,15 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
 	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
 	struct nvme_command *cmd = &pdu->cmd;
 	int qid = nvme_tcp_queue_id(req->queue);
+	enum nvme_ctrl_state state;
 
 	dev_warn(ctrl->device,
 		 "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
 		 rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
 		 nvme_fabrics_opcode_str(qid, cmd), qid);
 
-	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
+	state = nvme_ctrl_state(ctrl);
+	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_FENCING) {
 		/*
 		 * If we are resetting, connecting or deleting we should
 		 * complete immediately because we may block controller
@@ -2905,6 +2932,7 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
 
 	INIT_DELAYED_WORK(&ctrl->connect_work,
 			nvme_tcp_reconnect_ctrl_work);
+	INIT_WORK(&ctrl->fencing_work, nvme_tcp_fencing_work);
 	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
 	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
 
-- 
2.52.0



  parent reply	other threads:[~2026-03-28  0:46 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-28  0:43 [PATCH v4 00/15] TP8028 Rapid Path Failure Recovery Mohamed Khalfella
2026-03-28  0:43 ` [PATCH v4 01/15] nvmet: Rapid Path Failure Recovery set controller identify fields Mohamed Khalfella
2026-03-30 10:37   ` Hannes Reinecke
2026-05-15  2:08   ` Randy Jennings
2026-03-28  0:43 ` [PATCH v4 02/15] nvmet/debugfs: Export controller CIU and CIRN via debugfs Mohamed Khalfella
2026-05-14 23:42   ` Randy Jennings
2026-03-28  0:43 ` [PATCH v4 03/15] nvmet: Implement CCR nvme command Mohamed Khalfella
2026-03-30 10:45   ` Hannes Reinecke
2026-03-31 16:38     ` Mohamed Khalfella
2026-04-07  5:40       ` Hannes Reinecke
2026-05-15  0:18   ` Randy Jennings
2026-03-28  0:43 ` [PATCH v4 04/15] nvmet: Implement CCR logpage Mohamed Khalfella
2026-05-15  0:38   ` Randy Jennings
2026-03-28  0:43 ` [PATCH v4 05/15] nvmet: Send an AEN on CCR completion Mohamed Khalfella
2026-05-15  0:50   ` Randy Jennings
2026-03-28  0:43 ` [PATCH v4 06/15] nvme: Rapid Path Failure Recovery read controller identify fields Mohamed Khalfella
2026-05-15  2:03   ` Randy Jennings
2026-03-28  0:43 ` [PATCH v4 07/15] nvme: Introduce FENCING and FENCED controller states Mohamed Khalfella
2026-03-30 10:46   ` Hannes Reinecke
2026-05-15  2:06   ` Randy Jennings
2026-03-28  0:43 ` [PATCH v4 08/15] nvme: Implement cross-controller reset recovery Mohamed Khalfella
2026-03-30 10:50   ` Hannes Reinecke
2026-03-31 16:47     ` Mohamed Khalfella
2026-04-07  5:39       ` Hannes Reinecke
2026-04-07 20:46         ` Mohamed Khalfella
2026-04-13 15:25           ` Randy Jennings
2026-04-13 16:33             ` Mohamed Khalfella
2026-04-24 23:07   ` Randy Jennings
2026-03-28  0:43 ` [PATCH v4 09/15] nvme: Implement cross-controller reset completion Mohamed Khalfella
2026-03-30 10:53   ` Hannes Reinecke
2026-03-31 16:55     ` Mohamed Khalfella
2026-04-07  5:48       ` Hannes Reinecke
2026-04-07 19:09         ` Mohamed Khalfella
2026-03-28  0:43 ` Mohamed Khalfella [this message]
2026-03-30 11:00   ` [PATCH v4 10/15] nvme-tcp: Use CCR to recover controller that hits an error Hannes Reinecke
2026-03-28  0:43 ` [PATCH v4 11/15] nvme-rdma: " Mohamed Khalfella
2026-03-28  0:43 ` [PATCH v4 12/15] nvme-fc: Refactor IO error recovery Mohamed Khalfella
2026-03-28  0:43 ` [PATCH v4 13/15] nvme-fc: Use CCR to recover controller that hits an error Mohamed Khalfella
2026-03-28  0:43 ` [PATCH v4 14/15] nvme-fc: Hold inflight requests while in FENCING state Mohamed Khalfella
2026-03-28  0:43 ` [PATCH v4 15/15] nvme-fc: Do not cancel requests in io taget before it is initialized Mohamed Khalfella
2026-05-12 21:40 ` [PATCH v4 00/15] TP8028 Rapid Path Failure Recovery Mohamed Khalfella
2026-05-12 22:02   ` Sagi Grimberg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260328004518.1729186-11-mkhalfella@purestorage.com \
    --to=mkhalfella@purestorage.com \
    --cc=adailey@purestorage.com \
    --cc=axboe@kernel.dk \
    --cc=dgiani@purestorage.com \
    --cc=hare@suse.de \
    --cc=jsmart833426@gmail.com \
    --cc=justin.tee@broadcom.com \
    --cc=kbusch@kernel.org \
    --cc=kch@nvidia.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=nareshgottumukkala83@gmail.com \
    --cc=paul.ely@broadcom.com \
    --cc=randyj@purestorage.com \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.