public inbox for linux-nvme@lists.infradead.org
 help / color / mirror / Atom feed
From: Michael Kelley <mikelley@microsoft.com>
To: kbusch@kernel.org, axboe@fb.com, hch@lst.de, sagi@grimberg.me,
	linux-nvme@lists.infradead.org, linux-kernel@vger.kernel.org
Cc: mikelley@microsoft.com, caroline.subramoney@microsoft.com,
	riwurd@microsoft.com, nathan.obr@microsoft.com
Subject: [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller
Date: Tue, 31 May 2022 21:12:27 -0700	[thread overview]
Message-ID: <1654056747-40143-2-git-send-email-mikelley@microsoft.com> (raw)
In-Reply-To: <1654056747-40143-1-git-send-email-mikelley@microsoft.com>

In the NVM Express Revision 1.4 spec, Figure 145 describes possible
values for an Asynchronous Event Request (AER) completion with event
type "Error" (value 000b). For a Persistent Internal Error (event
information value 03h), the host should perform a controller reset.

Add support for this error using code that already exists for
doing a controller reset in response to a request timeout.

This new support was tested in a lab environment where we can
generate the persistent internal error on demand and observe
both the Linux side and the NVMe controller side to confirm that
the controller reset is performed.

Signed-off-by: Michael Kelley <mikelley@microsoft.com>
---

 drivers/nvme/host/pci.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h    |  4 ++++
 2 files changed, 41 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4dd87ac..b2140e9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -131,6 +131,7 @@ struct nvme_dev {
 	void __iomem *bar;
 	unsigned long bar_mapped_size;
 	struct work_struct remove_work;
+	struct work_struct persistent_err_work;
 	struct mutex shutdown_lock;
 	bool subsystem;
 	u64 cmb_size;
@@ -1119,6 +1120,39 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_persistent_err_work(struct work_struct *work)
+{
+	struct nvme_dev *dev = container_of(work, struct nvme_dev,
+						persistent_err_work);
+
+	nvme_dev_disable(dev, false);
+	nvme_reset_ctrl(&dev->ctrl);
+}
+
+static bool nvme_check_aen_error(struct nvme_dev *dev,
+			__le16 status, volatile union nvme_result *res)
+{
+	u32 result = le32_to_cpu(res->u32);
+	u32 csts;
+
+	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
+		return false;
+
+	/* Currently only handle Persistent Internal Error */
+	if ((result & 0x07) != NVME_AER_ERROR ||
+	    ((result & 0xff00) >> 8) != NVME_AER_ERROR_PERSIST_INT_ERR)
+		return false;
+
+	/* NVMe Spec 1.4 says to reset the controller */
+	csts = readl(dev->bar + NVME_REG_CSTS);
+	if (!nvme_should_reset(dev, csts))
+		return false;
+
+	nvme_warn_reset(dev, csts);
+	queue_work(nvme_wq, &dev->persistent_err_work);
+	return true;
+}
+
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 				   struct io_comp_batch *iob, u16 idx)
 {
@@ -1133,6 +1167,8 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	 * for them but rather special case them here.
 	 */
 	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
+		if (nvme_check_aen_error(nvmeq->dev, cqe->status, &cqe->result))
+			return;
 		nvme_complete_async_event(&nvmeq->dev->ctrl,
 				cqe->status, &cqe->result);
 		return;
@@ -3085,6 +3121,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+	INIT_WORK(&dev->persistent_err_work, nvme_persistent_err_work);
 	mutex_init(&dev->shutdown_lock);
 
 	result = nvme_setup_prp_pools(dev);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 29ec3e3..8ced243 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -712,6 +712,10 @@ enum {
 };
 
 enum {
+	NVME_AER_ERROR_PERSIST_INT_ERR	= 0x03,
+};
+
+enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
 	NVME_AER_NOTICE_ANA		= 0x03,
-- 
1.8.3.1



  reply	other threads:[~2022-06-01  4:13 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-01  4:12 [PATCH 1/2] nvme-pci: Move two functions to avoid forward reference Michael Kelley
2022-06-01  4:12 ` Michael Kelley [this message]
2022-06-01  7:35   ` [PATCH 2/2] nvme-pci: handle persistent internal error AER from NVMe controller Christoph Hellwig
2022-06-01 15:56     ` Michael Kelley (LINUX)
2022-06-01 17:08       ` Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1654056747-40143-2-git-send-email-mikelley@microsoft.com \
    --to=mikelley@microsoft.com \
    --cc=axboe@fb.com \
    --cc=caroline.subramoney@microsoft.com \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=nathan.obr@microsoft.com \
    --cc=riwurd@microsoft.com \
    --cc=sagi@grimberg.me \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox