From: ming.lei@redhat.com (Ming Lei)
Subject: [PATCH V4 7/7] nvme: pci: support nested EH
Date: Sat,  5 May 2018 21:59:05 +0800	[thread overview]
Message-ID: <20180505135905.18815-8-ming.lei@redhat.com> (raw)
In-Reply-To: <20180505135905.18815-1-ming.lei@redhat.com>

When a request times out, nvme_timeout() currently handles it in the
following way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED;

There are several issues with this approach:

1) IO may fail during reset

An admin IO timeout may be triggered in nvme_reset_dev() when an error
happens, and a normal IO timeout may also be triggered during
nvme_wait_freeze() in the reset path. When either kind of timeout happens,
the current reset mechanism can't make progress any more.

2) race between nvme_start_freeze(), nvme_wait_freeze() and nvme_unfreeze()

Both nvme_dev_disable() and the controller reset are required to recover
the controller, but the two run from different contexts: nvme_start_freeze()
is called from nvme_dev_disable(), which runs in timeout work context,
while nvme_unfreeze() runs in reset work context. Unfortunately a timeout
may be triggered while the controller is being reset, so nvme_start_freeze()
may run several times. Two reset works may also run back to back, which can
make nvme_wait_freeze() hang forever (the freeze imbalance is sketched after
this list).

3) every namespace's EH requires shutting down & resetting the controller

The block layer's timeout handler is per request queue, which means each
namespace's error handling may shut down & reset the whole controller;
the shutdown triggered from one namespace may then quiesce queues while
a reset triggered from another namespace is still in progress.
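
How the freeze calls get out of balance can be pictured with a tiny counter
model. This is only a toy sketch, not the real blk-mq code: it relies only
on the fact that blk_freeze_queue_start() and blk_mq_unfreeze_queue()
maintain a per-queue freeze depth; the actual hang in nvme_wait_freeze()
also involves the queue usage counter, which the toy omits.

	/* toy model of the per-queue freeze depth (q->mq_freeze_depth) */
	#include <stdio.h>

	static int freeze_depth;

	static void start_freeze(void)	/* stands in for nvme_start_freeze() */
	{
		freeze_depth++;
	}

	static void unfreeze(void)	/* stands in for nvme_unfreeze() */
	{
		if (freeze_depth)
			freeze_depth--;
	}

	int main(void)
	{
		start_freeze();	/* 1st timeout -> nvme_dev_disable() */
		start_freeze();	/* 2nd timeout during reset -> nvme_dev_disable() again */
		unfreeze();	/* only one reset work completes and unfreezes */

		/* prints 1: the queue is left frozen and IO stays blocked */
		printf("freeze depth after recovery: %d\n", freeze_depth);
		return 0;
	}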

This patch fixes the above issues by using nested EH:

1) run controller shutdown (nvme_dev_disable()) and reset (nvme_reset_dev())
from one and the same EH context

2) always start a new context for handling EH, and cancel all in-flight
requests (including the timed-out ones) in nvme_dev_disable() by quiescing
timeout events before shutting down the controller

3) limit the maximum number of nested EH invocations; when that limit is
reached, fail the controller by marking its state as DELETING and failing
all in-flight requests. This approach for failing the controller is taken
from Keith's previous patch.

With this approach, blktests block/011 passes. The overall flow added by
this patch is sketched below.
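
For readers skimming the diff, the following is a rough, self-contained
sketch of the control flow this patch adds. The names mirror the functions
in the patch, but locking, the admin queue, error handling and the
workqueue indirection (in the patch, everything after the bookkeeping runs
asynchronously from nvme_reset_wq) are all omitted.

	#include <stdbool.h>
	#include <stdio.h>

	#define NVME_MAX_NESTED_EH	32

	static unsigned int nested_eh;		/* dev->nested_eh */
	static bool shutdown_started;		/* dev->ctrl_shutdown_started */

	static void dev_disable(void)		/* nvme_dev_disable(dev, false, true) */
	{
		puts("EH: quiesce timeouts, shutdown controller, cancel in-flight requests");
	}

	static void reset_dev(void)		/* nvme_reset_dev(dev, true) */
	{
		puts("EH: re-enable controller, unfreeze queues");
	}

	static void fail_ctrl(void)		/* nvme_eh_fail_ctrl_work() */
	{
		puts("EH: mark ctrl DELETING, fail all in-flight requests");
	}

	/* nvme_eh_schedule() + nvme_eh_work() collapsed into one function */
	static void eh_schedule(void)
	{
		if (shutdown_started)		/* an EH context is already shutting down */
			return;
		if (++nested_eh >= NVME_MAX_NESTED_EH) {
			fail_ctrl();		/* give up after too many nested EHs */
			return;
		}
		shutdown_started = true;
		dev_disable();			/* shutdown ... */
		shutdown_started = false;
		reset_dev();			/* ... and reset from the same EH context */
		nested_eh--;
	}

	/* nvme_timeout() no longer completes the timed-out request itself */
	static int timeout_handler(void)
	{
		eh_schedule();
		return 0;	/* i.e. BLK_EH_RESET_TIMER; dev_disable() cancels the request */
	}

	int main(void)
	{
		timeout_handler();
		return 0;
	}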

Cc: Jianchao Wang <jianchao.w.wang at oracle.com>
Cc: Christoph Hellwig <hch at lst.de>
Cc: Sagi Grimberg <sagi at grimberg.me>
Cc: linux-nvme at lists.infradead.org
Cc: Laurence Oberman <loberman at redhat.com>
Signed-off-by: Ming Lei <ming.lei at redhat.com>
---
 drivers/nvme/host/core.c |  26 ++++++++
 drivers/nvme/host/nvme.h |   2 +
 drivers/nvme/host/pci.c  | 161 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 173 insertions(+), 16 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3aaee4dbf58e..d9a62e2cc33e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -254,6 +254,8 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
 void nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
+	struct nvme_ctrl *ctrl = data;
+
 	if (!blk_mq_request_started(req))
 		return;
 
@@ -261,6 +263,8 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
 				"Cancelling I/O %d", req->tag);
 
 	nvme_req(req)->status = NVME_SC_ABORT_REQ;
+	if (ctrl->state == NVME_CTRL_DELETING)
+		nvme_req(req)->status |= NVME_SC_DNR;
 	blk_mq_complete_request(req);
 
 }
@@ -3583,6 +3587,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 99f55c6f69f8..32f76cc8bb65 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -405,6 +405,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2fbe24274ad0..105d02fcac2d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -71,6 +71,7 @@ struct nvme_queue;
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 		freeze_queue);
+static void nvme_reset_dev(struct nvme_dev *dev, bool update_state);
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -113,6 +114,20 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+
+	/* EH handler */
+	spinlock_t	eh_lock;
+	bool		ctrl_shutdown_started;
+	bool		ctrl_failed;
+	unsigned int	nested_eh;
+	struct work_struct fail_ctrl_work;
+};
+
+#define  NVME_MAX_NESTED_EH	32
+struct nvme_eh_work {
+	struct work_struct	work;
+	struct nvme_dev		*dev;
+	int			seq;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1177,6 +1192,93 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_eh_fail_ctrl_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, fail_ctrl_work);
+
+	dev_info(dev->ctrl.device, "EH: fail controller\n");
+	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+	nvme_dev_disable(dev, false, true);
+}
+
+static void nvme_eh_mark_ctrl_shutdown(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	dev->ctrl_shutdown_started = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_done(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	dev->nested_eh--;
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_work(struct work_struct *work)
+{
+	struct nvme_eh_work *eh_work =
+		container_of(work, struct nvme_eh_work, work);
+	struct nvme_dev *dev = eh_work->dev;
+
+	dev_info(dev->ctrl.device, "EH %d: before shutdown\n",
+			eh_work->seq);
+	nvme_dev_disable(dev, false, true);
+	nvme_eh_mark_ctrl_shutdown(dev);
+
+	dev_info(dev->ctrl.device, "EH %d: after shutdown\n",
+			eh_work->seq);
+
+	nvme_reset_dev(dev, true);
+	nvme_eh_done(dev);
+	dev_info(dev->ctrl.device, "EH %d: after recovery\n",
+			eh_work->seq);
+
+	kfree(eh_work);
+}
+
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	bool need_sched = false;
+	bool fail_ctrl = false;
+	struct nvme_eh_work *eh_work;
+	int seq;
+
+	spin_lock(&dev->eh_lock);
+	if (!dev->ctrl_shutdown_started) {
+		need_sched = true;
+		seq = dev->nested_eh;
+		if (++dev->nested_eh >= NVME_MAX_NESTED_EH) {
+			if (!dev->ctrl_failed)
+				dev->ctrl_failed = fail_ctrl = true;
+			else
+				need_sched = false;
+		} else
+			dev->ctrl_shutdown_started = true;
+	}
+	spin_unlock(&dev->eh_lock);
+
+	if (!need_sched)
+		return;
+
+	if (fail_ctrl) {
+ fail_ctrl:
+		INIT_WORK(&dev->fail_ctrl_work, nvme_eh_fail_ctrl_work);
+		queue_work(nvme_reset_wq, &dev->fail_ctrl_work);
+		return;
+	}
+
+	eh_work = kzalloc(sizeof(*eh_work), GFP_NOIO);
+	if (!eh_work)
+		goto fail_ctrl;
+
+	eh_work->dev = dev;
+	eh_work->seq = seq;
+	INIT_WORK(&eh_work->work, nvme_eh_work);
+	queue_work(nvme_reset_wq, &eh_work->work);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1198,9 +1300,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 */
 	if (nvme_should_reset(dev, csts)) {
 		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	/*
@@ -1225,9 +1326,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	default:
 		break;
 	}
@@ -1241,15 +1342,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-
 		/*
 		 * Mark the request as handled, since the inline shutdown
 		 * forces all outstanding requests to complete.
 		 */
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -2301,12 +2400,26 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 	}
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
+	/*
+	 * safe to sync timeout after queues are quiesced, then all
+	 * requests(include the time-out ones) will be canceled.
+	 */
+	nvme_quiesce_timeout(&dev->ctrl);
+	blk_quiesce_timeout(dev->ctrl.admin_q);
 
 	nvme_pci_disable(dev);
 
+	/*
+	 * Both timeout and interrupt handler have been drained, and all
+	 * in-flight requests will be canceled now.
+	 */
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
 	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
 
+	/* all requests have been canceled now, so enable timeout now */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	blk_unquiesce_timeout(dev->ctrl.admin_q);
+
 	/*
 	 * The driver will not be starting up queues again if shutting down so
 	 * must flush all entered requests to their failed completion to avoid
@@ -2365,7 +2478,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
 		nvme_put_ctrl(&dev->ctrl);
 }
 
-static void nvme_reset_dev(struct nvme_dev *dev)
+static void nvme_reset_dev(struct nvme_dev *dev, bool update_state)
 {
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
 	int result = -ENODEV;
@@ -2373,7 +2486,19 @@ static void nvme_reset_dev(struct nvme_dev *dev)
 
 	mutex_lock(&dev->ctrl.reset_lock);
 
-	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
+	if (update_state) {
+		if (dev->ctrl.state != NVME_CTRL_RESETTING &&
+		    dev->ctrl.state != NVME_CTRL_CONNECTING) {
+		    if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+			dev_warn(dev->ctrl.device, "failed to change state to %d\n",
+					NVME_CTRL_RESETTING);
+			goto out;
+		    }
+		}
+	}
+
+	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING &&
+				dev->ctrl.state != NVME_CTRL_CONNECTING))
 		goto out;
 
 	/*
@@ -2387,10 +2512,12 @@ static void nvme_reset_dev(struct nvme_dev *dev)
 	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
 	 * initializing procedure here.
 	 */
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
-		dev_warn(dev->ctrl.device,
-			"failed to mark controller CONNECTING\n");
-		goto out;
+	if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
+		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
+			dev_warn(dev->ctrl.device,
+				 "failed to mark controller CONNECTING\n");
+			goto out;
+		}
 	}
 
 	result = nvme_pci_enable(dev);
@@ -2483,7 +2610,7 @@ static void nvme_reset_work(struct work_struct *work)
 	struct nvme_dev *dev =
 		container_of(work, struct nvme_dev, ctrl.reset_work);
 
-	nvme_reset_dev(dev);
+	nvme_reset_dev(dev, false);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -2625,6 +2752,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	spin_lock_init(&dev->eh_lock);
+
 	nvme_reset_ctrl(&dev->ctrl);
 
 	return 0;
-- 
2.9.5

Thread overview: 21+ messages
2018-05-05 13:58 [PATCH V4 0/7] nvme: pci: fix & improve timeout handling Ming Lei
2018-05-05 13:58 ` [PATCH V4 1/7] block: introduce blk_quiesce_timeout() and blk_unquiesce_timeout() Ming Lei
2018-05-10 15:01   ` Bart Van Assche
2018-05-10 21:00     ` Ming Lei
2018-05-05 13:59 ` [PATCH V4 2/7] nvme: pci: cover timeout for admin commands running in EH Ming Lei
2018-05-05 13:59 ` [PATCH V4 3/7] nvme: pci: only wait freezing if queue is frozen Ming Lei
2018-05-05 13:59 ` [PATCH V4 4/7] nvme: pci: freeze queue in nvme_dev_disable() in case of error recovery Ming Lei
2018-05-05 13:59 ` [PATCH V4 5/7] nvme: core: introduce 'reset_lock' for sync reset state and reset activities Ming Lei
2018-05-05 13:59 ` [PATCH V4 6/7] nvme: pci: prepare for supporting error recovery from resetting context Ming Lei
2018-05-07 15:04   ` James Smart
2018-05-10 20:53     ` Ming Lei
2018-05-05 13:59 ` Ming Lei [this message]
2018-05-05 23:11 ` [PATCH V4 0/7] nvme: pci: fix & improve timeout handling Laurence Oberman
2018-05-05 23:31   ` Laurence Oberman
2018-05-05 23:51     ` Laurence Oberman
2018-05-08 15:09       ` Keith Busch
2018-05-10 10:28   ` Ming Lei
2018-05-10 21:59     ` Laurence Oberman
2018-05-10 22:10       ` Ming Lei
2018-05-09  5:46 ` jianchao.wang
2018-05-10  2:09   ` Ming Lei
