Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv2] nvme-pci: do not directly handle subsys reset fallout
@ 2024-06-24 17:26 Keith Busch
  2024-06-24 17:36 ` Christoph Hellwig
  2024-06-25 10:02 ` Nilay Shroff
  0 siblings, 2 replies; 5+ messages in thread
From: Keith Busch @ 2024-06-24 17:26 UTC (permalink / raw)
  To: hch, sagi, linux-nvme; +Cc: Keith Busch, Nilay Shroff

From: Keith Busch <kbusch@kernel.org>

Scheduling reset_work after a nvme subsystem reset is expected to fail
on pcie, but this also prevents potential handling that the platform's
pcie services may provide, which might successfully recover the link
without re-enumeration. Such examples include AER, DPC, and Power's EEH.

Provide a pci specific operation that safely initiates a subsystem
reset, and instead of scheduling reset work, read back the status
register to trigger a pcie read error.

Since this only affects pci, the other fabrics drivers subscribe to a
generic nvmf subsystem reset that is exactly the same as before. The
loop fabric doesn't use it because nvmet doesn't support setting that
property anyway.

And since we're using the magic NSSR value in two places now, provide a
symbolic define for it.

Reported-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
Changes from v1, applied comments from Christoph:

  Split out and require new ctrl subsystem reset op

  Comments updated for correctness and clarity

  Use a symbolic name for the special reset value

 drivers/nvme/host/fabrics.c | 15 +++++++++++++++
 drivers/nvme/host/fabrics.h |  1 +
 drivers/nvme/host/fc.c      |  1 +
 drivers/nvme/host/nvme.h    | 14 +++-----------
 drivers/nvme/host/pci.c     | 35 +++++++++++++++++++++++++++++++++++
 drivers/nvme/host/rdma.c    |  1 +
 drivers/nvme/host/tcp.c     |  1 +
 include/linux/nvme.h        |  3 +++
 8 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index ceb9c0ed3120b..bb52049cb7c54 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -280,6 +280,21 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
 }
 EXPORT_SYMBOL_GPL(nvmf_reg_write32);
 
+int nvmf_subsystem_reset(struct nvme_ctrl *ctrl)
+{
+	int ret;
+
+	if (!nvme_wait_reset(ctrl))
+		return -EBUSY;
+
+	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, NVME_SUBSYS_RESET);
+	if (ret)
+		return ret;
+
+	return nvme_try_sched_reset(ctrl);
+}
+EXPORT_SYMBOL_GPL(nvmf_subsystem_reset);
+
 /**
  * nvmf_log_connect_error() - Error-parsing-diagnostic print out function for
  * 				connect() errors.
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 602135910ae9c..21d75dc4a3a09 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -217,6 +217,7 @@ static inline unsigned int nvmf_nr_io_queues(struct nvmf_ctrl_options *opts)
 int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
 int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
 int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
+int nvmf_subsystem_reset(struct nvme_ctrl *ctrl);
 int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
 int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
 int nvmf_register_transport(struct nvmf_transport_ops *ops);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index f0b0813327491..67802078c78c1 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3382,6 +3382,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
 	.reg_read32		= nvmf_reg_read32,
 	.reg_read64		= nvmf_reg_read64,
 	.reg_write32		= nvmf_reg_write32,
+	.subsystem_reset	= nvmf_subsystem_reset,
 	.free_ctrl		= nvme_fc_free_ctrl,
 	.submit_async_event	= nvme_fc_submit_async_event,
 	.delete_ctrl		= nvme_fc_delete_ctrl,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 68b400f9c42d5..9909ca8a1f476 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -551,6 +551,7 @@ struct nvme_ctrl_ops {
 	int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
 	void (*free_ctrl)(struct nvme_ctrl *ctrl);
 	void (*submit_async_event)(struct nvme_ctrl *ctrl);
+	int (*subsystem_reset)(struct nvme_ctrl *ctrl);
 	void (*delete_ctrl)(struct nvme_ctrl *ctrl);
 	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
@@ -649,18 +650,9 @@ int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
 
 static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
 {
-	int ret;
-
-	if (!ctrl->subsystem)
+	if (!ctrl->subsystem || !ctrl->ops->subsystem_reset)
 		return -ENOTTY;
-	if (!nvme_wait_reset(ctrl))
-		return -EBUSY;
-
-	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
-	if (ret)
-		return ret;
-
-	return nvme_try_sched_reset(ctrl);
+	return ctrl->ops->subsystem_reset(ctrl);
 }
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 102a9fb0c65ff..6cb7acef05576 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1143,6 +1143,40 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 	spin_unlock(&nvmeq->sq_lock);
 }
 
+static int nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	int ret = 0;
+
+	/*
+	 * Taking the shutdown_lock ensures the BAR mapping is not being
+	 * altered by reset_work. Holding this lock before the RESETTING state
+	 * change, if successful, also ensures nvme_remove won't be able to
+	 * proceed to iounmap until we're done.
+	 */
+	mutex_lock(&dev->shutdown_lock);
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	if (!dev->bar_mapped_size) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
+	writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR);
+
+	/*
+	 * Read controller status to flush the previous write and trigger a
+	 * pcie read error.
+	 */
+	readl(dev->bar + NVME_REG_CSTS);
+unlock:
+	mutex_unlock(&dev->shutdown_lock);
+	return ret;
+}
+
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 {
 	struct nvme_command c = { };
@@ -2859,6 +2893,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.reg_read64		= nvme_pci_reg_read64,
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.submit_async_event	= nvme_pci_submit_async_event,
+	.subsystem_reset	= nvme_pci_subsystem_reset,
 	.get_address		= nvme_pci_get_address,
 	.print_device_info	= nvme_pci_print_device_info,
 	.supports_pci_p2pdma	= nvme_pci_supports_pci_p2pdma,
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 51a62b0c645a7..f514ffa671879 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2201,6 +2201,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
 	.reg_read32		= nvmf_reg_read32,
 	.reg_read64		= nvmf_reg_read64,
 	.reg_write32		= nvmf_reg_write32,
+	.subsystem_reset	= nvmf_subsystem_reset,
 	.free_ctrl		= nvme_rdma_free_ctrl,
 	.submit_async_event	= nvme_rdma_submit_async_event,
 	.delete_ctrl		= nvme_rdma_delete_ctrl,
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8b5e4327fe83b..46c1c4c5128c0 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2662,6 +2662,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
 	.reg_read32		= nvmf_reg_read32,
 	.reg_read64		= nvmf_reg_read64,
 	.reg_write32		= nvmf_reg_write32,
+	.subsystem_reset	= nvmf_subsystem_reset,
 	.free_ctrl		= nvme_tcp_free_ctrl,
 	.submit_async_event	= nvme_tcp_submit_async_event,
 	.delete_ctrl		= nvme_tcp_delete_ctrl,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index c693ac344ec05..b7d06d46aa08c 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -25,6 +25,9 @@
 
 #define NVME_NSID_ALL		0xffffffff
 
+/* Special NSSR value, 'NVMe' */
+#define NVME_SUBSYS_RESET	0x4E564D65
+
 enum nvme_subsys_type {
 	/* Referral to another discovery type target subsystem */
 	NVME_NQN_DISC	= 1,
-- 
2.43.0



^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCHv2] nvme-pci: do not directly handle subsys reset fallout
  2024-06-24 17:26 [PATCHv2] nvme-pci: do not directly handle subsys reset fallout Keith Busch
@ 2024-06-24 17:36 ` Christoph Hellwig
  2024-06-24 19:20   ` Keith Busch
  2024-06-25 10:02 ` Nilay Shroff
  1 sibling, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2024-06-24 17:36 UTC (permalink / raw)
  To: Keith Busch; +Cc: hch, sagi, linux-nvme, Keith Busch, Nilay Shroff

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCHv2] nvme-pci: do not directly handle subsys reset fallout
  2024-06-24 17:36 ` Christoph Hellwig
@ 2024-06-24 19:20   ` Keith Busch
  0 siblings, 0 replies; 5+ messages in thread
From: Keith Busch @ 2024-06-24 19:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, sagi, linux-nvme, Nilay Shroff

Applied to nvme-6.11


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCHv2] nvme-pci: do not directly handle subsys reset fallout
  2024-06-24 17:26 [PATCHv2] nvme-pci: do not directly handle subsys reset fallout Keith Busch
  2024-06-24 17:36 ` Christoph Hellwig
@ 2024-06-25 10:02 ` Nilay Shroff
  2024-06-25 16:02   ` Keith Busch
  1 sibling, 1 reply; 5+ messages in thread
From: Nilay Shroff @ 2024-06-25 10:02 UTC (permalink / raw)
  To: Keith Busch, hch, sagi, linux-nvme; +Cc: Keith Busch, Gregory Joyce

Hi Keith,

On 6/24/24 22:56, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Scheduling reset_work after a nvme subsystem reset is expected to fail
> on pcie, but this also prevents potential handling that the platform's
> pcie services may provide, which might successfully recover the link
> without re-enumeration. Such examples include AER, DPC, and Power's EEH.
> 
> Provide a pci specific operation that safely initiates a subsystem
> reset, and instead of scheduling reset work, read back the status
> register to trigger a pcie read error.
> 
> Since this only affects pci, the other fabrics drivers subscribe to a
> generic nvmf subsystem reset that is exactly the same as before. The
> loop fabric doesn't use it because nvmet doesn't support setting that
> property anyway.
> 
> And since we're using the magic NSSR value in two places now, provide a
> symbolic define for it.
> 
> Reported-by: Nilay Shroff <nilay@linux.ibm.com>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> Changes from v1, applied comments from Christoph:
> 
>   Split out and require new ctrl subsystem reset op
> 
>   Comments updated for correctness and clarity
> 
>   Use a symbolic name for the special reset value
> 
>  drivers/nvme/host/fabrics.c | 15 +++++++++++++++
>  drivers/nvme/host/fabrics.h |  1 +
>  drivers/nvme/host/fc.c      |  1 +
>  drivers/nvme/host/nvme.h    | 14 +++-----------
>  drivers/nvme/host/pci.c     | 35 +++++++++++++++++++++++++++++++++++
>  drivers/nvme/host/rdma.c    |  1 +
>  drivers/nvme/host/tcp.c     |  1 +
>  include/linux/nvme.h        |  3 +++
>  8 files changed, 60 insertions(+), 11 deletions(-)
> 
>  
>  /*
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 102a9fb0c65ff..6cb7acef05576 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1143,6 +1143,40 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
>  	spin_unlock(&nvmeq->sq_lock);
>  }
>  
> +static int nvme_pci_subsystem_reset(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_dev *dev = to_nvme_dev(ctrl);
> +	int ret = 0;
> +
> +	/*
> +	 * Taking the shutdown_lock ensures the BAR mapping is not being
> +	 * altered by reset_work. Holding this lock before the RESETTING state
> +	 * change, if successful, also ensures nvme_remove won't be able to
> +	 * proceed to iounmap until we're done.
> +	 */
> +	mutex_lock(&dev->shutdown_lock);
> +	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) {
> +		ret = -EBUSY;
> +		goto unlock;
> +	}
> +
> +	if (!dev->bar_mapped_size) {
> +		ret = -ENODEV;
> +		goto unlock;
> +	}
> +
> +	writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR);
> +
> +	/*
> +	 * Read controller status to flush the previous write and trigger a
> +	 * pcie read error.
> +	 */
> +	readl(dev->bar + NVME_REG_CSTS);
> +unlock:
> +	mutex_unlock(&dev->shutdown_lock);
> +	return ret;
> +}
I noticed that in your earlier proposal we were changing the NVMe controller 
state from RESETTING to LIVE just after the write to NVME_REG_NSSR. In this patch
you removed that code, so now the pci error recovery code (nvme_error_detected()) 
can no longer change the state of the controller to RESETTING, and that causes the 
pci error recovery to immediately bail out without recovering the NVMe adapter. 

I think we need to change the state of the controller to LIVE after the write to
NVME_REG_NSSR. So was there any specific reason you removed that code, or was it
missed out by mistake?

Thanks,
--Nilay


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCHv2] nvme-pci: do not directly handle subsys reset fallout
  2024-06-25 10:02 ` Nilay Shroff
@ 2024-06-25 16:02   ` Keith Busch
  0 siblings, 0 replies; 5+ messages in thread
From: Keith Busch @ 2024-06-25 16:02 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: Keith Busch, hch, sagi, linux-nvme, Gregory Joyce

On Tue, Jun 25, 2024 at 03:32:59PM +0530, Nilay Shroff wrote:
> On 6/24/24 22:56, Keith Busch wrote:
> > +	}
> > +
> > +	writel(NVME_SUBSYS_RESET, dev->bar + NVME_REG_NSSR);
> > +
> > +	/*
> > +	 * Read controller status to flush the previous write and trigger a
> > +	 * pcie read error.
> > +	 */
> > +	readl(dev->bar + NVME_REG_CSTS);
> > +unlock:
> > +	mutex_unlock(&dev->shutdown_lock);
> > +	return ret;
> > +}
> I noticed that in your earlier proposal we were changing the NVMe controller 
> state from RESETTING to LIVE just after write to NVME_REG_NSSR. In this patch,
> you removed that code and due to it now pci error recovery code (nvme_error_detected()) 
> couldn't change the state of controller to RESETTING and that causes the pci error 
> recovery to immediately bail out without recovering the NVMe adapter. 
> 
> I think we need to change state of controller to LIVE after write to NVME_REG_NSSR.
> So was there any specific reason you removed that code or was it missed out by mistake?

Thanks for the catch, that was unintentional. I'll back it out and
submit with the correct state transition.


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-06-25 16:02 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-06-24 17:26 [PATCHv2] nvme-pci: do not directly handle subsys reset fallout Keith Busch
2024-06-24 17:36 ` Christoph Hellwig
2024-06-24 19:20   ` Keith Busch
2024-06-25 10:02 ` Nilay Shroff
2024-06-25 16:02   ` Keith Busch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox