public inbox for linux-nvme@lists.infradead.org
 help / color / mirror / Atom feed
* [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl
@ 2022-06-23  6:45 Ruozhu Li
  2022-06-27 23:42 ` Chaitanya Kulkarni
  2022-06-29 14:17 ` Christoph Hellwig
  0 siblings, 2 replies; 4+ messages in thread
From: Ruozhu Li @ 2022-06-23  6:45 UTC (permalink / raw)
  To: linux-nvme; +Cc: sagi, hch

We encountered a problem that the disconnect command hangs.
After analyzing the log and stack, we found that the triggering
process is as follows:
CPU0                          CPU1
                                nvme_rdma_error_recovery_work
                                  nvme_rdma_teardown_io_queues
nvme_do_delete_ctrl                 nvme_stop_queues
  nvme_remove_namespaces
  --clear ctrl->namespaces
                                    nvme_start_queues
                                    --no ns in ctrl->namespaces
    nvme_ns_remove                  return(because ctrl is deleting)
      blk_freeze_queue
        blk_mq_freeze_queue_wait
        --wait for ns to unquiesce to clean infligt IO, hang forever

This problem was not found in older kernels because we will flush
err work in nvme_stop_ctrl before nvme_remove_namespaces.It does not
seem to be modified for functional reasons, the patch can be revert
to solve the problem.

Revert commit 794a4cb3d2f7 ("nvme: remove the .stop_ctrl callout")

Signed-off-by: Ruozhu Li <liruozhu@huawei.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
v1 -> v2:
  - As suggested by sagi, modify the title to reflect that this is a
regression-fix patch

 drivers/nvme/host/core.c |  2 ++
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/rdma.c | 12 +++++++++---
 drivers/nvme/host/tcp.c  | 10 +++++++---
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 24165daee3c8..2e9086a18c58 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4580,6 +4580,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 	nvme_stop_failfast_work(ctrl);
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fw_act_work);
+	if (ctrl->ops->stop_ctrl)
+		ctrl->ops->stop_ctrl(ctrl);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9b72b6ecf33c..0f261dc7bd12 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -503,6 +503,7 @@ struct nvme_ctrl_ops {
 	void (*submit_async_event)(struct nvme_ctrl *ctrl);
 	void (*delete_ctrl)(struct nvme_ctrl *ctrl);
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
+	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
 };
 
 /*
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f2a5e1ea508a..46c2dcf72f7e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1048,6 +1048,14 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
 	}
 }
 
+static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+
+	cancel_work_sync(&ctrl->err_work);
+	cancel_delayed_work_sync(&ctrl->reconnect_work);
+}
+
 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -2252,9 +2260,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
 
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
-	cancel_work_sync(&ctrl->err_work);
-	cancel_delayed_work_sync(&ctrl->reconnect_work);
-
 	nvme_rdma_teardown_io_queues(ctrl, shutdown);
 	nvme_stop_admin_queue(&ctrl->ctrl);
 	if (shutdown)
@@ -2304,6 +2309,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
 	.submit_async_event	= nvme_rdma_submit_async_event,
 	.delete_ctrl		= nvme_rdma_delete_ctrl,
 	.get_address		= nvmf_get_address,
+	.stop_ctrl		= nvme_rdma_stop_ctrl,
 };
 
 /*
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bb67538d241b..2c93fee8ba00 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2194,9 +2194,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 
 static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 {
-	cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
-	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
-
 	nvme_tcp_teardown_io_queues(ctrl, shutdown);
 	nvme_stop_admin_queue(ctrl);
 	if (shutdown)
@@ -2236,6 +2233,12 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
 	nvme_tcp_reconnect_or_remove(ctrl);
 }
 
+static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+	cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
+}
+
 static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
 {
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
@@ -2557,6 +2560,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
 	.submit_async_event	= nvme_tcp_submit_async_event,
 	.delete_ctrl		= nvme_tcp_delete_ctrl,
 	.get_address		= nvmf_get_address,
+	.stop_ctrl		= nvme_tcp_stop_ctrl,
 };
 
 static bool
-- 
2.16.4



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl
  2022-06-23  6:45 [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl Ruozhu Li
@ 2022-06-27 23:42 ` Chaitanya Kulkarni
  2022-06-28  3:01   ` liruozhu
  2022-06-29 14:17 ` Christoph Hellwig
  1 sibling, 1 reply; 4+ messages in thread
From: Chaitanya Kulkarni @ 2022-06-27 23:42 UTC (permalink / raw)
  To: Ruozhu Li; +Cc: sagi@grimberg.me, hch@lst.de, linux-nvme@lists.infradead.org

On 6/22/22 23:45, Ruozhu Li wrote:
> We encountered a problem that the disconnect command hangs.
> After analyzing the log and stack, we found that the triggering
> process is as follows:
> CPU0                          CPU1
>                                  nvme_rdma_error_recovery_work
>                                    nvme_rdma_teardown_io_queues
> nvme_do_delete_ctrl                 nvme_stop_queues
>    nvme_remove_namespaces
>    --clear ctrl->namespaces
>                                      nvme_start_queues
>                                      --no ns in ctrl->namespaces
>      nvme_ns_remove                  return(because ctrl is deleting)
>        blk_freeze_queue
>          blk_mq_freeze_queue_wait
>          --wait for ns to unquiesce to clean infligt IO, hang forever
> 
> This problem was not found in older kernels because we will flush
> err work in nvme_stop_ctrl before nvme_remove_namespaces.It does not
> seem to be modified for functional reasons, the patch can be revert
> to solve the problem.
> 
> Revert commit 794a4cb3d2f7 ("nvme: remove the .stop_ctrl callout")
> 

without looking into the code, do you have any idea if fc and/or loop
transport also suffer from similar issue ?

-ck



^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl
  2022-06-27 23:42 ` Chaitanya Kulkarni
@ 2022-06-28  3:01   ` liruozhu
  0 siblings, 0 replies; 4+ messages in thread
From: liruozhu @ 2022-06-28  3:01 UTC (permalink / raw)
  To: Chaitanya Kulkarni
  Cc: sagi@grimberg.me, hch@lst.de, linux-nvme@lists.infradead.org

On 2022/6/28 7:42, Chaitanya Kulkarni wrote:

> On 6/22/22 23:45, Ruozhu Li wrote:
>> We encountered a problem that the disconnect command hangs.
>> After analyzing the log and stack, we found that the triggering
>> process is as follows:
>> CPU0                          CPU1
>>                                   nvme_rdma_error_recovery_work
>>                                     nvme_rdma_teardown_io_queues
>> nvme_do_delete_ctrl                 nvme_stop_queues
>>     nvme_remove_namespaces
>>     --clear ctrl->namespaces
>>                                       nvme_start_queues
>>                                       --no ns in ctrl->namespaces
>>       nvme_ns_remove                  return(because ctrl is deleting)
>>         blk_freeze_queue
>>           blk_mq_freeze_queue_wait
>>           --wait for ns to unquiesce to clean infligt IO, hang forever
>>
>> This problem was not found in older kernels because we will flush
>> err work in nvme_stop_ctrl before nvme_remove_namespaces.It does not
>> seem to be modified for functional reasons, the patch can be revert
>> to solve the problem.
>>
>> Revert commit 794a4cb3d2f7 ("nvme: remove the .stop_ctrl callout")
>>
> without looking into the code, do you have any idea if fc and/or loop
> transport also suffer from similar issue ?
>
> -ck
I am not so familiar with the these transport code. It seems that FC 
will also do
stop\start queue in err work, and there will probably be similar problems.

The loop transport only has reset work, and it will be flushed, so there 
should
be no such problem.


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl
  2022-06-23  6:45 [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl Ruozhu Li
  2022-06-27 23:42 ` Chaitanya Kulkarni
@ 2022-06-29 14:17 ` Christoph Hellwig
  1 sibling, 0 replies; 4+ messages in thread
From: Christoph Hellwig @ 2022-06-29 14:17 UTC (permalink / raw)
  To: Ruozhu Li; +Cc: linux-nvme, sagi, hch

Thanks,

added to nvme-5.19, although I had to manually apply it due to a
conflict.


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-06-29 14:17 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-06-23  6:45 [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl Ruozhu Li
2022-06-27 23:42 ` Chaitanya Kulkarni
2022-06-28  3:01   ` liruozhu
2022-06-29 14:17 ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox