* [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl
@ 2022-06-23 6:45 Ruozhu Li
2022-06-27 23:42 ` Chaitanya Kulkarni
2022-06-29 14:17 ` Christoph Hellwig
0 siblings, 2 replies; 4+ messages in thread
From: Ruozhu Li @ 2022-06-23 6:45 UTC (permalink / raw)
To: linux-nvme; +Cc: sagi, hch
We encountered a problem that the disconnect command hangs.
After analyzing the log and stack, we found that the triggering
process is as follows:
CPU0 CPU1
nvme_rdma_error_recovery_work
nvme_rdma_teardown_io_queues
nvme_do_delete_ctrl nvme_stop_queues
nvme_remove_namespaces
--clear ctrl->namespaces
nvme_start_queues
--no ns in ctrl->namespaces
nvme_ns_remove return(because ctrl is deleting)
blk_freeze_queue
blk_mq_freeze_queue_wait
--wait for ns to unquiesce to clean infligt IO, hang forever
This problem was not found in older kernels because we will flush
err work in nvme_stop_ctrl before nvme_remove_namespaces.It does not
seem to be modified for functional reasons, the patch can be revert
to solve the problem.
Revert commit 794a4cb3d2f7 ("nvme: remove the .stop_ctrl callout")
Signed-off-by: Ruozhu Li <liruozhu@huawei.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
v1 -> v2:
- As suggested by sagi, modify the title to reflect that this is a
regression-fix patch
drivers/nvme/host/core.c | 2 ++
drivers/nvme/host/nvme.h | 1 +
drivers/nvme/host/rdma.c | 12 +++++++++---
drivers/nvme/host/tcp.c | 10 +++++++---
4 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 24165daee3c8..2e9086a18c58 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4580,6 +4580,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
nvme_stop_failfast_work(ctrl);
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work);
+ if (ctrl->ops->stop_ctrl)
+ ctrl->ops->stop_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9b72b6ecf33c..0f261dc7bd12 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -503,6 +503,7 @@ struct nvme_ctrl_ops {
void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
+ void (*stop_ctrl)(struct nvme_ctrl *ctrl);
};
/*
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f2a5e1ea508a..46c2dcf72f7e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1048,6 +1048,14 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
}
}
+static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
+{
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+
+ cancel_work_sync(&ctrl->err_work);
+ cancel_delayed_work_sync(&ctrl->reconnect_work);
+}
+
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -2252,9 +2260,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
- cancel_work_sync(&ctrl->err_work);
- cancel_delayed_work_sync(&ctrl->reconnect_work);
-
nvme_rdma_teardown_io_queues(ctrl, shutdown);
nvme_stop_admin_queue(&ctrl->ctrl);
if (shutdown)
@@ -2304,6 +2309,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
+ .stop_ctrl = nvme_rdma_stop_ctrl,
};
/*
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bb67538d241b..2c93fee8ba00 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2194,9 +2194,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
- cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
- cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
-
nvme_tcp_teardown_io_queues(ctrl, shutdown);
nvme_stop_admin_queue(ctrl);
if (shutdown)
@@ -2236,6 +2233,12 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
nvme_tcp_reconnect_or_remove(ctrl);
}
+static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+ cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+ cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
+}
+
static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
@@ -2557,6 +2560,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.submit_async_event = nvme_tcp_submit_async_event,
.delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvmf_get_address,
+ .stop_ctrl = nvme_tcp_stop_ctrl,
};
static bool
--
2.16.4
^ permalink raw reply related [flat|nested] 4+ messages in thread* Re: [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl
2022-06-23 6:45 [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl Ruozhu Li
@ 2022-06-27 23:42 ` Chaitanya Kulkarni
2022-06-28 3:01 ` liruozhu
2022-06-29 14:17 ` Christoph Hellwig
1 sibling, 1 reply; 4+ messages in thread
From: Chaitanya Kulkarni @ 2022-06-27 23:42 UTC (permalink / raw)
To: Ruozhu Li; +Cc: sagi@grimberg.me, hch@lst.de, linux-nvme@lists.infradead.org
On 6/22/22 23:45, Ruozhu Li wrote:
> We encountered a problem that the disconnect command hangs.
> After analyzing the log and stack, we found that the triggering
> process is as follows:
> CPU0 CPU1
> nvme_rdma_error_recovery_work
> nvme_rdma_teardown_io_queues
> nvme_do_delete_ctrl nvme_stop_queues
> nvme_remove_namespaces
> --clear ctrl->namespaces
> nvme_start_queues
> --no ns in ctrl->namespaces
> nvme_ns_remove return(because ctrl is deleting)
> blk_freeze_queue
> blk_mq_freeze_queue_wait
> --wait for ns to unquiesce to clean infligt IO, hang forever
>
> This problem was not found in older kernels because we will flush
> err work in nvme_stop_ctrl before nvme_remove_namespaces.It does not
> seem to be modified for functional reasons, the patch can be revert
> to solve the problem.
>
> Revert commit 794a4cb3d2f7 ("nvme: remove the .stop_ctrl callout")
>
without looking into the code, do you have any idea if fc and/or loop
transport also suffer from similar issue ?
-ck
^ permalink raw reply [flat|nested] 4+ messages in thread* Re: [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl
2022-06-27 23:42 ` Chaitanya Kulkarni
@ 2022-06-28 3:01 ` liruozhu
0 siblings, 0 replies; 4+ messages in thread
From: liruozhu @ 2022-06-28 3:01 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: sagi@grimberg.me, hch@lst.de, linux-nvme@lists.infradead.org
On 2022/6/28 7:42, Chaitanya Kulkarni wrote:
> On 6/22/22 23:45, Ruozhu Li wrote:
>> We encountered a problem that the disconnect command hangs.
>> After analyzing the log and stack, we found that the triggering
>> process is as follows:
>> CPU0 CPU1
>> nvme_rdma_error_recovery_work
>> nvme_rdma_teardown_io_queues
>> nvme_do_delete_ctrl nvme_stop_queues
>> nvme_remove_namespaces
>> --clear ctrl->namespaces
>> nvme_start_queues
>> --no ns in ctrl->namespaces
>> nvme_ns_remove return(because ctrl is deleting)
>> blk_freeze_queue
>> blk_mq_freeze_queue_wait
>> --wait for ns to unquiesce to clean infligt IO, hang forever
>>
>> This problem was not found in older kernels because we will flush
>> err work in nvme_stop_ctrl before nvme_remove_namespaces.It does not
>> seem to be modified for functional reasons, the patch can be revert
>> to solve the problem.
>>
>> Revert commit 794a4cb3d2f7 ("nvme: remove the .stop_ctrl callout")
>>
> without looking into the code, do you have any idea if fc and/or loop
> transport also suffer from similar issue ?
>
> -ck
I am not so familiar with the these transport code. It seems that FC
will also do
stop\start queue in err work, and there will probably be similar problems.
The loop transport only has reset work, and it will be flushed, so there
should
be no such problem.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl
2022-06-23 6:45 [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl Ruozhu Li
2022-06-27 23:42 ` Chaitanya Kulkarni
@ 2022-06-29 14:17 ` Christoph Hellwig
1 sibling, 0 replies; 4+ messages in thread
From: Christoph Hellwig @ 2022-06-29 14:17 UTC (permalink / raw)
To: Ruozhu Li; +Cc: linux-nvme, sagi, hch
Thanks,
added to nvme-5.19, although I had to manually apply it due to a
conflict.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2022-06-29 14:17 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-06-23 6:45 [PATCH v2] nvme: Fix regression when disconnect a recovering ctrl Ruozhu Li
2022-06-27 23:42 ` Chaitanya Kulkarni
2022-06-28 3:01 ` liruozhu
2022-06-29 14:17 ` Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox