* [PATCH] nvme: remove disk after hw queue is started
From: Ming Lei @ 2017-05-08 11:24 UTC
To: Jens Axboe, linux-block, Keith Busch, Christoph Hellwig,
Sagi Grimberg
Cc: linux-nvme, Ming Lei, stable
If the hw queue is stopped, the following hang can be triggered
when doing a PCI reset/remove while running a heavy I/O load at
the same time.
This patch fixes the issue by calling nvme_uninit_ctrl()
just after nvme_dev_disable(dev, true) in nvme_remove().
[ 492.232593] INFO: task nvme-test:5939 blocked for more than 120 seconds.
[ 492.240081] Not tainted 4.11.0.nvme_v4.11_debug_hang+ #3
[ 492.246600] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 492.255346] nvme-test D 0 5939 5938 0x00000080
[ 492.261475] Call Trace:
[ 492.264215] __schedule+0x289/0x8f0
[ 492.268105] ? write_cache_pages+0x14c/0x510
[ 492.272873] schedule+0x36/0x80
[ 492.276381] io_schedule+0x16/0x40
[ 492.280181] wait_on_page_bit_common+0x137/0x220
[ 492.285336] ? page_cache_tree_insert+0x120/0x120
[ 492.290589] __filemap_fdatawait_range+0x128/0x1a0
[ 492.295941] filemap_fdatawait_range+0x14/0x30
[ 492.300902] filemap_fdatawait+0x23/0x30
[ 492.305282] filemap_write_and_wait+0x4c/0x80
[ 492.310151] __sync_blockdev+0x1f/0x40
[ 492.314336] fsync_bdev+0x44/0x50
[ 492.318039] invalidate_partition+0x24/0x50
[ 492.322710] del_gendisk+0xcd/0x2e0
[ 492.326608] nvme_ns_remove+0x105/0x130 [nvme_core]
[ 492.332054] nvme_remove_namespaces+0x32/0x50 [nvme_core]
[ 492.338082] nvme_uninit_ctrl+0x2d/0xa0 [nvme_core]
[ 492.343519] nvme_remove+0x5d/0x170 [nvme]
[ 492.348096] pci_device_remove+0x39/0xc0
[ 492.352477] device_release_driver_internal+0x141/0x1f0
[ 492.358311] device_release_driver+0x12/0x20
[ 492.363072] pci_stop_bus_device+0x8c/0xa0
[ 492.367646] pci_stop_and_remove_bus_device_locked+0x1a/0x30
[ 492.373965] remove_store+0x7c/0x90
[ 492.377852] dev_attr_store+0x18/0x30
[ 492.381941] sysfs_kf_write+0x3a/0x50
[ 492.386028] kernfs_fop_write+0xff/0x180
[ 492.390409] __vfs_write+0x37/0x160
[ 492.394304] ? selinux_file_permission+0xe5/0x120
[ 492.399556] ? security_file_permission+0x3b/0xc0
[ 492.404807] vfs_write+0xb2/0x1b0
[ 492.408508] ? syscall_trace_enter+0x1d0/0x2b0
[ 492.413462] SyS_write+0x55/0xc0
[ 492.417064] do_syscall_64+0x67/0x180
[ 492.421155] entry_SYSCALL64_slow_path+0x25/0x25
Cc: stable@vger.kernel.org
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/nvme/host/pci.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c8541c3dcd19..ebe13e157c00 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2185,8 +2185,8 @@ static void nvme_remove(struct pci_dev *pdev)
}
flush_work(&dev->reset_work);
- nvme_uninit_ctrl(&dev->ctrl);
nvme_dev_disable(dev, true);
+ nvme_uninit_ctrl(&dev->ctrl);
nvme_dev_remove_admin(dev);
nvme_free_queues(dev, 0);
nvme_release_cmb(dev);
--
2.9.3
* Re: [PATCH] nvme: remove disk after hw queue is started
From: Ming Lei @ 2017-05-08 12:46 UTC
To: Jens Axboe, linux-block, Keith Busch, Christoph Hellwig,
Sagi Grimberg
Cc: linux-nvme, stable
On Mon, May 08, 2017 at 07:24:57PM +0800, Ming Lei wrote:
> If the hw queue is stopped, the following hang can be triggered
> when doing a PCI reset/remove while running a heavy I/O load at
> the same time.
>
> This patch fixes the issue by calling nvme_uninit_ctrl()
> just after nvme_dev_disable(dev, true) in nvme_remove().
>
> [ 492.232593] INFO: task nvme-test:5939 blocked for more than 120 seconds.
> [ 492.240081] Not tainted 4.11.0.nvme_v4.11_debug_hang+ #3
> [ 492.246600] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 492.255346] nvme-test D 0 5939 5938 0x00000080
> [ 492.261475] Call Trace:
> [ 492.264215] __schedule+0x289/0x8f0
> [ 492.268105] ? write_cache_pages+0x14c/0x510
> [ 492.272873] schedule+0x36/0x80
> [ 492.276381] io_schedule+0x16/0x40
> [ 492.280181] wait_on_page_bit_common+0x137/0x220
> [ 492.285336] ? page_cache_tree_insert+0x120/0x120
> [ 492.290589] __filemap_fdatawait_range+0x128/0x1a0
> [ 492.295941] filemap_fdatawait_range+0x14/0x30
> [ 492.300902] filemap_fdatawait+0x23/0x30
> [ 492.305282] filemap_write_and_wait+0x4c/0x80
> [ 492.310151] __sync_blockdev+0x1f/0x40
> [ 492.314336] fsync_bdev+0x44/0x50
> [ 492.318039] invalidate_partition+0x24/0x50
> [ 492.322710] del_gendisk+0xcd/0x2e0
> [ 492.326608] nvme_ns_remove+0x105/0x130 [nvme_core]
> [ 492.332054] nvme_remove_namespaces+0x32/0x50 [nvme_core]
> [ 492.338082] nvme_uninit_ctrl+0x2d/0xa0 [nvme_core]
> [ 492.343519] nvme_remove+0x5d/0x170 [nvme]
> [ 492.348096] pci_device_remove+0x39/0xc0
> [ 492.352477] device_release_driver_internal+0x141/0x1f0
> [ 492.358311] device_release_driver+0x12/0x20
> [ 492.363072] pci_stop_bus_device+0x8c/0xa0
> [ 492.367646] pci_stop_and_remove_bus_device_locked+0x1a/0x30
> [ 492.373965] remove_store+0x7c/0x90
> [ 492.377852] dev_attr_store+0x18/0x30
> [ 492.381941] sysfs_kf_write+0x3a/0x50
> [ 492.386028] kernfs_fop_write+0xff/0x180
> [ 492.390409] __vfs_write+0x37/0x160
> [ 492.394304] ? selinux_file_permission+0xe5/0x120
> [ 492.399556] ? security_file_permission+0x3b/0xc0
> [ 492.404807] vfs_write+0xb2/0x1b0
> [ 492.408508] ? syscall_trace_enter+0x1d0/0x2b0
> [ 492.413462] SyS_write+0x55/0xc0
> [ 492.417064] do_syscall_64+0x67/0x180
> [ 492.421155] entry_SYSCALL64_slow_path+0x25/0x25
>
> Cc: stable@vger.kernel.org
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
> drivers/nvme/host/pci.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index c8541c3dcd19..ebe13e157c00 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2185,8 +2185,8 @@ static void nvme_remove(struct pci_dev *pdev)
> }
>
> flush_work(&dev->reset_work);
> - nvme_uninit_ctrl(&dev->ctrl);
> nvme_dev_disable(dev, true);
> + nvme_uninit_ctrl(&dev->ctrl);
> nvme_dev_remove_admin(dev);
> nvme_free_queues(dev, 0);
> nvme_release_cmb(dev);
This patch is probably wrong; it looks like the correct fix should be
flushing 'dev->remove_work' before calling nvme_uninit_ctrl().
But simply calling flush_work(&dev->remove_work) here might cause
a deadlock.
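
For anyone following along, here is a minimal sketch of the idea above,
assuming the nvme_remove() flow shown in the diff (the drvdata lookup and
the omitted teardown lines are only assumed context); it illustrates the
deadlock concern and is not proposed as the fix:

        static void nvme_remove(struct pci_dev *pdev)
        {
                struct nvme_dev *dev = pci_get_drvdata(pdev);

                /* ... existing teardown up to here ... */
                flush_work(&dev->reset_work);

                /*
                 * Idea under discussion: wait for a scheduled remove_work
                 * before uninitializing the controller.  The concern:
                 * remove_work (nvme_remove_dead_ctrl_work) ends in
                 * device_release_driver(), which is the same path that
                 * invokes nvme_remove() (see the call trace above), so
                 * flushing it from inside nvme_remove() can deadlock when
                 * nvme_remove() is running on behalf of remove_work.
                 */
                flush_work(&dev->remove_work);

                nvme_uninit_ctrl(&dev->ctrl);
                nvme_dev_disable(dev, true);
                /* ... remaining teardown unchanged ... */
        }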
Thanks,
Ming
* Re: [PATCH] nvme: remove disk after hw queue is started
From: Keith Busch @ 2017-05-08 15:07 UTC
To: Ming Lei
Cc: Jens Axboe, linux-block, Christoph Hellwig, Sagi Grimberg,
linux-nvme, stable
On Mon, May 08, 2017 at 08:46:39PM +0800, Ming Lei wrote:
> On Mon, May 08, 2017 at 07:24:57PM +0800, Ming Lei wrote:
> > diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> > index c8541c3dcd19..ebe13e157c00 100644
> > --- a/drivers/nvme/host/pci.c
> > +++ b/drivers/nvme/host/pci.c
> > @@ -2185,8 +2185,8 @@ static void nvme_remove(struct pci_dev *pdev)
> > }
> >
> > flush_work(&dev->reset_work);
> > - nvme_uninit_ctrl(&dev->ctrl);
> > nvme_dev_disable(dev, true);
> > + nvme_uninit_ctrl(&dev->ctrl);
> > nvme_dev_remove_admin(dev);
> > nvme_free_queues(dev, 0);
> > nvme_release_cmb(dev);
>
> This patch is probably wrong; it looks like the correct fix should be
> flushing 'dev->remove_work' before calling nvme_uninit_ctrl().
Yeah, disabling the device before calling "nvme_uninit_ctrl" shouldn't
be required. If you disable the device first, del_gendisk can't flush
dirty data on an orderly removal request.
> But simply calling flush_work(&dev->remove_work) here might cause
> a deadlock.
I'm almost certain the remove_work shouldn't even be running in this
case. If the reset work can't transition the controller state correctly,
it should assume something is handling the controller.
---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 26a5fd0..d81104d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1792,7 +1797,7 @@ static void nvme_reset_work(struct work_struct *work)
nvme_dev_disable(dev, false);
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
- goto out;
+ return;
result = nvme_pci_enable(dev);
if (result)
--
* Re: [PATCH] nvme: remove disk after hw queue is started
From: Keith Busch @ 2017-05-08 15:11 UTC
To: Ming Lei
Cc: Jens Axboe, Sagi Grimberg, stable, linux-block, linux-nvme,
Christoph Hellwig
On Mon, May 08, 2017 at 11:07:20AM -0400, Keith Busch wrote:
> I'm almost certain the remove_work shouldn't even be running in this
> case. If the reset work can't transition the controller state correctly,
> it should assume something is handling the controller.
Here's the more complete version of what I had in mind. Does this solve
the reported issue?
---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 26a5fd0..46a37fb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1792,7 +1792,7 @@ static void nvme_reset_work(struct work_struct *work)
nvme_dev_disable(dev, false);
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
- goto out;
+ return;
result = nvme_pci_enable(dev);
if (result)
@@ -1854,7 +1854,7 @@ static void nvme_reset_work(struct work_struct *work)
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device, "failed to mark controller live\n");
- goto out;
+ return;
}
if (dev->online_queues > 1)
--
* Re: [PATCH] nvme: remove disk after hw queue is started
From: Ming Lei @ 2017-05-08 16:15 UTC
To: Keith Busch
Cc: Jens Axboe, Sagi Grimberg, stable, linux-block, linux-nvme,
Christoph Hellwig
On Mon, May 08, 2017 at 11:11:24AM -0400, Keith Busch wrote:
> On Mon, May 08, 2017 at 11:07:20AM -0400, Keith Busch wrote:
> > I'm almost certain the remove_work shouldn't even be running in this
> > case. If the reset work can't transition the controller state correctly,
> > it should assume something is handling the controller.
>
> Here's the more complete version of what I had in mind. Does this solve
> the reported issue?
>
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 26a5fd0..46a37fb 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1792,7 +1792,7 @@ static void nvme_reset_work(struct work_struct *work)
> nvme_dev_disable(dev, false);
>
> if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
> - goto out;
> + return;
>
> result = nvme_pci_enable(dev);
> if (result)
> @@ -1854,7 +1854,7 @@ static void nvme_reset_work(struct work_struct *work)
>
> if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
> dev_warn(dev->ctrl.device, "failed to mark controller live\n");
> - goto out;
> + return;
> }
>
> if (dev->online_queues > 1)
This patch looks like it works, but it seems any 'goto out' in this
function risks causing the same race too.
Another solution I thought of is to kill the queues earlier; what do you
think about the following patch?
---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c8541c3dcd19..16740e8c4225 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1892,6 +1892,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
kref_get(&dev->ctrl.kref);
nvme_dev_disable(dev, false);
+ nvme_kill_queues(&dev->ctrl);
if (!schedule_work(&dev->remove_work))
nvme_put_ctrl(&dev->ctrl);
}
@@ -1998,7 +1999,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
struct pci_dev *pdev = to_pci_dev(dev->dev);
- nvme_kill_queues(&dev->ctrl);
if (pci_get_drvdata(pdev))
device_release_driver(&pdev->dev);
nvme_put_ctrl(&dev->ctrl);
Thanks,
Ming
* Re: [PATCH] nvme: remove disk after hw queue is started
From: Keith Busch @ 2017-05-08 17:25 UTC
To: Ming Lei
Cc: Jens Axboe, Sagi Grimberg, stable, linux-block, linux-nvme,
Christoph Hellwig
On Tue, May 09, 2017 at 12:15:25AM +0800, Ming Lei wrote:
> This patch looks like it works, but it seems any 'goto out' in this
> function risks causing the same race too.
The goto was really intended for handling totally broken controllers,
which isn't the case if someone requested to remove the pci device while
we're initializing it. Point taken, though, let me run a few tests and
see if there's a better way to handle this condition.
> Another solution I thought of is to kill the queues earlier; what do you
> think about the following patch?
That should get it unstuck, but it will error all the IO that fsync_bdev
would probably rather complete successfully.
Question though, why doesn't the remove_work's nvme_kill_queues in
its current place allow forward progress already?
> ---
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index c8541c3dcd19..16740e8c4225 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1892,6 +1892,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
>
> kref_get(&dev->ctrl.kref);
> nvme_dev_disable(dev, false);
> + nvme_kill_queues(&dev->ctrl);
> if (!schedule_work(&dev->remove_work))
> nvme_put_ctrl(&dev->ctrl);
> }
> @@ -1998,7 +1999,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
> struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
> struct pci_dev *pdev = to_pci_dev(dev->dev);
>
> - nvme_kill_queues(&dev->ctrl);
> if (pci_get_drvdata(pdev))
> device_release_driver(&pdev->dev);
> nvme_put_ctrl(&dev->ctrl);
* Re: [PATCH] nvme: remove disk after hw queue is started
From: Ming Lei @ 2017-05-09 1:10 UTC
To: Keith Busch
Cc: Jens Axboe, Sagi Grimberg, stable, linux-block, linux-nvme,
Christoph Hellwig
Hi Keith,
Thanks for looking at this issue!
On Mon, May 08, 2017 at 01:25:12PM -0400, Keith Busch wrote:
> On Tue, May 09, 2017 at 12:15:25AM +0800, Ming Lei wrote:
> > This patch looks like it works, but it seems any 'goto out' in this
> > function risks causing the same race too.
>
> > The goto was really intended for handling totally broken controllers,
> which isn't the case if someone requested to remove the pci device while
> we're initializing it. Point taken, though, let me run a few tests and
> see if there's a better way to handle this condition.
The thing is that remove can happen at any time, either from hotplug,
from unbinding the driver, or from 'echo 1 > $PCI_PATH/remove'. At the
same time, a reset can be ongoing.
It also looks like the hang in del_gendisk() is fixed by this change, but I
just found a new issue which is triggered after the NVMe PCI device is
rescanned again after the last remove.
[ 504.135554] VFS: Dirty inode writeback failed for block device nvme0n1p1 (err=-5).
>
> > Another solution I thought of is to kill the queues earlier; what do you
> > think about the following patch?
>
> That should get it unstuck, but it will error all the IO that fsync_bdev
> would probably rather complete successfully.
nvme_dev_disable(dev, false) has already completed before the queues are
killed in nvme_remove_dead_ctrl(), so the hw queues are stopped and
nvmeq->cq_vector is set to -1 in nvme_suspend_queue(). That means no new
I/O (including the I/O issued by fsync_bdev) can be submitted successfully
any more, so it looks reasonable to kill the queues in
nvme_remove_dead_ctrl().
>
> Question though, why doesn't the remove_work's nvme_kill_queues in
> its current place allow forward progress already?
That is because .remove_work may not have run before del_gendisk() is
started, even though .reset_work has been flushed, and we can't simply
flush .remove_work here.
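
To make the ordering concrete, the race looks roughly like this (a sketch
pieced together from the call trace above and the nvme_remove_dead_ctrl()
/ nvme_remove_dead_ctrl_work() code quoted earlier; the exact path inside
nvme_reset_work() is assumed, and timing varies from run to run):

  reset path (reset fails)              pci remove path (sysfs remove)
  ------------------------              ------------------------------
  nvme_remove_dead_ctrl()
    nvme_dev_disable(dev, false)        nvme_remove()
    schedule_work(&dev->remove_work)      flush_work(&dev->reset_work)
                                          nvme_uninit_ctrl(&dev->ctrl)
                                            nvme_remove_namespaces()
                                              nvme_ns_remove()
                                                del_gendisk()
                                                  fsync_bdev() blocks: the hw
                                                  queues are stopped, and
                                                  remove_work (which would
                                                  call nvme_kill_queues())
                                                  has not run yet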
Thanks,
Ming
* Re: [PATCH] nvme: remove disk after hw queue is started
From: Ming Lei @ 2017-05-09 3:26 UTC
To: Keith Busch
Cc: Jens Axboe, Sagi Grimberg, stable, linux-block, linux-nvme,
Christoph Hellwig
Hi Keith,
On Tue, May 09, 2017 at 09:10:30AM +0800, Ming Lei wrote:
> Hi Keith,
>
> Thanks for looking at this issue!
>
> On Mon, May 08, 2017 at 01:25:12PM -0400, Keith Busch wrote:
> > On Tue, May 09, 2017 at 12:15:25AM +0800, Ming Lei wrote:
> > > This patch looks like it works, but it seems any 'goto out' in this
> > > function risks causing the same race too.
> >
> > > The goto was really intended for handling totally broken controllers,
> > which isn't the case if someone requested to remove the pci device while
> > we're initializing it. Point taken, though, let me run a few tests and
> > see if there's a better way to handle this condition.
>
> The thing is that remove can happen at any time, either from hotplug,
> from unbinding the driver, or from 'echo 1 > $PCI_PATH/remove'. At the
> same time, a reset can be ongoing.
>
> It also looks like the hang in del_gendisk() is fixed by this change, but I
> just found a new issue which is triggered after the NVMe PCI device is
> rescanned again after the last remove.
>
> [ 504.135554] VFS: Dirty inode writeback failed for block device nvme0n1p1 (err=-5).
>
> >
> > > Another solution I thought of is to kill the queues earlier; what do you
> > > think about the following patch?
> >
> > That should get it unstuck, but it will error all the IO that fsync_bdev
> > would probably rather complete successfully.
>
> nvme_dev_disable(dev, false) has already completed before the queues are
> killed in nvme_remove_dead_ctrl(), so the hw queues are stopped and
> nvmeq->cq_vector is set to -1 in nvme_suspend_queue(). That means no new
> I/O (including the I/O issued by fsync_bdev) can be submitted successfully
> any more, so it looks reasonable to kill the queues in
> nvme_remove_dead_ctrl().
>
> >
> > Question though, why doesn't the remove_work's nvme_kill_queues in
> > its current place allow forward progress already?
>
> That is because .remove_work may not have run before del_gendisk() is
> started, even though .reset_work has been flushed, and we can't simply
> flush .remove_work here.
The following patch should be a complete version. The I/O stress test
with PCI reset/remove has survived for hours, whereas previously the
hang could be reproduced within at most half an hour.
---
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d5e0906262ea..f74cdf8e710f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2097,6 +2097,15 @@ static void nvme_ns_remove(struct nvme_ns *ns)
&nvme_ns_attr_group);
if (ns->ndev)
nvme_nvm_unregister_sysfs(ns);
+
+ /*
+ * If the queue is dead, we have to abort requests on the
+ * requeue list first, because fsync_bdev() in the disk
+ * removal path may wait for these I/Os, which cannot be
+ * submitted to hardware either.
+ */
+ if (blk_queue_dying(ns->queue))
+ blk_mq_abort_requeue_list(ns->queue);
del_gendisk(ns->disk);
blk_mq_abort_requeue_list(ns->queue);
blk_cleanup_queue(ns->queue);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c8541c3dcd19..661b5ff7c168 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1892,6 +1892,14 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
kref_get(&dev->ctrl.kref);
nvme_dev_disable(dev, false);
+
+ /*
+ * nvme_dev_disable() has suspended the queues, so no new I/O
+ * can be submitted to hardware successfully any more. Kill
+ * the queues now to avoid a race between reset failure
+ * and remove.
+ */
+ nvme_kill_queues(&dev->ctrl);
if (!schedule_work(&dev->remove_work))
nvme_put_ctrl(&dev->ctrl);
}
@@ -1998,7 +2006,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
struct pci_dev *pdev = to_pci_dev(dev->dev);
- nvme_kill_queues(&dev->ctrl);
if (pci_get_drvdata(pdev))
device_release_driver(&pdev->dev);
nvme_put_ctrl(&dev->ctrl);
Thanks,
Ming