* [PATCH 0/2] skip removed namespaces
@ 2019-08-01 7:16 Hannes Reinecke
2019-08-01 7:16 ` [PATCH 1/2] nvme: skip namespaces which are about to be removed Hannes Reinecke
2019-08-01 7:16 ` [PATCH 2/2] nvme: do not remove namespaces during reset Hannes Reinecke
0 siblings, 2 replies; 11+ messages in thread
From: Hannes Reinecke @ 2019-08-01 7:16 UTC (permalink / raw)
Hi all,
when traversing the namespaces lists we need to ensure to skip
any namespaces marked as 'REMOVING', as these are about to be
deleted and shouldn't be accessed anymore.
As usual, comments and reviews are welcome.
Hannes Reinecke (2):
nvme: skip namespaces which are about to be removed
nvme: do not remove namespaces during reset
drivers/nvme/host/core.c | 44 ++++++++++++++++++++++++++++++++++++++++----
1 file changed, 40 insertions(+), 4 deletions(-)
--
2.16.4
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/2] nvme: skip namespaces which are about to be removed
2019-08-01 7:16 [PATCH 0/2] skip removed namespaces Hannes Reinecke
@ 2019-08-01 7:16 ` Hannes Reinecke
2019-08-01 18:13 ` Nadolski, Edmund
` (2 more replies)
2019-08-01 7:16 ` [PATCH 2/2] nvme: do not remove namespaces during reset Hannes Reinecke
1 sibling, 3 replies; 11+ messages in thread
From: Hannes Reinecke @ 2019-08-01 7:16 UTC (permalink / raw)
nvme_ns_remove() will only remove the namespaces from the list at
the very last step, so we might run into situations where we iterate
over namespaces which are about to be deleted.
To avoid crashes we should be skipping all namespaces with the
NVME_NS_REMOVING flag set.
Signed-off-by: Hannes Reinecke <hare at suse.com>
---
drivers/nvme/host/core.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fcfff0a17a17..177fa4185775 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1303,9 +1303,12 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl)
struct nvme_ns *ns;
down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (test_bit(NVME_NS_REMOVING, &ns->flags))
+ continue;
if (ns->disk && nvme_revalidate_disk(ns->disk))
nvme_set_queue_dying(ns);
+ }
up_read(&ctrl->namespaces_rwsem);
nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
@@ -1698,6 +1701,10 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
+ /* if ns is removing we cannot mangle with the request queue */
+ if (test_bit(NVME_NS_REMOVING, &ns->flags))
+ return;
+
/*
* If identify namespace failed, use default 512 byte block size so
* block layer can use before failing read/write for 0 capacity.
@@ -2776,6 +2783,10 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
ret = -EINVAL;
goto out_unlock;
}
+ if (test_bit(NVME_NS_REMOVING, &ns->flags)) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
dev_warn(ctrl->device,
"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
@@ -3255,6 +3266,10 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
if (ns->head->ns_id == nsid) {
if (!kref_get_unless_zero(&ns->kref))
continue;
+ if (test_bit(NVME_NS_REMOVING, &ns->flags)) {
+ nvme_put_ns(ns);
+ continue;
+ }
ret = ns;
break;
}
@@ -3445,6 +3460,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
+ if (test_bit(NVME_NS_REMOVING, &ns->flags))
+ continue;
if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
list_move_tail(&ns->list, &rm_list);
}
--
2.16.4
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 2/2] nvme: do not remove namespaces during reset
2019-08-01 7:16 [PATCH 0/2] skip removed namespaces Hannes Reinecke
2019-08-01 7:16 ` [PATCH 1/2] nvme: skip namespaces which are about to be removed Hannes Reinecke
@ 2019-08-01 7:16 ` Hannes Reinecke
2019-08-01 18:15 ` Nadolski, Edmund
2019-08-01 18:42 ` Sagi Grimberg
1 sibling, 2 replies; 11+ messages in thread
From: Hannes Reinecke @ 2019-08-01 7:16 UTC (permalink / raw)
Whenever the controller is resetting or connecting we cannot make
any decisions about the attached namespaces, and consequently we
shouldn't remove them. So skip namespace removal during reset;
a scan will be triggered anyway after reconnecting which will
clean up any stale namespaces.
Signed-off-by: Hannes Reinecke <hare at suse.com>
---
drivers/nvme/host/core.c | 25 ++++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 177fa4185775..b24cf21f34d0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3410,7 +3410,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
return ret;
}
-static void nvme_ns_remove(struct nvme_ns *ns)
+static void __nvme_ns_remove(struct nvme_ns *ns)
{
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
@@ -3439,6 +3439,21 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_put_ns(ns);
}
+static void nvme_ns_remove(struct nvme_ns *ns)
+{
+ /*
+ * We cannot make any assumptions about namespaces during
+ * reset; in particular we shouldn't attempt to remove them
+ * as I/O might still be queued to them.
+ * So ignore this call during reset and rely on the
+ * rescan after reset to clean up things again.
+ */
+ if (ns->ctrl->state == NVME_CTRL_RESETTING ||
+ ns->ctrl->state == NVME_CTRL_CONNECTING)
+ return;
+ __nvme_ns_remove(ns);
+}
+
static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
@@ -3458,6 +3473,10 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
struct nvme_ns *ns, *next;
LIST_HEAD(rm_list);
+ if (ctrl->state == NVME_CTRL_RESETTING ||
+ ctrl->state == NVME_CTRL_CONNECTING)
+ return;
+
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
if (test_bit(NVME_NS_REMOVING, &ns->flags))
@@ -3468,7 +3487,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
up_write(&ctrl->namespaces_rwsem);
list_for_each_entry_safe(ns, next, &rm_list, list)
- nvme_ns_remove(ns);
+ __nvme_ns_remove(ns);
}
@@ -3618,7 +3637,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
up_write(&ctrl->namespaces_rwsem);
list_for_each_entry_safe(ns, next, &ns_list, list)
- nvme_ns_remove(ns);
+ __nvme_ns_remove(ns);
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
--
2.16.4
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 1/2] nvme: skip namespaces which are about to be removed
2019-08-01 7:16 ` [PATCH 1/2] nvme: skip namespaces which are about to be removed Hannes Reinecke
@ 2019-08-01 18:13 ` Nadolski, Edmund
2019-08-01 18:34 ` Sagi Grimberg
2019-08-01 21:36 ` Keith Busch
2 siblings, 0 replies; 11+ messages in thread
From: Nadolski, Edmund @ 2019-08-01 18:13 UTC (permalink / raw)
On 8/1/2019 1:16 AM, Hannes Reinecke wrote:
> nvme_ns_remove() will only remove the namespaces from the list at
> the very last step, so we might run into situations where we iterate
> over namespaces which are about to be deleted.
> To avoid crashes we should be skipping all namespaces with the
> NVME_NS_REMOVING flag set.
>
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
> drivers/nvme/host/core.c | 19 ++++++++++++++++++-
> 1 file changed, 18 insertions(+), 1 deletion(-)
>
LGTM, just a (probably lame) question: is the check also needed in
nvme_[start|stop|sync]_queues()?
Ed
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fcfff0a17a17..177fa4185775 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1303,9 +1303,12 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl)
> struct nvme_ns *ns;
>
> down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list)
> + list_for_each_entry(ns, &ctrl->namespaces, list) {
> + if (test_bit(NVME_NS_REMOVING, &ns->flags))
> + continue;
> if (ns->disk && nvme_revalidate_disk(ns->disk))
> nvme_set_queue_dying(ns);
> + }
> up_read(&ctrl->namespaces_rwsem);
>
> nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
> @@ -1698,6 +1701,10 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> {
> struct nvme_ns *ns = disk->private_data;
>
> + /* if ns is removing we cannot mangle with the request queue */
> + if (test_bit(NVME_NS_REMOVING, &ns->flags))
> + return;
> +
> /*
> * If identify namespace failed, use default 512 byte block size so
> * block layer can use before failing read/write for 0 capacity.
> @@ -2776,6 +2783,10 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
> ret = -EINVAL;
> goto out_unlock;
> }
> + if (test_bit(NVME_NS_REMOVING, &ns->flags)) {
> + ret = -ENODEV;
> + goto out_unlock;
> + }
>
> dev_warn(ctrl->device,
> "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
> @@ -3255,6 +3266,10 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> if (ns->head->ns_id == nsid) {
> if (!kref_get_unless_zero(&ns->kref))
> continue;
> + if (test_bit(NVME_NS_REMOVING, &ns->flags)) {
> + nvme_put_ns(ns);
> + continue;
> + }
> ret = ns;
> break;
> }
> @@ -3445,6 +3460,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>
> down_write(&ctrl->namespaces_rwsem);
> list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
> + if (test_bit(NVME_NS_REMOVING, &ns->flags))
> + continue;
> if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
> list_move_tail(&ns->list, &rm_list);
> }
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/2] nvme: do not remove namespaces during reset
2019-08-01 7:16 ` [PATCH 2/2] nvme: do not remove namespaces during reset Hannes Reinecke
@ 2019-08-01 18:15 ` Nadolski, Edmund
2019-08-01 18:42 ` Sagi Grimberg
1 sibling, 0 replies; 11+ messages in thread
From: Nadolski, Edmund @ 2019-08-01 18:15 UTC (permalink / raw)
On 8/1/2019 1:16 AM, Hannes Reinecke wrote:
> Whenever the controller is resetting or connecting we cannot make
> any decisions about the attached namespaces, and consequently we
> shouldn't remove them. So skip namespace removal during reset;
> a scan will be triggered anyway after reconnecting which will
> clean up any stale namespaces.
>
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
> drivers/nvme/host/core.c | 25 ++++++++++++++++++++++---
> 1 file changed, 22 insertions(+), 3 deletions(-)
LGTM.
Ed
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 177fa4185775..b24cf21f34d0 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3410,7 +3410,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> return ret;
> }
>
> -static void nvme_ns_remove(struct nvme_ns *ns)
> +static void __nvme_ns_remove(struct nvme_ns *ns)
> {
> if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
> return;
> @@ -3439,6 +3439,21 @@ static void nvme_ns_remove(struct nvme_ns *ns)
> nvme_put_ns(ns);
> }
>
> +static void nvme_ns_remove(struct nvme_ns *ns)
> +{
> + /*
> + * We cannot make any assumptions about namespaces during
> + * reset; in particular we shouldn't attempt to remove them
> + * as I/O might still be queued to them.
> + * So ignore this call during reset and rely on the
> + * rescan after reset to clean up things again.
> + */
> + if (ns->ctrl->state == NVME_CTRL_RESETTING ||
> + ns->ctrl->state == NVME_CTRL_CONNECTING)
> + return;
> + __nvme_ns_remove(ns);
> +}
> +
> static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> {
> struct nvme_ns *ns;
> @@ -3458,6 +3473,10 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
> struct nvme_ns *ns, *next;
> LIST_HEAD(rm_list);
>
> + if (ctrl->state == NVME_CTRL_RESETTING ||
> + ctrl->state == NVME_CTRL_CONNECTING)
> + return;
> +
> down_write(&ctrl->namespaces_rwsem);
> list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
> if (test_bit(NVME_NS_REMOVING, &ns->flags))
> @@ -3468,7 +3487,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
> up_write(&ctrl->namespaces_rwsem);
>
> list_for_each_entry_safe(ns, next, &rm_list, list)
> - nvme_ns_remove(ns);
> + __nvme_ns_remove(ns);
>
> }
>
> @@ -3618,7 +3637,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
> up_write(&ctrl->namespaces_rwsem);
>
> list_for_each_entry_safe(ns, next, &ns_list, list)
> - nvme_ns_remove(ns);
> + __nvme_ns_remove(ns);
> }
> EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
>
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/2] nvme: skip namespaces which are about to be removed
2019-08-01 7:16 ` [PATCH 1/2] nvme: skip namespaces which are about to be removed Hannes Reinecke
2019-08-01 18:13 ` Nadolski, Edmund
@ 2019-08-01 18:34 ` Sagi Grimberg
2019-08-01 21:36 ` Keith Busch
2 siblings, 0 replies; 11+ messages in thread
From: Sagi Grimberg @ 2019-08-01 18:34 UTC (permalink / raw)
> nvme_ns_remove() will only remove the namespaces from the list at
> the very last step, so we might run into situations where we iterate
> over namespaces which are about to be deleted.
> To avoid crashes we should be skipping all namespaces with the
> NVME_NS_REMOVING flag set.
Hannes,
Can you please attach the actual crash that this is fixing? I think
its much easier to review and provide feedback.
Also, I think that what I proposed is addressing what you describe here
but in a cleaner way. You only raised a concern about calling
blk_cleanup_queue on a quiesced request queue which should not be
a problem because nothing is blocking on blk_queue_enter so it will
make progress.
Did you attempt to apply my suggested patch? Or do you have a
reproducer?
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/2] nvme: do not remove namespaces during reset
2019-08-01 7:16 ` [PATCH 2/2] nvme: do not remove namespaces during reset Hannes Reinecke
2019-08-01 18:15 ` Nadolski, Edmund
@ 2019-08-01 18:42 ` Sagi Grimberg
1 sibling, 0 replies; 11+ messages in thread
From: Sagi Grimberg @ 2019-08-01 18:42 UTC (permalink / raw)
> Whenever the controller is resetting or connecting we cannot make
> any decisions about the attached namespaces, and consequently we
> shouldn't remove them. So skip namespace removal during reset;
> a scan will be triggered anyway after reconnecting which will
> clean up any stale namespaces.
I personally dislike making nvme_ns_remove() to semantically
mean nvme_ns_maybe_remove()..
Is there a fundamental why this check is not performed in the
call-site and is inside nvme_ns_remove()? Are there other
calls that you have seen to be problematic? if so which? and how?
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/2] nvme: skip namespaces which are about to be removed
2019-08-01 7:16 ` [PATCH 1/2] nvme: skip namespaces which are about to be removed Hannes Reinecke
2019-08-01 18:13 ` Nadolski, Edmund
2019-08-01 18:34 ` Sagi Grimberg
@ 2019-08-01 21:36 ` Keith Busch
2019-08-01 21:52 ` Sagi Grimberg
2 siblings, 1 reply; 11+ messages in thread
From: Keith Busch @ 2019-08-01 21:36 UTC (permalink / raw)
On Thu, Aug 01, 2019@12:16:43AM -0700, Hannes Reinecke wrote:
> nvme_ns_remove() will only remove the namespaces from the list at
> the very last step, so we might run into situations where we iterate
> over namespaces which are about to be deleted.
> To avoid crashes we should be skipping all namespaces with the
> NVME_NS_REMOVING flag set.
This all looks to be racing with whatever task is going to call
nvme_ns_remove().
Could we instead move these invalid namespaces off the ctrl->namespaces
list prior to calling nvme_ns_remove(), and while holding the write
lock? That way nothing can iterate the namespaces that we're deleting.
We already do that in some places, so that looks like it may be the safe
way to do this.
>
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
> drivers/nvme/host/core.c | 19 ++++++++++++++++++-
> 1 file changed, 18 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fcfff0a17a17..177fa4185775 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1303,9 +1303,12 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl)
> struct nvme_ns *ns;
>
> down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list)
> + list_for_each_entry(ns, &ctrl->namespaces, list) {
> + if (test_bit(NVME_NS_REMOVING, &ns->flags))
> + continue;
> if (ns->disk && nvme_revalidate_disk(ns->disk))
> nvme_set_queue_dying(ns);
> + }
> up_read(&ctrl->namespaces_rwsem);
>
> nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
> @@ -1698,6 +1701,10 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
> {
> struct nvme_ns *ns = disk->private_data;
>
> + /* if ns is removing we cannot mangle with the request queue */
> + if (test_bit(NVME_NS_REMOVING, &ns->flags))
> + return;
> +
> /*
> * If identify namespace failed, use default 512 byte block size so
> * block layer can use before failing read/write for 0 capacity.
> @@ -2776,6 +2783,10 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
> ret = -EINVAL;
> goto out_unlock;
> }
> + if (test_bit(NVME_NS_REMOVING, &ns->flags)) {
> + ret = -ENODEV;
> + goto out_unlock;
> + }
>
> dev_warn(ctrl->device,
> "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
> @@ -3255,6 +3266,10 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> if (ns->head->ns_id == nsid) {
> if (!kref_get_unless_zero(&ns->kref))
> continue;
> + if (test_bit(NVME_NS_REMOVING, &ns->flags)) {
> + nvme_put_ns(ns);
> + continue;
> + }
> ret = ns;
> break;
> }
> @@ -3445,6 +3460,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>
> down_write(&ctrl->namespaces_rwsem);
> list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
> + if (test_bit(NVME_NS_REMOVING, &ns->flags))
> + continue;
> if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
> list_move_tail(&ns->list, &rm_list);
> }
> --
> 2.16.4
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/2] nvme: skip namespaces which are about to be removed
2019-08-01 21:36 ` Keith Busch
@ 2019-08-01 21:52 ` Sagi Grimberg
2019-08-01 22:10 ` Keith Busch
0 siblings, 1 reply; 11+ messages in thread
From: Sagi Grimberg @ 2019-08-01 21:52 UTC (permalink / raw)
>> nvme_ns_remove() will only remove the namespaces from the list at
>> the very last step, so we might run into situations where we iterate
>> over namespaces which are about to be deleted.
>> To avoid crashes we should be skipping all namespaces with the
>> NVME_NS_REMOVING flag set.
>
> This all looks to be racing with whatever task is going to call
> nvme_ns_remove().
>
> Could we instead move these invalid namespaces off the ctrl->namespaces
> list prior to calling nvme_ns_remove(), and while holding the write
> lock? That way nothing can iterate the namespaces that we're deleting.
> We already do that in some places, so that looks like it may be the safe
> way to do this.
This is exactly what I proposed in:
[PATCH rfc 2/2] nvme: fix possible use-after-free condition when
controller reset is racing namespace scanning
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/2] nvme: skip namespaces which are about to be removed
2019-08-01 21:52 ` Sagi Grimberg
@ 2019-08-01 22:10 ` Keith Busch
2019-08-02 20:45 ` Sagi Grimberg
0 siblings, 1 reply; 11+ messages in thread
From: Keith Busch @ 2019-08-01 22:10 UTC (permalink / raw)
On Thu, Aug 01, 2019@02:52:37PM -0700, Sagi Grimberg wrote:
>
> > > nvme_ns_remove() will only remove the namespaces from the list at
> > > the very last step, so we might run into situations where we iterate
> > > over namespaces which are about to be deleted.
> > > To avoid crashes we should be skipping all namespaces with the
> > > NVME_NS_REMOVING flag set.
> >
> > This all looks to be racing with whatever task is going to call
> > nvme_ns_remove().
> >
> > Could we instead move these invalid namespaces off the ctrl->namespaces
> > list prior to calling nvme_ns_remove(), and while holding the write
> > lock? That way nothing can iterate the namespaces that we're deleting.
> > We already do that in some places, so that looks like it may be the safe
> > way to do this.
>
> This is exactly what I proposed in:
> [PATCH rfc 2/2] nvme: fix possible use-after-free condition when controller
> reset is racing namespace scanning
Hm, I had to look up why the list_del is done at then end. It is after
del_gendisk() because that syncs dirty buffers, which means we could have
IO that can timeout. We need the namespaces in the controller list during
removal so that timeout handlers can iterate them for cleanup. Otherwise
you could have some buffered write tasks constantly entering the queue,
preventing namespace removal. The only time should be safe to take the
namespace off list first is if we've set the queue to dying prior to
calling del_gendisk.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/2] nvme: skip namespaces which are about to be removed
2019-08-01 22:10 ` Keith Busch
@ 2019-08-02 20:45 ` Sagi Grimberg
0 siblings, 0 replies; 11+ messages in thread
From: Sagi Grimberg @ 2019-08-02 20:45 UTC (permalink / raw)
>> This is exactly what I proposed in:
>> [PATCH rfc 2/2] nvme: fix possible use-after-free condition when controller
>> reset is racing namespace scanning
>
> Hm, I had to look up why the list_del is done at then end. It is after
> del_gendisk() because that syncs dirty buffers, which means we could have
> IO that can timeout. We need the namespaces in the controller list during
> removal so that timeout handlers can iterate them for cleanup. Otherwise
> you could have some buffered write tasks constantly entering the queue,
> preventing namespace removal. The only time should be safe to take the
> namespace off list first is if we've set the queue to dying prior to
> calling del_gendisk.
Would this work?
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6956041224ec..c13cbdc262ee 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3418,19 +3418,21 @@ static void nvme_ns_remove(struct nvme_ns *ns)
synchronize_rcu(); /* guarantee not available in head->list */
nvme_mpath_clear_current_path(ns);
synchronize_srcu(&ns->head->srcu); /* wait for concurrent
submissions */
+ nvme_mpath_check_last_path(ns);
- if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
+ if (ns->disk && ns->disk->flags & GENHD_FL_UP)
del_gendisk(ns->disk);
- blk_cleanup_queue(ns->queue);
- if (blk_get_integrity(ns->disk))
- blk_integrity_unregister(ns->disk);
- }
down_write(&ns->ctrl->namespaces_rwsem);
list_del_init(&ns->list);
up_write(&ns->ctrl->namespaces_rwsem);
- nvme_mpath_check_last_path(ns);
+ if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
+ blk_cleanup_queue(ns->queue);
+ if (blk_get_integrity(ns->disk))
+ blk_integrity_unregister(ns->disk);
+ }
+
nvme_put_ns(ns);
}
--
^ permalink raw reply related [flat|nested] 11+ messages in thread
end of thread, other threads:[~2019-08-02 20:45 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-08-01 7:16 [PATCH 0/2] skip removed namespaces Hannes Reinecke
2019-08-01 7:16 ` [PATCH 1/2] nvme: skip namespaces which are about to be removed Hannes Reinecke
2019-08-01 18:13 ` Nadolski, Edmund
2019-08-01 18:34 ` Sagi Grimberg
2019-08-01 21:36 ` Keith Busch
2019-08-01 21:52 ` Sagi Grimberg
2019-08-01 22:10 ` Keith Busch
2019-08-02 20:45 ` Sagi Grimberg
2019-08-01 7:16 ` [PATCH 2/2] nvme: do not remove namespaces during reset Hannes Reinecke
2019-08-01 18:15 ` Nadolski, Edmund
2019-08-01 18:42 ` Sagi Grimberg
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox