Linux-NVME Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv2] nvme: use srcu for iterating namespace list
@ 2024-05-23 17:20 Keith Busch
  2024-05-24  4:41 ` Shinichiro Kawasaki
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Keith Busch @ 2024-05-23 17:20 UTC (permalink / raw)
  To: linux-nvme, shinichiro.kawasaki, hch, sagi; +Cc: axboe, Keith Busch

From: Keith Busch <kbusch@kernel.org>

The nvme pci driver synchronizes with all the namespace queues during a
reset to ensure that there's no pending timeout work.

Meanwhile the timeout work potentially iterates those same namespaces to
freeze their queues.

Each of those namespace iterations use the same read lock. If a write
lock should somehow get between the synchronize and freeze steps, then
forward progress is deadlocked.

We had been relying on the nvme controller state machine to ensure the
reset work wouldn't conflict with timeout work. That guarantee may be a
bit fragile to rely on, so iterate the namespace lists without taking a
lock to fix potential circular locks, as reported by lockdep.

Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/
Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v2:
  Improved changelog

 drivers/nvme/host/core.c      | 97 +++++++++++++++++++++--------------
 drivers/nvme/host/ioctl.c     | 12 ++---
 drivers/nvme/host/multipath.c | 21 ++++----
 drivers/nvme/host/nvme.h      |  3 +-
 4 files changed, 78 insertions(+), 55 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7706df2373494..b7cd46f3cef69 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns, *ret = NULL;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (ns->head->ns_id == nsid) {
 			if (!nvme_get_ns(ns))
 				continue;
@@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		if (ns->head->ns_id > nsid)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
@@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
 
 	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
 		if (tmp->head->ns_id < ns->head->ns_id) {
-			list_add(&ns->list, &tmp->list);
+			list_add_rcu(&ns->list, &tmp->list);
 			return;
 		}
 	}
@@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	if (nvme_update_ns_info(ns, info))
 		goto out_unlink_ns;
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	/*
 	 * Ensure that no namespaces are added to the ctrl list after the queues
 	 * are frozen, thereby avoiding a deadlock between scan and reset.
 	 */
 	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
-		up_write(&ctrl->namespaces_rwsem);
+		mutex_unlock(&ctrl->namespaces_lock);
 		goto out_unlink_ns;
 	}
 	nvme_ns_add_to_ctrl_list(ns);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 	nvme_get_ctrl(ctrl);
 
 	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
@@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 
  out_cleanup_ns_from_list:
 	nvme_put_ctrl(ctrl);
-	down_write(&ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
@@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
 	del_gendisk(ns->disk);
 
-	down_write(&ns->ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ns->ctrl->namespaces_rwsem);
+	mutex_lock(&ns->ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ns->ctrl->namespaces_lock);
+	synchronize_srcu(&ns->ctrl->srcu);
 
 	if (last_path)
 		nvme_mpath_shutdown_disk(ns->head);
@@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(rm_list);
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
 		if (ns->head->ns_id > nsid)
-			list_move_tail(&ns->list, &rm_list);
+			list_splice_init_rcu(&ns->list, &rm_list,
+					     synchronize_rcu);
 	}
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 
 	list_for_each_entry_safe(ns, next, &rm_list, list)
 		nvme_ns_remove(ns);
-
 }
 
 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	/* this is a no-op when called from the controller reset handler */
 	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
 
-	down_write(&ctrl->namespaces_rwsem);
-	list_splice_init(&ctrl->namespaces, &ns_list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 
 	list_for_each_entry_safe(ns, next, &ns_list, list)
 		nvme_ns_remove(ns);
@@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
 	key_put(ctrl->tls_key);
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
+	cleanup_srcu_struct(&ctrl->srcu);
 	nvme_auth_stop(ctrl);
 	nvme_auth_free(ctrl);
 	__free_page(ctrl->discard_page);
@@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	ctrl->passthru_err_log_enabled = false;
 	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
+	mutex_init(&ctrl->namespaces_lock);
+
+	ret = init_srcu_struct(&ctrl->srcu);
+	if (ret)
+		return ret;
+
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
 	xa_init(&ctrl->cels);
-	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
@@ -4697,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 out:
 	if (ctrl->discard_page)
 		__free_page(ctrl->discard_page);
+	cleanup_srcu_struct(&ctrl->srcu);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -4705,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mark_disk_dead(ns->disk);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
 
 void nvme_unfreeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_unfreeze_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
@@ -4728,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
 		if (timeout <= 0)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return timeout;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4743,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_freeze_queue_wait(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
 
 void nvme_start_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_freeze_queue_start(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
@@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_sync_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
 
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 499a8bb7cac7d..0f05058692b55 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -789,16 +789,16 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 		bool open_for_write)
 {
 	struct nvme_ns *ns;
-	int ret;
+	int ret, srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
 	if (list_empty(&ctrl->namespaces)) {
 		ret = -ENOTTY;
 		goto out_unlock;
 	}
 
-	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
-	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
+	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
+	if (!ns) {
 		dev_warn(ctrl->device,
 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
 		ret = -EINVAL;
@@ -808,14 +808,14 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 	dev_warn(ctrl->device,
 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
 	kref_get(&ns->kref);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 
 	ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
 	nvme_put_ns(ns);
 	return ret;
 
 out_unlock:
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1bee176fd850e..d8b6b4648eaff 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -151,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (!ns->head->disk)
 			continue;
 		kblockd_schedule_work(&ns->head->requeue_work);
 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 static const char *nvme_ana_state_names[] = {
@@ -194,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		nvme_mpath_clear_current_path(ns);
 		kblockd_schedule_work(&ns->head->requeue_work);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -681,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
 	unsigned *nr_change_groups = data;
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 			le32_to_cpu(desc->grpid),
@@ -692,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	if (!nr_nsids)
 		return 0;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		unsigned nsid;
 again:
 		nsid = le32_to_cpu(desc->nsids[n]);
@@ -706,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 		if (ns->head->ns_id > nsid)
 			goto again;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return 0;
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fc31bd340a63a..a005941a8b67e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -282,7 +282,8 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
-	struct rw_semaphore namespaces_rwsem;
+	struct mutex namespaces_lock;
+	struct srcu_struct srcu;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
 #ifdef CONFIG_NVME_HWMON
-- 
2.43.0



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCHv2] nvme: use srcu for iterating namespace list
  2024-05-23 17:20 [PATCHv2] nvme: use srcu for iterating namespace list Keith Busch
@ 2024-05-24  4:41 ` Shinichiro Kawasaki
  2024-05-24 15:29   ` Sagi Grimberg
  2024-05-24  7:42 ` Christoph Hellwig
  2024-05-24 15:28 ` Sagi Grimberg
  2 siblings, 1 reply; 6+ messages in thread
From: Shinichiro Kawasaki @ 2024-05-24  4:41 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-nvme@lists.infradead.org, hch@lst.de, sagi@grimberg.me,
	axboe@kernel.dk, Keith Busch

On May 23, 2024 / 10:20, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The nvme pci driver synchronizes with all the namespace queues during a
> reset to ensure that there's no pending timeout work.
> 
> Meanwhile the timeout work potentially iterates those same namespaces to
> freeze their queues.
> 
> Each of those namespace iterations use the same read lock. If a write
> lock should somehow get between the synchronize and freeze steps, then
> forward progress is deadlocked.
> 
> We had been relying on the nvme controller state machine to ensure the
> reset work wouldn't conflict with timeout work. That guarantee may be a
> bit fragile to rely on, so iterate the namespace lists without taking a
> lock to fix potential circular locks, as reported by lockdep.
> 
> Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/
> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Keith Busch <kbusch@kernel.org>

Keith, thank you very much for the patch. Sagi, Christoph, thank you for the
discussion.

I confirmed this patch avoids the lockdep WARN that I reported. I also ran my
test set with the patch and observed no regression. Looks good.

Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCHv2] nvme: use srcu for iterating namespace list
  2024-05-23 17:20 [PATCHv2] nvme: use srcu for iterating namespace list Keith Busch
  2024-05-24  4:41 ` Shinichiro Kawasaki
@ 2024-05-24  7:42 ` Christoph Hellwig
  2024-05-24 15:28 ` Sagi Grimberg
  2 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2024-05-24  7:42 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-nvme, shinichiro.kawasaki, hch, sagi, axboe, Keith Busch

> -	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
> -	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
> +	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
> +	if (!ns) {
>  		dev_warn(ctrl->device,
>  			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
>  		ret = -EINVAL;
> @@ -808,14 +808,14 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
>  	dev_warn(ctrl->device,
>  		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
>  	kref_get(&ns->kref);

This still needs to use a kref_geT_unless_zero as mentioned in the last
round.



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCHv2] nvme: use srcu for iterating namespace list
  2024-05-23 17:20 [PATCHv2] nvme: use srcu for iterating namespace list Keith Busch
  2024-05-24  4:41 ` Shinichiro Kawasaki
  2024-05-24  7:42 ` Christoph Hellwig
@ 2024-05-24 15:28 ` Sagi Grimberg
  2 siblings, 0 replies; 6+ messages in thread
From: Sagi Grimberg @ 2024-05-24 15:28 UTC (permalink / raw)
  To: Keith Busch, linux-nvme, shinichiro.kawasaki, hch; +Cc: axboe, Keith Busch



On 23/05/2024 20:20, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> The nvme pci driver synchronizes with all the namespace queues during a
> reset to ensure that there's no pending timeout work.
>
> Meanwhile the timeout work potentially iterates those same namespaces to
> freeze their queues.
>
> Each of those namespace iterations use the same read lock. If a write
> lock should somehow get between the synchronize and freeze steps, then
> forward progress is deadlocked.
>
> We had been relying on the nvme controller state machine to ensure the
> reset work wouldn't conflict with timeout work. That guarantee may be a
> bit fragile to rely on, so iterate the namespace lists without taking a
> lock to fix potential circular locks, as reported by lockdep.
>
> Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/
> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> v2:
>    Improved changelog
>
>   drivers/nvme/host/core.c      | 97 +++++++++++++++++++++--------------
>   drivers/nvme/host/ioctl.c     | 12 ++---
>   drivers/nvme/host/multipath.c | 21 ++++----
>   drivers/nvme/host/nvme.h      |  3 +-
>   4 files changed, 78 insertions(+), 55 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 7706df2373494..b7cd46f3cef69 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
>   struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>   {
>   	struct nvme_ns *ns, *ret = NULL;
> +	int srcu_idx;
>   
> -	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list) {
> +	srcu_idx = srcu_read_lock(&ctrl->srcu);
> +	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
>   		if (ns->head->ns_id == nsid) {
>   			if (!nvme_get_ns(ns))
>   				continue;
> @@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>   		if (ns->head->ns_id > nsid)
>   			break;
>   	}
> -	up_read(&ctrl->namespaces_rwsem);
> +	srcu_read_unlock(&ctrl->srcu, srcu_idx);
>   	return ret;
>   }
>   EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
> @@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
>   
>   	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
>   		if (tmp->head->ns_id < ns->head->ns_id) {
> -			list_add(&ns->list, &tmp->list);
> +			list_add_rcu(&ns->list, &tmp->list);
>   			return;
>   		}
>   	}
> @@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
>   	if (nvme_update_ns_info(ns, info))
>   		goto out_unlink_ns;
>   
> -	down_write(&ctrl->namespaces_rwsem);
> +	mutex_lock(&ctrl->namespaces_lock);
>   	/*
>   	 * Ensure that no namespaces are added to the ctrl list after the queues
>   	 * are frozen, thereby avoiding a deadlock between scan and reset.
>   	 */
>   	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
> -		up_write(&ctrl->namespaces_rwsem);
> +		mutex_unlock(&ctrl->namespaces_lock);
>   		goto out_unlink_ns;
>   	}
>   	nvme_ns_add_to_ctrl_list(ns);
> -	up_write(&ctrl->namespaces_rwsem);
> +	mutex_unlock(&ctrl->namespaces_lock);
> +	synchronize_srcu(&ctrl->srcu);
>   	nvme_get_ctrl(ctrl);
>   
>   	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
> @@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
>   
>    out_cleanup_ns_from_list:
>   	nvme_put_ctrl(ctrl);
> -	down_write(&ctrl->namespaces_rwsem);
> -	list_del_init(&ns->list);
> -	up_write(&ctrl->namespaces_rwsem);
> +	mutex_lock(&ctrl->namespaces_lock);
> +	list_del_rcu(&ns->list);
> +	mutex_unlock(&ctrl->namespaces_lock);
> +	synchronize_srcu(&ctrl->srcu);
>    out_unlink_ns:
>   	mutex_lock(&ctrl->subsys->lock);
>   	list_del_rcu(&ns->siblings);
> @@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>   		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
>   	del_gendisk(ns->disk);
>   
> -	down_write(&ns->ctrl->namespaces_rwsem);
> -	list_del_init(&ns->list);
> -	up_write(&ns->ctrl->namespaces_rwsem);
> +	mutex_lock(&ns->ctrl->namespaces_lock);
> +	list_del_rcu(&ns->list);
> +	mutex_unlock(&ns->ctrl->namespaces_lock);
> +	synchronize_srcu(&ns->ctrl->srcu);
>   
>   	if (last_path)
>   		nvme_mpath_shutdown_disk(ns->head);
> @@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
>   	struct nvme_ns *ns, *next;
>   	LIST_HEAD(rm_list);
>   
> -	down_write(&ctrl->namespaces_rwsem);
> +	mutex_lock(&ctrl->namespaces_lock);
>   	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
>   		if (ns->head->ns_id > nsid)
> -			list_move_tail(&ns->list, &rm_list);
> +			list_splice_init_rcu(&ns->list, &rm_list,
> +					     synchronize_rcu);
>   	}
> -	up_write(&ctrl->namespaces_rwsem);
> +	mutex_unlock(&ctrl->namespaces_lock);
> +	synchronize_srcu(&ctrl->srcu);
>   
>   	list_for_each_entry_safe(ns, next, &rm_list, list)
>   		nvme_ns_remove(ns);
> -
>   }
>   
>   static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
> @@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
>   	/* this is a no-op when called from the controller reset handler */
>   	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
>   
> -	down_write(&ctrl->namespaces_rwsem);
> -	list_splice_init(&ctrl->namespaces, &ns_list);
> -	up_write(&ctrl->namespaces_rwsem);
> +	mutex_lock(&ctrl->namespaces_lock);
> +	list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
> +	mutex_unlock(&ctrl->namespaces_lock);
> +	synchronize_srcu(&ctrl->srcu);
>   
>   	list_for_each_entry_safe(ns, next, &ns_list, list)
>   		nvme_ns_remove(ns);
> @@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
>   	key_put(ctrl->tls_key);
>   	nvme_free_cels(ctrl);
>   	nvme_mpath_uninit(ctrl);
> +	cleanup_srcu_struct(&ctrl->srcu);
>   	nvme_auth_stop(ctrl);
>   	nvme_auth_free(ctrl);
>   	__free_page(ctrl->discard_page);
> @@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>   	ctrl->passthru_err_log_enabled = false;
>   	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
>   	spin_lock_init(&ctrl->lock);
> +	mutex_init(&ctrl->namespaces_lock);
> +
> +	ret = init_srcu_struct(&ctrl->srcu);
> +	if (ret)
> +		return ret;
> +
>   	mutex_init(&ctrl->scan_lock);
>   	INIT_LIST_HEAD(&ctrl->namespaces);
>   	xa_init(&ctrl->cels);
> -	init_rwsem(&ctrl->namespaces_rwsem);
>   	ctrl->dev = dev;
>   	ctrl->ops = ops;
>   	ctrl->quirks = quirks;
> @@ -4697,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>   out:
>   	if (ctrl->discard_page)
>   		__free_page(ctrl->discard_page);
> +	cleanup_srcu_struct(&ctrl->srcu);
>   	return ret;
>   }
>   EXPORT_SYMBOL_GPL(nvme_init_ctrl);
> @@ -4705,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
>   void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
>   {
>   	struct nvme_ns *ns;
> +	int srcu_idx;
>   
> -	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list)
> +	srcu_idx = srcu_read_lock(&ctrl->srcu);
> +	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
>   		blk_mark_disk_dead(ns->disk);
> -	up_read(&ctrl->namespaces_rwsem);
> +	srcu_read_unlock(&ctrl->srcu, srcu_idx);
>   }
>   EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
>   
>   void nvme_unfreeze(struct nvme_ctrl *ctrl)
>   {
>   	struct nvme_ns *ns;
> +	int srcu_idx;
>   
> -	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list)
> +	srcu_idx = srcu_read_lock(&ctrl->srcu);
> +	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
>   		blk_mq_unfreeze_queue(ns->queue);
> -	up_read(&ctrl->namespaces_rwsem);
> +	srcu_read_unlock(&ctrl->srcu, srcu_idx);
>   	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
>   }
>   EXPORT_SYMBOL_GPL(nvme_unfreeze);
> @@ -4728,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
>   int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
>   {
>   	struct nvme_ns *ns;
> +	int srcu_idx;
>   
> -	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list) {
> +	srcu_idx = srcu_read_lock(&ctrl->srcu);
> +	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
>   		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
>   		if (timeout <= 0)
>   			break;
>   	}
> -	up_read(&ctrl->namespaces_rwsem);
> +	srcu_read_unlock(&ctrl->srcu, srcu_idx);
>   	return timeout;
>   }
>   EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
> @@ -4743,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
>   void nvme_wait_freeze(struct nvme_ctrl *ctrl)
>   {
>   	struct nvme_ns *ns;
> +	int srcu_idx;
>   
> -	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list)
> +	srcu_idx = srcu_read_lock(&ctrl->srcu);
> +	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
>   		blk_mq_freeze_queue_wait(ns->queue);
> -	up_read(&ctrl->namespaces_rwsem);
> +	srcu_read_unlock(&ctrl->srcu, srcu_idx);
>   }
>   EXPORT_SYMBOL_GPL(nvme_wait_freeze);
>   
>   void nvme_start_freeze(struct nvme_ctrl *ctrl)
>   {
>   	struct nvme_ns *ns;
> +	int srcu_idx;
>   
>   	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
> -	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list)
> +	srcu_idx = srcu_read_lock(&ctrl->srcu);
> +	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
>   		blk_freeze_queue_start(ns->queue);
> -	up_read(&ctrl->namespaces_rwsem);
> +	srcu_read_unlock(&ctrl->srcu, srcu_idx);
>   }
>   EXPORT_SYMBOL_GPL(nvme_start_freeze);
>   
> @@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
>   void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
>   {
>   	struct nvme_ns *ns;
> +	int srcu_idx;
>   
> -	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list)
> +	srcu_idx = srcu_read_lock(&ctrl->srcu);
> +	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
>   		blk_sync_queue(ns->queue);
> -	up_read(&ctrl->namespaces_rwsem);
> +	srcu_read_unlock(&ctrl->srcu, srcu_idx);
>   }
>   EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
>   
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index 499a8bb7cac7d..0f05058692b55 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -789,16 +789,16 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
>   		bool open_for_write)
>   {
>   	struct nvme_ns *ns;
> -	int ret;
> +	int ret, srcu_idx;
>   
> -	down_read(&ctrl->namespaces_rwsem);
> +	srcu_idx = srcu_read_lock(&ctrl->srcu);
>   	if (list_empty(&ctrl->namespaces)) {
>   		ret = -ENOTTY;
>   		goto out_unlock;
>   	}
>   
> -	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
> -	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
> +	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
> +	if (!ns) {

Hmm, this seems like a functional change. the error suggest that the 
list must
contain only a single member to execute. Not sure why though. But this looks
like the change disagrees with the assumption.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCHv2] nvme: use srcu for iterating namespace list
  2024-05-24  4:41 ` Shinichiro Kawasaki
@ 2024-05-24 15:29   ` Sagi Grimberg
  2024-05-27  0:33     ` Shinichiro Kawasaki
  0 siblings, 1 reply; 6+ messages in thread
From: Sagi Grimberg @ 2024-05-24 15:29 UTC (permalink / raw)
  To: Shinichiro Kawasaki, Keith Busch
  Cc: linux-nvme@lists.infradead.org, hch@lst.de, axboe@kernel.dk,
	Keith Busch



On 24/05/2024 7:41, Shinichiro Kawasaki wrote:
> On May 23, 2024 / 10:20, Keith Busch wrote:
>> From: Keith Busch <kbusch@kernel.org>
>>
>> The nvme pci driver synchronizes with all the namespace queues during a
>> reset to ensure that there's no pending timeout work.
>>
>> Meanwhile the timeout work potentially iterates those same namespaces to
>> freeze their queues.
>>
>> Each of those namespace iterations use the same read lock. If a write
>> lock should somehow get between the synchronize and freeze steps, then
>> forward progress is deadlocked.
>>
>> We had been relying on the nvme controller state machine to ensure the
>> reset work wouldn't conflict with timeout work. That guarantee may be a
>> bit fragile to rely on, so iterate the namespace lists without taking a
>> lock to fix potential circular locks, as reported by lockdep.
>>
>> Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/
>> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
>> Signed-off-by: Keith Busch <kbusch@kernel.org>
> Keith, thank you very much for the patch. Sagi, Christoph, thank you for the
> discussion.
>
> I confirmed this patch avoids the lockdep WARN that I reported. I also ran my
> test set with the patch and observed no regression. Looks good.
>
> Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>

I'm assuming that all blktests ran with this change? including fabrics 
and permutations?


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCHv2] nvme: use srcu for iterating namespace list
  2024-05-24 15:29   ` Sagi Grimberg
@ 2024-05-27  0:33     ` Shinichiro Kawasaki
  0 siblings, 0 replies; 6+ messages in thread
From: Shinichiro Kawasaki @ 2024-05-27  0:33 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Keith Busch, linux-nvme@lists.infradead.org, hch@lst.de,
	axboe@kernel.dk, Keith Busch

On May 24, 2024 / 18:29, Sagi Grimberg wrote:
> 
> 
> On 24/05/2024 7:41, Shinichiro Kawasaki wrote:
> > On May 23, 2024 / 10:20, Keith Busch wrote:
> > > From: Keith Busch <kbusch@kernel.org>
> > > 
> > > The nvme pci driver synchronizes with all the namespace queues during a
> > > reset to ensure that there's no pending timeout work.
> > > 
> > > Meanwhile the timeout work potentially iterates those same namespaces to
> > > freeze their queues.
> > > 
> > > Each of those namespace iterations use the same read lock. If a write
> > > lock should somehow get between the synchronize and freeze steps, then
> > > forward progress is deadlocked.
> > > 
> > > We had been relying on the nvme controller state machine to ensure the
> > > reset work wouldn't conflict with timeout work. That guarantee may be a
> > > bit fragile to rely on, so iterate the namespace lists without taking a
> > > lock to fix potential circular locks, as reported by lockdep.
> > > 
> > > Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/
> > > Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > > Signed-off-by: Keith Busch <kbusch@kernel.org>
> > Keith, thank you very much for the patch. Sagi, Christoph, thank you for the
> > discussion.
> > 
> > I confirmed this patch avoids the lockdep WARN that I reported. I also ran my
> > test set with the patch and observed no regression. Looks good.
> > 
> > Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> 
> I'm assuming that all blktests ran with this change? including fabrics and
> permutations?

Yes. To be precise, I ran with following conditions:

- target groups: block dm loop nbd nvme scsi srp ublk zbd
- NVMET_TRTYPES="loop rdma tcp fc"
- NVMET_BLKDEV_TYPES: default "device file"
- TEST_DEVS: QEMU nvme device

I observed some failures but they were all known failures.
Keith sent out the v3 patch. I will run my test set again.


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-05-27  0:33 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-05-23 17:20 [PATCHv2] nvme: use srcu for iterating namespace list Keith Busch
2024-05-24  4:41 ` Shinichiro Kawasaki
2024-05-24 15:29   ` Sagi Grimberg
2024-05-27  0:33     ` Shinichiro Kawasaki
2024-05-24  7:42 ` Christoph Hellwig
2024-05-24 15:28 ` Sagi Grimberg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox