linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex
@ 2025-02-09 12:20 Ming Lei
  2025-02-09 12:20 ` [PATCH 1/7] block: remove hctx->debugfs_dir Ming Lei
                   ` (7 more replies)
  0 siblings, 8 replies; 21+ messages in thread
From: Ming Lei @ 2025-02-09 12:20 UTC (permalink / raw)
  To: Jens Axboe, linux-block
  Cc: Christoph Hellwig, Nilay Shroff, Shinichiro Kawasaki, Ming Lei

Hi,

The first 6 patches remove all kinds of debugfs dir entries from the block
layer, and instead always retrieve them from debugfs, avoiding the need to
maintain block-internal debugfs state.

The 7th patch removes debugfs_mutex for adding/removing debugfs entries
because the protection is no longer needed, which also allows one lockdep
warning to be fixed.


Ming Lei (7):
  block: remove hctx->debugfs_dir
  block: remove hctx->sched_debugfs_dir
  block: remove q->sched_debugfs_dir
  block: remove q->rqos_debugfs_dir
  block: remove rqos->debugfs_dir
  block: remove q->debugfs_dir
  block: don't grab q->debugfs_mutex

 block/blk-mq-debugfs.c  | 179 +++++++++++++++++++++++++++++-----------
 block/blk-mq-sched.c    |   8 --
 block/blk-rq-qos.c      |   7 +-
 block/blk-rq-qos.h      |   3 -
 block/blk-sysfs.c       |  13 +--
 include/linux/blk-mq.h  |  10 ---
 include/linux/blkdev.h  |   3 -
 kernel/trace/blktrace.c |  27 +++++-
 8 files changed, 158 insertions(+), 92 deletions(-)

-- 
2.47.0


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/7] block: remove hctx->debugfs_dir
  2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
@ 2025-02-09 12:20 ` Ming Lei
  2025-02-13  6:41   ` Christoph Hellwig
  2025-02-09 12:20 ` [PATCH 2/7] block: remove hctx->sched_debugfs_dir Ming Lei
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Ming Lei @ 2025-02-09 12:20 UTC (permalink / raw)
  To: Jens Axboe, linux-block
  Cc: Christoph Hellwig, Nilay Shroff, Shinichiro Kawasaki, Ming Lei

For each hctx, its debugfs path is fixed, which can be queried from
its parent dentry and hctx queue num, so it isn't necessary to cache
it in hctx structure because it isn't used in fast path.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c | 47 ++++++++++++++++++++++++++++++++----------
 include/linux/blk-mq.h |  5 -----
 2 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index adf5f0697b6b..16260bba4d11 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -633,8 +633,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 
 	/* Similarly, blk_mq_init_hctx() couldn't do this previously. */
 	queue_for_each_hw_ctx(q, hctx, i) {
-		if (!hctx->debugfs_dir)
-			blk_mq_debugfs_register_hctx(q, hctx);
+		blk_mq_debugfs_register_hctx(q, hctx);
 		if (q->elevator && !hctx->sched_debugfs_dir)
 			blk_mq_debugfs_register_sched_hctx(q, hctx);
 	}
@@ -649,14 +648,28 @@ void blk_mq_debugfs_register(struct request_queue *q)
 	}
 }
 
+static __must_check struct dentry *blk_mq_get_hctx_entry(
+		struct blk_mq_hw_ctx *hctx)
+{
+	char name[20];
+
+	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
+	return debugfs_lookup(name, hctx->queue->debugfs_dir);
+}
+
 static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_ctx *ctx)
 {
+	struct dentry *hctx_dir = blk_mq_get_hctx_entry(hctx);
 	struct dentry *ctx_dir;
 	char name[20];
 
+	if (IS_ERR_OR_NULL(hctx_dir))
+		return;
+
 	snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
-	ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir);
+	ctx_dir = debugfs_create_dir(name, hctx_dir);
+	dput(hctx_dir);
 
 	debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs);
 }
@@ -664,6 +677,7 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
 void blk_mq_debugfs_register_hctx(struct request_queue *q,
 				  struct blk_mq_hw_ctx *hctx)
 {
+	struct dentry *hctx_dir;
 	struct blk_mq_ctx *ctx;
 	char name[20];
 	int i;
@@ -672,9 +686,11 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 		return;
 
 	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
-	hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
+	hctx_dir = debugfs_create_dir(name, q->debugfs_dir);
+	if (IS_ERR_OR_NULL(hctx_dir))
+		return;
 
-	debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs);
+	debugfs_create_files(hctx_dir, hctx, blk_mq_debugfs_hctx_attrs);
 
 	hctx_for_each_ctx(hctx, ctx, i)
 		blk_mq_debugfs_register_ctx(hctx, ctx);
@@ -682,11 +698,18 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 
 void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
 {
+	struct dentry *hctx_dir;
+
 	if (!hctx->queue->debugfs_dir)
 		return;
-	debugfs_remove_recursive(hctx->debugfs_dir);
+
+	hctx_dir = blk_mq_get_hctx_entry(hctx);
+	if (IS_ERR_OR_NULL(hctx_dir))
+		return;
+
+	debugfs_remove_recursive(hctx_dir);
 	hctx->sched_debugfs_dir = NULL;
-	hctx->debugfs_dir = NULL;
+	dput(hctx_dir);
 }
 
 void blk_mq_debugfs_register_hctxs(struct request_queue *q)
@@ -780,6 +803,7 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 					struct blk_mq_hw_ctx *hctx)
 {
+	struct dentry *hctx_dir = blk_mq_get_hctx_entry(hctx);
 	struct elevator_type *e = q->elevator->type;
 
 	lockdep_assert_held(&q->debugfs_mutex);
@@ -789,16 +813,17 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 	 * We will be called again later on with appropriate parent debugfs
 	 * directory from blk_register_queue()
 	 */
-	if (!hctx->debugfs_dir)
+	if (IS_ERR_OR_NULL(hctx_dir))
 		return;
 
 	if (!e->hctx_debugfs_attrs)
-		return;
+		goto exit;
 
-	hctx->sched_debugfs_dir = debugfs_create_dir("sched",
-						     hctx->debugfs_dir);
+	hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx_dir);
 	debugfs_create_files(hctx->sched_debugfs_dir, hctx,
 			     e->hctx_debugfs_attrs);
+exit:
+	dput(hctx_dir);
 }
 
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9ebb53f031cd..8c8682491403 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -427,11 +427,6 @@ struct blk_mq_hw_ctx {
 	struct kobject		kobj;
 
 #ifdef CONFIG_BLK_DEBUG_FS
-	/**
-	 * @debugfs_dir: debugfs directory for this hardware queue. Named
-	 * as cpu<cpu_number>.
-	 */
-	struct dentry		*debugfs_dir;
 	/** @sched_debugfs_dir:	debugfs directory for the scheduler. */
 	struct dentry		*sched_debugfs_dir;
 #endif
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/7] block: remove hctx->sched_debugfs_dir
  2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
  2025-02-09 12:20 ` [PATCH 1/7] block: remove hctx->debugfs_dir Ming Lei
@ 2025-02-09 12:20 ` Ming Lei
  2025-02-13  6:42   ` Christoph Hellwig
  2025-02-09 12:20 ` [PATCH 3/7] block: remove q->sched_debugfs_dir Ming Lei
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 21+ messages in thread
From: Ming Lei @ 2025-02-09 12:20 UTC (permalink / raw)
  To: Jens Axboe, linux-block
  Cc: Christoph Hellwig, Nilay Shroff, Shinichiro Kawasaki, Ming Lei

For each hctx, its sched debugfs path is fixed, which can be queried from
its parent dentry directly, so it isn't necessary to cache it in hctx
instance because it isn't used in fast path.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c | 31 ++++++++++++++++++++++++-------
 include/linux/blk-mq.h |  5 -----
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 16260bba4d11..3abb38ea2577 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -634,7 +634,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 	/* Similarly, blk_mq_init_hctx() couldn't do this previously. */
 	queue_for_each_hw_ctx(q, hctx, i) {
 		blk_mq_debugfs_register_hctx(q, hctx);
-		if (q->elevator && !hctx->sched_debugfs_dir)
+		if (q->elevator)
 			blk_mq_debugfs_register_sched_hctx(q, hctx);
 	}
 
@@ -657,6 +657,19 @@ static __must_check struct dentry *blk_mq_get_hctx_entry(
 	return debugfs_lookup(name, hctx->queue->debugfs_dir);
 }
 
+static __must_check struct dentry *blk_mq_get_hctx_sched_entry(
+		struct blk_mq_hw_ctx *hctx)
+{
+	struct dentry *hctx_dir = blk_mq_get_hctx_entry(hctx);
+	struct dentry *sched_dir = NULL;
+
+	if (hctx_dir) {
+		sched_dir = debugfs_lookup("sched", hctx_dir);
+		dput(hctx_dir);
+	}
+	return sched_dir;
+}
+
 static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_ctx *ctx)
 {
@@ -708,7 +721,6 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
 		return;
 
 	debugfs_remove_recursive(hctx_dir);
-	hctx->sched_debugfs_dir = NULL;
 	dput(hctx_dir);
 }
 
@@ -805,6 +817,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 {
 	struct dentry *hctx_dir = blk_mq_get_hctx_entry(hctx);
 	struct elevator_type *e = q->elevator->type;
+	struct dentry *sched_dir;
 
 	lockdep_assert_held(&q->debugfs_mutex);
 
@@ -819,19 +832,23 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 	if (!e->hctx_debugfs_attrs)
 		goto exit;
 
-	hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx_dir);
-	debugfs_create_files(hctx->sched_debugfs_dir, hctx,
-			     e->hctx_debugfs_attrs);
+	sched_dir = debugfs_create_dir("sched", hctx_dir);
+	debugfs_create_files(sched_dir, hctx, e->hctx_debugfs_attrs);
 exit:
 	dput(hctx_dir);
 }
 
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 {
+	struct dentry *sched_dir;
+
 	lockdep_assert_held(&hctx->queue->debugfs_mutex);
 
 	if (!hctx->queue->debugfs_dir)
 		return;
-	debugfs_remove_recursive(hctx->sched_debugfs_dir);
-	hctx->sched_debugfs_dir = NULL;
+	sched_dir = blk_mq_get_hctx_sched_entry(hctx);
+	if (sched_dir) {
+		debugfs_remove_recursive(sched_dir);
+		dput(sched_dir);
+	}
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8c8682491403..965aeea75ddd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -426,11 +426,6 @@ struct blk_mq_hw_ctx {
 	/** @kobj: Kernel object for sysfs. */
 	struct kobject		kobj;
 
-#ifdef CONFIG_BLK_DEBUG_FS
-	/** @sched_debugfs_dir:	debugfs directory for the scheduler. */
-	struct dentry		*sched_debugfs_dir;
-#endif
-
 	/**
 	 * @hctx_list: if this hctx is not in use, this is an entry in
 	 * q->unused_hctx_list.
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/7] block: remove q->sched_debugfs_dir
  2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
  2025-02-09 12:20 ` [PATCH 1/7] block: remove hctx->debugfs_dir Ming Lei
  2025-02-09 12:20 ` [PATCH 2/7] block: remove hctx->sched_debugfs_dir Ming Lei
@ 2025-02-09 12:20 ` Ming Lei
  2025-02-09 12:20 ` [PATCH 4/7] block: remove q->rqos_debugfs_dir Ming Lei
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Ming Lei @ 2025-02-09 12:20 UTC (permalink / raw)
  To: Jens Axboe, linux-block
  Cc: Christoph Hellwig, Nilay Shroff, Shinichiro Kawasaki, Ming Lei

For each request_queue, its sched debugfs path is fixed, which can be
queried from its parent dentry directly, so it isn't necessary to cache
it in request_queue instance because it isn't used in fast path.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c | 10 +++++-----
 block/blk-sysfs.c      |  1 -
 include/linux/blkdev.h |  1 -
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3abb38ea2577..3d3346fc4c4e 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -628,7 +628,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 	 * didn't exist yet (because we don't know what to name the directory
 	 * until the queue is registered to a gendisk).
 	 */
-	if (q->elevator && !q->sched_debugfs_dir)
+	if (q->elevator)
 		blk_mq_debugfs_register_sched(q);
 
 	/* Similarly, blk_mq_init_hctx() couldn't do this previously. */
@@ -745,6 +745,7 @@ void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
 void blk_mq_debugfs_register_sched(struct request_queue *q)
 {
 	struct elevator_type *e = q->elevator->type;
+	struct dentry *sched_dir;
 
 	lockdep_assert_held(&q->debugfs_mutex);
 
@@ -758,17 +759,16 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
 	if (!e->queue_debugfs_attrs)
 		return;
 
-	q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir);
+	sched_dir = debugfs_create_dir("sched", q->debugfs_dir);
 
-	debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
+	debugfs_create_files(sched_dir, q, e->queue_debugfs_attrs);
 }
 
 void blk_mq_debugfs_unregister_sched(struct request_queue *q)
 {
 	lockdep_assert_held(&q->debugfs_mutex);
 
-	debugfs_remove_recursive(q->sched_debugfs_dir);
-	q->sched_debugfs_dir = NULL;
+	debugfs_lookup_and_remove("sched", q->debugfs_dir);
 }
 
 static const char *rq_qos_id_to_name(enum rq_qos_id id)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 6f548a4376aa..35d92ed10fdb 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -748,7 +748,6 @@ static void blk_debugfs_remove(struct gendisk *disk)
 	blk_trace_shutdown(q);
 	debugfs_remove_recursive(q->debugfs_dir);
 	q->debugfs_dir = NULL;
-	q->sched_debugfs_dir = NULL;
 	q->rqos_debugfs_dir = NULL;
 	mutex_unlock(&q->debugfs_mutex);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 248416ecd01c..5efe6f00d86f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -598,7 +598,6 @@ struct request_queue {
 	struct list_head	tag_set_list;
 
 	struct dentry		*debugfs_dir;
-	struct dentry		*sched_debugfs_dir;
 	struct dentry		*rqos_debugfs_dir;
 	/*
 	 * Serializes all debugfs metadata operations using the above dentries.
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 4/7] block: remove q->rqos_debugfs_dir
  2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
                   ` (2 preceding siblings ...)
  2025-02-09 12:20 ` [PATCH 3/7] block: remove q->sched_debugfs_dir Ming Lei
@ 2025-02-09 12:20 ` Ming Lei
  2025-02-09 12:20 ` [PATCH 5/7] block: remove rqos->debugfs_dir Ming Lei
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Ming Lei @ 2025-02-09 12:20 UTC (permalink / raw)
  To: Jens Axboe, linux-block
  Cc: Christoph Hellwig, Nilay Shroff, Shinichiro Kawasaki, Ming Lei

For each request_queue, its rqos debugfs path is fixed, which can be
queried from its parent dentry directly, so it isn't necessary to cache
it in request_queue instance because it isn't used in fast path.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c | 12 ++++++++----
 block/blk-sysfs.c      |  1 -
 include/linux/blkdev.h |  1 -
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3d3346fc4c4e..9ccaf506514e 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -638,6 +638,8 @@ void blk_mq_debugfs_register(struct request_queue *q)
 			blk_mq_debugfs_register_sched_hctx(q, hctx);
 	}
 
+	debugfs_create_dir("rqos", q->debugfs_dir);
+
 	if (q->rq_qos) {
 		struct rq_qos *rqos = q->rq_qos;
 
@@ -798,17 +800,19 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
 	const char *dir_name = rq_qos_id_to_name(rqos->id);
+	struct dentry *rqos_top;
 
 	lockdep_assert_held(&q->debugfs_mutex);
 
 	if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
 		return;
 
-	if (!q->rqos_debugfs_dir)
-		q->rqos_debugfs_dir = debugfs_create_dir("rqos",
-							 q->debugfs_dir);
+	rqos_top = debugfs_lookup("rqos", q->debugfs_dir);
+	if (IS_ERR_OR_NULL(rqos_top))
+		return;
 
-	rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir);
+	rqos->debugfs_dir = debugfs_create_dir(dir_name, rqos_top);
+	dput(rqos_top);
 	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
 }
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 35d92ed10fdb..0679116bb195 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -748,7 +748,6 @@ static void blk_debugfs_remove(struct gendisk *disk)
 	blk_trace_shutdown(q);
 	debugfs_remove_recursive(q->debugfs_dir);
 	q->debugfs_dir = NULL;
-	q->rqos_debugfs_dir = NULL;
 	mutex_unlock(&q->debugfs_mutex);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5efe6f00d86f..7663f0c482de 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -598,7 +598,6 @@ struct request_queue {
 	struct list_head	tag_set_list;
 
 	struct dentry		*debugfs_dir;
-	struct dentry		*rqos_debugfs_dir;
 	/*
 	 * Serializes all debugfs metadata operations using the above dentries.
 	 */
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 5/7] block: remove rqos->debugfs_dir
  2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
                   ` (3 preceding siblings ...)
  2025-02-09 12:20 ` [PATCH 4/7] block: remove q->rqos_debugfs_dir Ming Lei
@ 2025-02-09 12:20 ` Ming Lei
  2025-02-09 12:20 ` [PATCH 6/7] block: remove q->debugfs_dir Ming Lei
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 21+ messages in thread
From: Ming Lei @ 2025-02-09 12:20 UTC (permalink / raw)
  To: Jens Axboe, linux-block
  Cc: Christoph Hellwig, Nilay Shroff, Shinichiro Kawasaki, Ming Lei

For each rqos instance, its debugfs path is fixed, which can be queried
from its parent dentry & rqos name directly, so it isn't necessary to
cache it in rqos instance because it isn't used in fast path.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c | 30 ++++++++++++++++++++++--------
 block/blk-rq-qos.h     |  3 ---
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 9ccaf506514e..40eb104fc1d5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -786,14 +786,27 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
 	return "unknown";
 }
 
+static __must_check struct dentry *blk_mq_debugfs_get_rqos_top(
+		struct request_queue *q)
+{
+	return debugfs_lookup("rqos", q->debugfs_dir);
+}
+
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 {
-	lockdep_assert_held(&rqos->disk->queue->debugfs_mutex);
+	struct request_queue *q = rqos->disk->queue;
+	struct dentry *rqos_top;
+
+	lockdep_assert_held(&q->debugfs_mutex);
+
+	if (!q->debugfs_dir)
+		return;
 
-	if (!rqos->disk->queue->debugfs_dir)
+	rqos_top = blk_mq_debugfs_get_rqos_top(q);
+	if (IS_ERR_OR_NULL(rqos_top))
 		return;
-	debugfs_remove_recursive(rqos->debugfs_dir);
-	rqos->debugfs_dir = NULL;
+	debugfs_lookup_and_remove(rq_qos_id_to_name(rqos->id), rqos_top);
+	dput(rqos_top);
 }
 
 void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
@@ -801,19 +814,20 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 	struct request_queue *q = rqos->disk->queue;
 	const char *dir_name = rq_qos_id_to_name(rqos->id);
 	struct dentry *rqos_top;
+	struct dentry *rqos_dir;
 
 	lockdep_assert_held(&q->debugfs_mutex);
 
-	if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
+	if (!rqos->ops->debugfs_attrs)
 		return;
 
-	rqos_top = debugfs_lookup("rqos", q->debugfs_dir);
+	rqos_top = blk_mq_debugfs_get_rqos_top(q);
 	if (IS_ERR_OR_NULL(rqos_top))
 		return;
 
-	rqos->debugfs_dir = debugfs_create_dir(dir_name, rqos_top);
+	rqos_dir = debugfs_create_dir(dir_name, rqos_top);
 	dput(rqos_top);
-	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
+	debugfs_create_files(rqos_dir, rqos, rqos->ops->debugfs_attrs);
 }
 
 void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..49c31f1e5578 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -29,9 +29,6 @@ struct rq_qos {
 	struct gendisk *disk;
 	enum rq_qos_id id;
 	struct rq_qos *next;
-#ifdef CONFIG_BLK_DEBUG_FS
-	struct dentry *debugfs_dir;
-#endif
 };
 
 struct rq_qos_ops {
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 6/7] block: remove q->debugfs_dir
  2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
                   ` (4 preceding siblings ...)
  2025-02-09 12:20 ` [PATCH 5/7] block: remove rqos->debugfs_dir Ming Lei
@ 2025-02-09 12:20 ` Ming Lei
  2025-02-09 12:20 ` [PATCH 7/7] block: don't grab q->debugfs_mutex Ming Lei
  2025-02-17  9:41 ` [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Christoph Hellwig
  7 siblings, 0 replies; 21+ messages in thread
From: Ming Lei @ 2025-02-09 12:20 UTC (permalink / raw)
  To: Jens Axboe, linux-block
  Cc: Christoph Hellwig, Nilay Shroff, Shinichiro Kawasaki, Ming Lei

For each request_queue, its debugfs path is fixed, which can be queried
from the block debugfs root dentry & disk name directly, so it isn't
necessary to cache it in the request_queue instance because it isn't used
in the fast path.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c  | 73 ++++++++++++++++++++++++++++++-----------
 block/blk-sysfs.c       |  5 ++-
 include/linux/blkdev.h  |  1 -
 kernel/trace/blktrace.c | 25 +++++++++++---
 4 files changed, 77 insertions(+), 27 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 40eb104fc1d5..6d98c2a6e7c6 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -616,15 +616,25 @@ static void debugfs_create_files(struct dentry *parent, void *data,
 				    (void *)attr, &blk_mq_debugfs_fops);
 }
 
+static __must_check struct dentry *blk_mq_get_queue_entry(
+		struct request_queue *q)
+{
+	return debugfs_lookup(q->disk->disk_name, blk_debugfs_root);
+}
+
 void blk_mq_debugfs_register(struct request_queue *q)
 {
+	struct dentry *queue_dir = blk_mq_get_queue_entry(q);
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
+	if (IS_ERR_OR_NULL(queue_dir))
+		return;
+
+	debugfs_create_files(queue_dir, q, blk_mq_debugfs_queue_attrs);
 
 	/*
-	 * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
+	 * blk_mq_init_sched() attempted to do this already, but queue debugfs_dir
 	 * didn't exist yet (because we don't know what to name the directory
 	 * until the queue is registered to a gendisk).
 	 */
@@ -638,7 +648,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 			blk_mq_debugfs_register_sched_hctx(q, hctx);
 	}
 
-	debugfs_create_dir("rqos", q->debugfs_dir);
+	debugfs_create_dir("rqos", queue_dir);
 
 	if (q->rq_qos) {
 		struct rq_qos *rqos = q->rq_qos;
@@ -648,15 +658,25 @@ void blk_mq_debugfs_register(struct request_queue *q)
 			rqos = rqos->next;
 		}
 	}
+
+	dput(queue_dir);
 }
 
 static __must_check struct dentry *blk_mq_get_hctx_entry(
 		struct blk_mq_hw_ctx *hctx)
 {
+	struct dentry *queue_dir = blk_mq_get_queue_entry(hctx->queue);
+	struct dentry *dir;
 	char name[20];
 
+	if (IS_ERR_OR_NULL(queue_dir))
+		return NULL;
+
 	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
-	return debugfs_lookup(name, hctx->queue->debugfs_dir);
+	dir = debugfs_lookup(name, queue_dir);
+	dput(queue_dir);
+
+	return dir;
 }
 
 static __must_check struct dentry *blk_mq_get_hctx_sched_entry(
@@ -692,32 +712,32 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
 void blk_mq_debugfs_register_hctx(struct request_queue *q,
 				  struct blk_mq_hw_ctx *hctx)
 {
+	struct dentry *queue_dir = blk_mq_get_queue_entry(q);
 	struct dentry *hctx_dir;
 	struct blk_mq_ctx *ctx;
 	char name[20];
 	int i;
 
-	if (!q->debugfs_dir)
+	if (IS_ERR_OR_NULL(queue_dir))
 		return;
 
 	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
-	hctx_dir = debugfs_create_dir(name, q->debugfs_dir);
+	hctx_dir = debugfs_create_dir(name, queue_dir);
 	if (IS_ERR_OR_NULL(hctx_dir))
-		return;
+		goto exit;
 
 	debugfs_create_files(hctx_dir, hctx, blk_mq_debugfs_hctx_attrs);
 
 	hctx_for_each_ctx(hctx, ctx, i)
 		blk_mq_debugfs_register_ctx(hctx, ctx);
+exit:
+	dput(queue_dir);
 }
 
 void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct dentry *hctx_dir;
 
-	if (!hctx->queue->debugfs_dir)
-		return;
-
 	hctx_dir = blk_mq_get_hctx_entry(hctx);
 	if (IS_ERR_OR_NULL(hctx_dir))
 		return;
@@ -746,6 +766,7 @@ void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
 
 void blk_mq_debugfs_register_sched(struct request_queue *q)
 {
+	struct dentry *queue_dir = blk_mq_get_queue_entry(q);
 	struct elevator_type *e = q->elevator->type;
 	struct dentry *sched_dir;
 
@@ -755,22 +776,29 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
 	 * If the parent directory has not been created yet, return, we will be
 	 * called again later on and the directory/files will be created then.
 	 */
-	if (!q->debugfs_dir)
+	if (IS_ERR_OR_NULL(queue_dir))
 		return;
 
 	if (!e->queue_debugfs_attrs)
-		return;
+		goto exit;
 
-	sched_dir = debugfs_create_dir("sched", q->debugfs_dir);
+	sched_dir = debugfs_create_dir("sched", queue_dir);
 
 	debugfs_create_files(sched_dir, q, e->queue_debugfs_attrs);
+exit:
+	dput(queue_dir);
 }
 
 void blk_mq_debugfs_unregister_sched(struct request_queue *q)
 {
+	struct dentry *queue_dir = blk_mq_get_queue_entry(q);
+
 	lockdep_assert_held(&q->debugfs_mutex);
 
-	debugfs_lookup_and_remove("sched", q->debugfs_dir);
+	if (IS_ERR_OR_NULL(queue_dir))
+		return;
+	debugfs_lookup_and_remove("sched", queue_dir);
+	dput(queue_dir);
 }
 
 static const char *rq_qos_id_to_name(enum rq_qos_id id)
@@ -789,24 +817,33 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
 static __must_check struct dentry *blk_mq_debugfs_get_rqos_top(
 		struct request_queue *q)
 {
-	return debugfs_lookup("rqos", q->debugfs_dir);
+	struct dentry *queue_dir = blk_mq_get_queue_entry(q);
+	struct dentry *dir = NULL;
+
+	if (queue_dir)
+		dir = debugfs_lookup("rqos", queue_dir);
+	dput(queue_dir);
+	return dir;
 }
 
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
+	struct dentry *queue_dir = blk_mq_get_queue_entry(q);
 	struct dentry *rqos_top;
 
 	lockdep_assert_held(&q->debugfs_mutex);
 
-	if (!q->debugfs_dir)
+	if (IS_ERR_OR_NULL(queue_dir))
 		return;
 
 	rqos_top = blk_mq_debugfs_get_rqos_top(q);
 	if (IS_ERR_OR_NULL(rqos_top))
-		return;
+		goto exit;
 	debugfs_lookup_and_remove(rq_qos_id_to_name(rqos->id), rqos_top);
 	dput(rqos_top);
+exit:
+	dput(queue_dir);
 }
 
 void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
@@ -862,8 +899,6 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 
 	lockdep_assert_held(&hctx->queue->debugfs_mutex);
 
-	if (!hctx->queue->debugfs_dir)
-		return;
 	sched_dir = blk_mq_get_hctx_sched_entry(hctx);
 	if (sched_dir) {
 		debugfs_remove_recursive(sched_dir);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0679116bb195..68bd84e06aac 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -746,8 +746,7 @@ static void blk_debugfs_remove(struct gendisk *disk)
 
 	mutex_lock(&q->debugfs_mutex);
 	blk_trace_shutdown(q);
-	debugfs_remove_recursive(q->debugfs_dir);
-	q->debugfs_dir = NULL;
+	debugfs_lookup_and_remove(disk->disk_name, blk_debugfs_root);
 	mutex_unlock(&q->debugfs_mutex);
 }
 
@@ -773,7 +772,7 @@ int blk_register_queue(struct gendisk *disk)
 	mutex_lock(&q->sysfs_lock);
 
 	mutex_lock(&q->debugfs_mutex);
-	q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
+	debugfs_create_dir(disk->disk_name, blk_debugfs_root);
 	if (queue_is_mq(q))
 		blk_mq_debugfs_register(q);
 	mutex_unlock(&q->debugfs_mutex);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7663f0c482de..adde68134ce4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -597,7 +597,6 @@ struct request_queue {
 	struct blk_mq_tag_set	*tag_set;
 	struct list_head	tag_set_list;
 
-	struct dentry		*debugfs_dir;
 	/*
 	 * Serializes all debugfs metadata operations using the above dentries.
 	 */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3679a6d18934..32cda8f5d008 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -311,17 +311,31 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	local_irq_restore(flags);
 }
 
+static struct dentry *blk_get_queue_debugfs_dir(struct request_queue *q)
+{
+	struct dentry *dir = NULL;;
+
+	if (q->disk)
+		dir = debugfs_lookup(q->disk->disk_name, blk_debugfs_root);
+	return dir;
+}
+
 static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
 {
 	relay_close(bt->rchan);
 
 	/*
 	 * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
-	 * under 'q->debugfs_dir', thus lookup and remove them.
+	 * under block queue debugfs dir, thus lookup and remove them.
 	 */
 	if (!bt->dir) {
-		debugfs_lookup_and_remove("dropped", q->debugfs_dir);
-		debugfs_lookup_and_remove("msg", q->debugfs_dir);
+		struct dentry *dir = blk_get_queue_debugfs_dir(q);
+
+		if (!IS_ERR_OR_NULL(dir)) {
+			debugfs_lookup_and_remove("dropped", dir);
+			debugfs_lookup_and_remove("msg", dir);
+			dput(dir);
+		}
 	} else {
 		debugfs_remove(bt->dir);
 	}
@@ -517,6 +531,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 {
 	struct blk_trace *bt = NULL;
 	struct dentry *dir = NULL;
+	struct dentry *dir_to_drop = NULL;
 	int ret;
 
 	lockdep_assert_held(&q->debugfs_mutex);
@@ -563,7 +578,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	 * directory that will be removed once the trace ends.
 	 */
 	if (bdev && !bdev_is_partition(bdev))
-		dir = q->debugfs_dir;
+		dir_to_drop = dir = blk_get_queue_debugfs_dir(q);
 	else
 		bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
 
@@ -614,6 +629,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 err:
 	if (ret)
 		blk_trace_free(q, bt);
+	if (!IS_ERR_OR_NULL(dir_to_drop))
+		dput(dir_to_drop);
 	return ret;
 }
 
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
                   ` (5 preceding siblings ...)
  2025-02-09 12:20 ` [PATCH 6/7] block: remove q->debugfs_dir Ming Lei
@ 2025-02-09 12:20 ` Ming Lei
  2025-02-09 16:54   ` kernel test robot
  2025-02-10  0:52   ` Shinichiro Kawasaki
  2025-02-17  9:41 ` [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Christoph Hellwig
  7 siblings, 2 replies; 21+ messages in thread
From: Ming Lei @ 2025-02-09 12:20 UTC (permalink / raw)
  To: Jens Axboe, linux-block
  Cc: Christoph Hellwig, Nilay Shroff, Shinichiro Kawasaki, Ming Lei

All block internal state for dealing with adding/removing debugfs entries
has been removed, and debugfs can sync everything for us at the fs level,
so don't grab q->debugfs_mutex for adding/removing block internal debugfs
entries.

Now q->debugfs_mutex is only used for blktrace; meanwhile, move the code
that creates the queue debugfs dir out of q->sysfs_lock. Both locks are
connected with the queue freeze IO lock, so the chain between the queue
freeze IO lock and the debugfs lock is cut.

The following lockdep report can be fixed:

https://lore.kernel.org/linux-block/ougniadskhks7uyxguxihgeuh2pv4yaqv4q3emo4gwuolgzdt6@brotly74p6bs/

Follows contexts which adds/removes debugfs entries:

- update nr_hw_queues

- add/remove disks

- elevator switch

- blktrace

blktrace only adds entries under the disk top directory, so we can ignore it,
because it can only work iff the disk is added. Also nothing overlaps with
the other two contexts, so the blktrace context is fine.

Elevator switch is only allowed after the disk is added, so there isn't a
race with add/remove disk. blk_mq_update_nr_hw_queues() always restores
the previous elevator, so there is no race between these two. The elevator
switch context is fine.

So far blk_mq_update_nr_hw_queues() doesn't hold the debugfs lock for
adding/removing hctx entries, so there might be a race with add/remove
disk, which is just fine in reality:

- blk_mq_update_nr_hw_queues() is usually for error recovery, and disk
won't be added/removed at the same time

- even though there is race between the two contexts, it is just fine,
since hctx won't be freed until queue is dead

- we never see reports in this area without holding debugfs in
blk_mq_update_nr_hw_queues()

Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c  | 12 ------------
 block/blk-mq-sched.c    |  8 --------
 block/blk-rq-qos.c      |  7 +------
 block/blk-sysfs.c       |  6 +-----
 kernel/trace/blktrace.c |  2 ++
 5 files changed, 4 insertions(+), 31 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 6d98c2a6e7c6..9601823730e2 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -770,8 +770,6 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
 	struct elevator_type *e = q->elevator->type;
 	struct dentry *sched_dir;
 
-	lockdep_assert_held(&q->debugfs_mutex);
-
 	/*
 	 * If the parent directory has not been created yet, return, we will be
 	 * called again later on and the directory/files will be created then.
@@ -793,8 +791,6 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q)
 {
 	struct dentry *queue_dir = blk_mq_get_queue_entry(q);
 
-	lockdep_assert_held(&q->debugfs_mutex);
-
 	if (IS_ERR_OR_NULL(queue_dir))
 		return;
 	debugfs_lookup_and_remove("sched", queue_dir);
@@ -832,8 +828,6 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 	struct dentry *queue_dir = blk_mq_get_queue_entry(q);
 	struct dentry *rqos_top;
 
-	lockdep_assert_held(&q->debugfs_mutex);
-
 	if (IS_ERR_OR_NULL(queue_dir))
 		return;
 
@@ -853,8 +847,6 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 	struct dentry *rqos_top;
 	struct dentry *rqos_dir;
 
-	lockdep_assert_held(&q->debugfs_mutex);
-
 	if (!rqos->ops->debugfs_attrs)
 		return;
 
@@ -874,8 +866,6 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 	struct elevator_type *e = q->elevator->type;
 	struct dentry *sched_dir;
 
-	lockdep_assert_held(&q->debugfs_mutex);
-
 	/*
 	 * If the parent debugfs directory has not been created yet, return;
 	 * We will be called again later on with appropriate parent debugfs
@@ -897,8 +887,6 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct dentry *sched_dir;
 
-	lockdep_assert_held(&hctx->queue->debugfs_mutex);
-
 	sched_dir = blk_mq_get_hctx_sched_entry(hctx);
 	if (sched_dir) {
 		debugfs_remove_recursive(sched_dir);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 7442ca27c2bf..1ca127297b2c 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -469,9 +469,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	if (ret)
 		goto err_free_map_and_rqs;
 
-	mutex_lock(&q->debugfs_mutex);
 	blk_mq_debugfs_register_sched(q);
-	mutex_unlock(&q->debugfs_mutex);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (e->ops.init_hctx) {
@@ -484,9 +482,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 				return ret;
 			}
 		}
-		mutex_lock(&q->debugfs_mutex);
 		blk_mq_debugfs_register_sched_hctx(q, hctx);
-		mutex_unlock(&q->debugfs_mutex);
 	}
 
 	return 0;
@@ -527,9 +523,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 	unsigned int flags = 0;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		mutex_lock(&q->debugfs_mutex);
 		blk_mq_debugfs_unregister_sched_hctx(hctx);
-		mutex_unlock(&q->debugfs_mutex);
 
 		if (e->type->ops.exit_hctx && hctx->sched_data) {
 			e->type->ops.exit_hctx(hctx, i);
@@ -538,9 +532,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 		flags = hctx->flags;
 	}
 
-	mutex_lock(&q->debugfs_mutex);
 	blk_mq_debugfs_unregister_sched(q);
-	mutex_unlock(&q->debugfs_mutex);
 
 	if (e->type->ops.exit_sched)
 		e->type->ops.exit_sched(e);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d4d4f4dc0e23..529640ed2ff5 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -320,11 +320,8 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 
 	blk_mq_unfreeze_queue(q, memflags);
 
-	if (rqos->ops->debugfs_attrs) {
-		mutex_lock(&q->debugfs_mutex);
+	if (rqos->ops->debugfs_attrs)
 		blk_mq_debugfs_register_rqos(rqos);
-		mutex_unlock(&q->debugfs_mutex);
-	}
 
 	return 0;
 ebusy:
@@ -349,7 +346,5 @@ void rq_qos_del(struct rq_qos *rqos)
 	}
 	blk_mq_unfreeze_queue(q, memflags);
 
-	mutex_lock(&q->debugfs_mutex);
 	blk_mq_debugfs_unregister_rqos(rqos);
-	mutex_unlock(&q->debugfs_mutex);
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 68bd84e06aac..b0bfb4c82e0e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -744,10 +744,8 @@ static void blk_debugfs_remove(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 
-	mutex_lock(&q->debugfs_mutex);
 	blk_trace_shutdown(q);
 	debugfs_lookup_and_remove(disk->disk_name, blk_debugfs_root);
-	mutex_unlock(&q->debugfs_mutex);
 }
 
 /**
@@ -769,14 +767,12 @@ int blk_register_queue(struct gendisk *disk)
 		if (ret)
 			goto out_put_queue_kobj;
 	}
-	mutex_lock(&q->sysfs_lock);
 
-	mutex_lock(&q->debugfs_mutex);
 	debugfs_create_dir(disk->disk_name, blk_debugfs_root);
 	if (queue_is_mq(q))
 		blk_mq_debugfs_register(q);
-	mutex_unlock(&q->debugfs_mutex);
 
+	mutex_lock(&q->sysfs_lock);
 	ret = disk_register_independent_access_ranges(disk);
 	if (ret)
 		goto out_debugfs_remove;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 32cda8f5d008..13efd48adcc3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -775,9 +775,11 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(struct request_queue *q)
 {
+	mutex_lock(&q->debugfs_mutex);
 	if (rcu_dereference_protected(q->blk_trace,
 				      lockdep_is_held(&q->debugfs_mutex)))
 		__blk_trace_remove(q);
+	mutex_unlock(&q->debugfs_mutex);
 }
 
 #ifdef CONFIG_BLK_CGROUP
-- 
2.47.0


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-09 12:20 ` [PATCH 7/7] block: don't grab q->debugfs_mutex Ming Lei
@ 2025-02-09 16:54   ` kernel test robot
  2025-02-10  0:52   ` Shinichiro Kawasaki
  1 sibling, 0 replies; 21+ messages in thread
From: kernel test robot @ 2025-02-09 16:54 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe, linux-block
  Cc: oe-kbuild-all, Christoph Hellwig, Nilay Shroff,
	Shinichiro Kawasaki, Ming Lei

Hi Ming,

kernel test robot noticed the following build warnings:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.14-rc1 next-20250207]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Ming-Lei/block-remove-hctx-debugfs_dir/20250209-202320
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20250209122035.1327325-8-ming.lei%40redhat.com
patch subject: [PATCH 7/7] block: don't grab q->debugfs_mutex
config: arc-randconfig-001-20250209 (https://download.01.org/0day-ci/archive/20250210/202502100010.Kok1H3KF-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250210/202502100010.Kok1H3KF-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202502100010.Kok1H3KF-lkp@intel.com/

All warnings (new ones prefixed by >>):

   block/blk-sysfs.c: In function 'blk_debugfs_remove':
>> block/blk-sysfs.c:745:31: warning: unused variable 'q' [-Wunused-variable]
     745 |         struct request_queue *q = disk->queue;
         |                               ^


vim +/q +745 block/blk-sysfs.c

8324aa91d1e11a Jens Axboe        2008-01-29  742  
6fc75f309d291d Christoph Hellwig 2022-11-14  743  static void blk_debugfs_remove(struct gendisk *disk)
6fc75f309d291d Christoph Hellwig 2022-11-14  744  {
6fc75f309d291d Christoph Hellwig 2022-11-14 @745  	struct request_queue *q = disk->queue;
6fc75f309d291d Christoph Hellwig 2022-11-14  746  
6fc75f309d291d Christoph Hellwig 2022-11-14  747  	blk_trace_shutdown(q);
abe3d073fa86b7 Ming Lei          2025-02-09  748  	debugfs_lookup_and_remove(disk->disk_name, blk_debugfs_root);
6fc75f309d291d Christoph Hellwig 2022-11-14  749  }
6fc75f309d291d Christoph Hellwig 2022-11-14  750  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-09 12:20 ` [PATCH 7/7] block: don't grab q->debugfs_mutex Ming Lei
  2025-02-09 16:54   ` kernel test robot
@ 2025-02-10  0:52   ` Shinichiro Kawasaki
  2025-02-10  1:42     ` Ming Lei
  2025-02-10 13:36     ` Nilay Shroff
  1 sibling, 2 replies; 21+ messages in thread
From: Shinichiro Kawasaki @ 2025-02-10  0:52 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block@vger.kernel.org, hch, Nilay Shroff

On Feb 09, 2025 / 20:20, Ming Lei wrote:
> All block internal state for dealing adding/removing debugfs entries
> have been removed, and debugfs can sync everything for us in fs level,
> so don't grab q->debugfs_mutex for adding/removing block internal debugfs
> entries.
> 
> Now q->debugfs_mutex is only used for blktrace, meantime move creating
> queue debugfs dir code out of q->sysfs_lock. Both the two locks are
> connected with queue freeze IO lock.  Then queue freeze IO lock chain
> with debugfs lock is cut.
> 
> The following lockdep report can be fixed:
> 
> https://lore.kernel.org/linux-block/ougniadskhks7uyxguxihgeuh2pv4yaqv4q3emo4gwuolgzdt6@brotly74p6bs/
> 
> Follows contexts which adds/removes debugfs entries:
> 
> - update nr_hw_queues
> 
> - add/remove disks
> 
> - elevator switch
> 
> - blktrace
> 
> blktrace only adds entries under disk top directory, so we can ignore it,
> because it can only work iff disk is added. Also nothing overlapped with
> the other two contex, blktrace context is fine.
> 
> Elevator switch is only allowed after disk is added, so there isn't race
> with add/remove disk. blk_mq_update_nr_hw_queues() always restores to
> previous elevator, so no race between these two. Elevator switch context
> is fine.
> 
> So far blk_mq_update_nr_hw_queues() doesn't hold debugfs lock for
> adding/removing hctx entries, there might be race with add/remove disk,
> which is just fine in reality:
> 
> - blk_mq_update_nr_hw_queues() is usually for error recovery, and disk
> won't be added/removed at the same time
> 
> - even though there is race between the two contexts, it is just fine,
> since hctx won't be freed until queue is dead
> 
> - we never see reports in this area without holding debugfs in
> blk_mq_update_nr_hw_queues()
> 
> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>

Ming, thank you for this quick action. I applied this series on top of
v6.14-rc1 kernel and ran the block/002 test case. Unfortunately, it still fails
occasionally with the lockdep "WARNING: possible circular locking dependency
detected" below. Now debugfs_mutex is not reported as one of the dependent
locks, then I think this fix is working as expected. Instead, eq->sysfs_lock
creates similar dependency. My mere guess is that this patch avoids one
dependency, but still another dependency is left.


[  115.085704] [   T1023] run blktests block/002 at 2025-02-10 09:22:22
[  115.383653] [   T1054] sd 9:0:0:0: [sdd] Synchronizing SCSI cache
[  115.641933] [   T1055] scsi_debug:sdebug_driver_probe: scsi_debug: trim poll_queues to 0. poll_q/nr_hw = (0/1)
[  115.642961] [   T1055] scsi host9: scsi_debug: version 0191 [20210520]
                            dev_size_mb=8, opts=0x0, submit_queues=1, statistics=0
[  115.646207] [   T1055] scsi 9:0:0:0: Direct-Access     Linux    scsi_debug       0191 PQ: 0 ANSI: 7
[  115.648225] [      C0] scsi 9:0:0:0: Power-on or device reset occurred
[  115.654243] [   T1055] sd 9:0:0:0: Attached scsi generic sg3 type 0
[  115.656248] [    T100] sd 9:0:0:0: [sdd] 16384 512-byte logical blocks: (8.39 MB/8.00 MiB)
[  115.658403] [    T100] sd 9:0:0:0: [sdd] Write Protect is off
[  115.659125] [    T100] sd 9:0:0:0: [sdd] Mode Sense: 73 00 10 08
[  115.661621] [    T100] sd 9:0:0:0: [sdd] Write cache: enabled, read cache: enabled, supports DPO and FUA
[  115.669276] [    T100] sd 9:0:0:0: [sdd] permanent stream count = 5
[  115.673375] [    T100] sd 9:0:0:0: [sdd] Preferred minimum I/O size 512 bytes
[  115.673974] [    T100] sd 9:0:0:0: [sdd] Optimal transfer size 524288 bytes
[  115.710112] [    T100] sd 9:0:0:0: [sdd] Attached SCSI disk

[  116.464802] [   T1079] ======================================================
[  116.465540] [   T1079] WARNING: possible circular locking dependency detected
[  116.466107] [   T1079] 6.14.0-rc1+ #253 Not tainted
[  116.466581] [   T1079] ------------------------------------------------------
[  116.467141] [   T1079] blktrace/1079 is trying to acquire lock:
[  116.467708] [   T1079] ffff88810539d1e0 (&mm->mmap_lock){++++}-{4:4}, at: __might_fault+0x99/0x120
[  116.468439] [   T1079] 
                          but task is already holding lock:
[  116.469052] [   T1079] ffff88810a5fd758 (&sb->s_type->i_mutex_key#3){++++}-{4:4}, at: relay_file_read+0xa3/0x8a0
[  116.469901] [   T1079] 
                          which lock already depends on the new lock.

[  116.470762] [   T1079] 
                          the existing dependency chain (in reverse order) is:
[  116.473187] [   T1079] 
                          -> #5 (&sb->s_type->i_mutex_key#3){++++}-{4:4}:
[  116.475670] [   T1079]        down_read+0x9b/0x470
[  116.477001] [   T1079]        lookup_one_unlocked+0xe9/0x120
[  116.478333] [   T1079]        lookup_positive_unlocked+0x1d/0x90
[  116.479648] [   T1079]        debugfs_lookup+0x47/0xa0
[  116.480833] [   T1079]        blk_mq_debugfs_unregister_sched_hctx+0x23/0x50
[  116.482215] [   T1079]        blk_mq_exit_sched+0xb6/0x2b0
[  116.483466] [   T1079]        elevator_switch+0x12a/0x4b0
[  116.484676] [   T1079]        elv_iosched_store+0x29f/0x380
[  116.485841] [   T1079]        queue_attr_store+0x313/0x480
[  116.487078] [   T1079]        kernfs_fop_write_iter+0x39e/0x5a0
[  116.488358] [   T1079]        vfs_write+0x5f9/0xe90
[  116.489460] [   T1079]        ksys_write+0xf6/0x1c0
[  116.490582] [   T1079]        do_syscall_64+0x93/0x180
[  116.491694] [   T1079]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  116.492996] [   T1079] 
                          -> #4 (&eq->sysfs_lock){+.+.}-{4:4}:
[  116.495135] [   T1079]        __mutex_lock+0x1aa/0x1360
[  116.496363] [   T1079]        elevator_switch+0x11f/0x4b0
[  116.497499] [   T1079]        elv_iosched_store+0x29f/0x380
[  116.498660] [   T1079]        queue_attr_store+0x313/0x480
[  116.499752] [   T1079]        kernfs_fop_write_iter+0x39e/0x5a0
[  116.500884] [   T1079]        vfs_write+0x5f9/0xe90
[  116.501964] [   T1079]        ksys_write+0xf6/0x1c0
[  116.503056] [   T1079]        do_syscall_64+0x93/0x180
[  116.504194] [   T1079]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  116.505356] [   T1079] 
                          -> #3 (&q->q_usage_counter(io)#2){++++}-{0:0}:
[  116.507272] [   T1079]        blk_mq_submit_bio+0x19b8/0x2070
[  116.508341] [   T1079]        __submit_bio+0x36b/0x6d0
[  116.509351] [   T1079]        submit_bio_noacct_nocheck+0x542/0xca0
[  116.510447] [   T1079]        iomap_readahead+0x4c4/0x7e0
[  116.511485] [   T1079]        read_pages+0x198/0xaf0
[  116.512468] [   T1079]        page_cache_ra_order+0x4d3/0xb50
[  116.513497] [   T1079]        filemap_get_pages+0x2c7/0x1850
[  116.514511] [   T1079]        filemap_read+0x31d/0xc30
[  116.515492] [   T1079]        xfs_file_buffered_read+0x1e9/0x2a0 [xfs]
[  116.517374] [   T1079]        xfs_file_read_iter+0x25f/0x4a0 [xfs]
[  116.519428] [   T1079]        vfs_read+0x6bc/0xa20
[  116.520904] [   T1079]        ksys_read+0xf6/0x1c0
[  116.522378] [   T1079]        do_syscall_64+0x93/0x180
[  116.523840] [   T1079]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  116.525467] [   T1079] 
                          -> #2 (mapping.invalidate_lock#2){++++}-{4:4}:
[  116.528191] [   T1079]        down_read+0x9b/0x470
[  116.529609] [   T1079]        filemap_fault+0x231/0x2ac0
[  116.531033] [   T1079]        __do_fault+0xf4/0x5d0
[  116.532461] [   T1079]        do_fault+0x965/0x11d0
[  116.533859] [   T1079]        __handle_mm_fault+0x1060/0x1f40
[  116.535343] [   T1079]        handle_mm_fault+0x21a/0x6b0
[  116.536915] [   T1079]        do_user_addr_fault+0x322/0xaa0
[  116.538409] [   T1079]        exc_page_fault+0x7a/0x110
[  116.539811] [   T1079]        asm_exc_page_fault+0x22/0x30
[  116.541247] [   T1079] 
                          -> #1 (&vma->vm_lock->lock){++++}-{4:4}:
[  116.543820] [   T1079]        down_write+0x8d/0x200
[  116.545202] [   T1079]        vma_link+0x1ff/0x590
[  116.546588] [   T1079]        insert_vm_struct+0x15a/0x340
[  116.548000] [   T1079]        alloc_bprm+0x626/0xbf0
[  116.549396] [   T1079]        kernel_execve+0x85/0x2f0
[  116.550784] [   T1079]        call_usermodehelper_exec_async+0x21b/0x430
[  116.552352] [   T1079]        ret_from_fork+0x30/0x70
[  116.553751] [   T1079]        ret_from_fork_asm+0x1a/0x30
[  116.555164] [   T1079] 
                          -> #0 (&mm->mmap_lock){++++}-{4:4}:
[  116.557848] [   T1079]        __lock_acquire+0x2f52/0x5ea0
[  116.559293] [   T1079]        lock_acquire+0x1b1/0x540
[  116.560678] [   T1079]        __might_fault+0xb9/0x120
[  116.562051] [   T1079]        _copy_to_user+0x1e/0x80
[  116.563446] [   T1079]        relay_file_read+0x149/0x8a0
[  116.564851] [   T1079]        full_proxy_read+0x110/0x1d0
[  116.566262] [   T1079]        vfs_read+0x1bb/0xa20
[  116.567591] [   T1079]        ksys_read+0xf6/0x1c0
[  116.568909] [   T1079]        do_syscall_64+0x93/0x180
[  116.570283] [   T1079]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  116.571781] [   T1079] 
                          other info that might help us debug this:

[  116.575369] [   T1079] Chain exists of:
                            &mm->mmap_lock --> &eq->sysfs_lock --> &sb->s_type->i_mutex_key#3

[  116.579323] [   T1079]  Possible unsafe locking scenario:

[  116.580961] [   T1079]        CPU0                    CPU1
[  116.581866] [   T1079]        ----                    ----
[  116.582737] [   T1079]   lock(&sb->s_type->i_mutex_key#3);
[  116.583611] [   T1079]                                lock(&eq->sysfs_lock);
[  116.584596] [   T1079]                                lock(&sb->s_type->i_mutex_key#3);
[  116.585646] [   T1079]   rlock(&mm->mmap_lock);
[  116.586453] [   T1079] 
                           *** DEADLOCK ***

[  116.588484] [   T1079] 2 locks held by blktrace/1079:
[  116.589318] [   T1079]  #0: ffff888101390af8 (&f->f_pos_lock){+.+.}-{4:4}, at: fdget_pos+0x233/0x2f0
[  116.590461] [   T1079]  #1: ffff88810a5fd758 (&sb->s_type->i_mutex_key#3){++++}-{4:4}, at: relay_file_read+0xa3/0x8a0
[  116.591720] [   T1079] 
                          stack backtrace:
[  116.593166] [   T1079] CPU: 0 UID: 0 PID: 1079 Comm: blktrace Not tainted 6.14.0-rc1+ #253
[  116.593169] [   T1079] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014
[  116.593172] [   T1079] Call Trace:
[  116.593176] [   T1079]  <TASK>
[  116.593178] [   T1079]  dump_stack_lvl+0x6a/0x90
[  116.593186] [   T1079]  print_circular_bug.cold+0x1e0/0x274
[  116.593191] [   T1079]  check_noncircular+0x306/0x3f0
[  116.593195] [   T1079]  ? __pfx_check_noncircular+0x10/0x10
[  116.593200] [   T1079]  ? lockdep_lock+0xca/0x1c0
[  116.593202] [   T1079]  ? __pfx_lockdep_lock+0x10/0x10
[  116.593206] [   T1079]  __lock_acquire+0x2f52/0x5ea0
[  116.593211] [   T1079]  ? lockdep_unlock+0xf1/0x250
[  116.593213] [   T1079]  ? __pfx___lock_acquire+0x10/0x10
[  116.593216] [   T1079]  ? lock_acquire+0x1b1/0x540
[  116.593220] [   T1079]  lock_acquire+0x1b1/0x540
[  116.593222] [   T1079]  ? __might_fault+0x99/0x120
[  116.593226] [   T1079]  ? __pfx_lock_acquire+0x10/0x10
[  116.593228] [   T1079]  ? lock_is_held_type+0xd5/0x130
[  116.593234] [   T1079]  ? __pfx___might_resched+0x10/0x10
[  116.593239] [   T1079]  ? _raw_spin_unlock+0x29/0x50
[  116.593243] [   T1079]  ? __might_fault+0x99/0x120
[  116.593245] [   T1079]  __might_fault+0xb9/0x120
[  116.593247] [   T1079]  ? __might_fault+0x99/0x120
[  116.593249] [   T1079]  ? __check_object_size+0x3f3/0x540
[  116.593253] [   T1079]  _copy_to_user+0x1e/0x80
[  116.593256] [   T1079]  relay_file_read+0x149/0x8a0
[  116.593261] [   T1079]  ? selinux_file_permission+0x36d/0x420
[  116.593270] [   T1079]  full_proxy_read+0x110/0x1d0
[  116.593272] [   T1079]  ? rw_verify_area+0x2f7/0x520
[  116.593275] [   T1079]  vfs_read+0x1bb/0xa20
[  116.593278] [   T1079]  ? __pfx___mutex_lock+0x10/0x10
[  116.593280] [   T1079]  ? __pfx_lock_release+0x10/0x10
[  116.593283] [   T1079]  ? lock_release+0x45b/0x7a0
[  116.593285] [   T1079]  ? __pfx_vfs_read+0x10/0x10
[  116.593289] [   T1079]  ? __fget_files+0x1ae/0x2e0
[  116.593294] [   T1079]  ksys_read+0xf6/0x1c0
[  116.593296] [   T1079]  ? __pfx_ksys_read+0x10/0x10
[  116.593300] [   T1079]  do_syscall_64+0x93/0x180
[  116.593303] [   T1079]  ? lockdep_hardirqs_on_prepare+0x16d/0x400
[  116.593306] [   T1079]  ? do_syscall_64+0x9f/0x180
[  116.593308] [   T1079]  ? lockdep_hardirqs_on+0x78/0x100
[  116.593310] [   T1079]  ? do_syscall_64+0x9f/0x180
[  116.593313] [   T1079]  ? __pfx_vm_mmap_pgoff+0x10/0x10
[  116.593317] [   T1079]  ? __fget_files+0x1ae/0x2e0
[  116.593321] [   T1079]  ? lockdep_hardirqs_on_prepare+0x16d/0x400
[  116.593324] [   T1079]  ? do_syscall_64+0x9f/0x180
[  116.593326] [   T1079]  ? lockdep_hardirqs_on+0x78/0x100
[  116.593328] [   T1079]  ? do_syscall_64+0x9f/0x180
[  116.593331] [   T1079]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  116.593334] [   T1079] RIP: 0033:0x7f9586b50e4a
[  116.593338] [   T1079] Code: 55 48 89 e5 48 83 ec 20 48 89 55 e8 48 89 75 f0 89 7d f8 e8 28 58 f8 ff 48 8b 55 e8 48 8b 75 f0 41 89 c0 8b 7d f8 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 2e 44 89 c7 48 89 45 f8 e8 82 58 f8 ff 48 8b
[  116.593341] [   T1079] RSP: 002b:00007f9586a3ed80 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  116.593348] [   T1079] RAX: ffffffffffffffda RBX: 00007f9580000bb0 RCX: 00007f9586b50e4a
[  116.593349] [   T1079] RDX: 0000000000080000 RSI: 00007f9584800000 RDI: 0000000000000004
[  116.593351] [   T1079] RBP: 00007f9586a3eda0 R08: 0000000000000000 R09: 0000000000000000
[  116.593352] [   T1079] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000000
[  116.593353] [   T1079] R13: 000056218e3f97e0 R14: 0000000000000000 R15: 00007f9580002c90
[  116.593359] [   T1079]  </TASK>
[  116.687238] [   T1023] sd 9:0:0:0: [sdd] Synchronizing SCSI cache

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-10  0:52   ` Shinichiro Kawasaki
@ 2025-02-10  1:42     ` Ming Lei
  2025-02-10  7:01       ` Nilay Shroff
  2025-02-10 13:36     ` Nilay Shroff
  1 sibling, 1 reply; 21+ messages in thread
From: Ming Lei @ 2025-02-10  1:42 UTC (permalink / raw)
  To: Shinichiro Kawasaki
  Cc: Jens Axboe, linux-block@vger.kernel.org, hch, Nilay Shroff,
	Ming Lei

On Mon, Feb 10, 2025 at 8:52 AM Shinichiro Kawasaki
<shinichiro.kawasaki@wdc.com> wrote:
>
> On Feb 09, 2025 / 20:20, Ming Lei wrote:
> > All block internal state for dealing adding/removing debugfs entries
> > have been removed, and debugfs can sync everything for us in fs level,
> > so don't grab q->debugfs_mutex for adding/removing block internal debugfs
> > entries.
> >
> > Now q->debugfs_mutex is only used for blktrace, meantime move creating
> > queue debugfs dir code out of q->sysfs_lock. Both the two locks are
> > connected with queue freeze IO lock.  Then queue freeze IO lock chain
> > with debugfs lock is cut.
> >
> > The following lockdep report can be fixed:
> >
> > https://lore.kernel.org/linux-block/ougniadskhks7uyxguxihgeuh2pv4yaqv4q3emo4gwuolgzdt6@brotly74p6bs/
> >
> > Follows contexts which adds/removes debugfs entries:
> >
> > - update nr_hw_queues
> >
> > - add/remove disks
> >
> > - elevator switch
> >
> > - blktrace
> >
> > blktrace only adds entries under disk top directory, so we can ignore it,
> > because it can only work iff disk is added. Also nothing overlapped with
> > the other two contex, blktrace context is fine.
> >
> > Elevator switch is only allowed after disk is added, so there isn't race
> > with add/remove disk. blk_mq_update_nr_hw_queues() always restores to
> > previous elevator, so no race between these two. Elevator switch context
> > is fine.
> >
> > So far blk_mq_update_nr_hw_queues() doesn't hold debugfs lock for
> > adding/removing hctx entries, there might be race with add/remove disk,
> > which is just fine in reality:
> >
> > - blk_mq_update_nr_hw_queues() is usually for error recovery, and disk
> > won't be added/removed at the same time
> >
> > - even though there is race between the two contexts, it is just fine,
> > since hctx won't be freed until queue is dead
> >
> > - we never see reports in this area without holding debugfs in
> > blk_mq_update_nr_hw_queues()
> >
> > Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
>
> Ming, thank you for this quick action. I applied this series on top of
> v6.14-rc1 kernel and ran the block/002 test case. Unfortunately, still if fails
> occasionally with the lockdep "WARNING: possible circular locking dependency
> detected" below. Now debugfs_mutex is not reported as one of the dependent
> locks, then I think this fix is working as expected. Instead, eq->sysfs_lock
> creates similar dependency. My mere guess is that this patch avoids one
> dependency, but still another dependency is left.

Indeed, this patch cuts dependency on both q->sysfs_lock and q->debugfs_lock,
but elevator ->sysfs_lock isn't covered, :-(

Thanks,
Ming


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-10  1:42     ` Ming Lei
@ 2025-02-10  7:01       ` Nilay Shroff
  2025-02-10  8:15         ` Ming Lei
  2025-02-10  8:25         ` Shinichiro Kawasaki
  0 siblings, 2 replies; 21+ messages in thread
From: Nilay Shroff @ 2025-02-10  7:01 UTC (permalink / raw)
  To: Ming Lei, Shinichiro Kawasaki
  Cc: Jens Axboe, linux-block@vger.kernel.org, hch, Ming Lei



On 2/10/25 7:12 AM, Ming Lei wrote:
> On Mon, Feb 10, 2025 at 8:52 AM Shinichiro Kawasaki
> <shinichiro.kawasaki@wdc.com> wrote:
>>
>> On Feb 09, 2025 / 20:20, Ming Lei wrote:
>>> All block internal state for dealing adding/removing debugfs entries
>>> have been removed, and debugfs can sync everything for us in fs level,
>>> so don't grab q->debugfs_mutex for adding/removing block internal debugfs
>>> entries.
>>>
>>> Now q->debugfs_mutex is only used for blktrace, meantime move creating
>>> queue debugfs dir code out of q->sysfs_lock. Both the two locks are
>>> connected with queue freeze IO lock.  Then queue freeze IO lock chain
>>> with debugfs lock is cut.
>>>
>>> The following lockdep report can be fixed:
>>>
>>> https://lore.kernel.org/linux-block/ougniadskhks7uyxguxihgeuh2pv4yaqv4q3emo4gwuolgzdt6@brotly74p6bs/
>>>
>>> Follows contexts which adds/removes debugfs entries:
>>>
>>> - update nr_hw_queues
>>>
>>> - add/remove disks
>>>
>>> - elevator switch
>>>
>>> - blktrace
>>>
>>> blktrace only adds entries under disk top directory, so we can ignore it,
>>> because it can only work iff disk is added. Also nothing overlapped with
>>> the other two contex, blktrace context is fine.
>>>
>>> Elevator switch is only allowed after disk is added, so there isn't race
>>> with add/remove disk. blk_mq_update_nr_hw_queues() always restores to
>>> previous elevator, so no race between these two. Elevator switch context
>>> is fine.
>>>
>>> So far blk_mq_update_nr_hw_queues() doesn't hold debugfs lock for
>>> adding/removing hctx entries, there might be race with add/remove disk,
>>> which is just fine in reality:
>>>
>>> - blk_mq_update_nr_hw_queues() is usually for error recovery, and disk
>>> won't be added/removed at the same time
>>>
>>> - even though there is race between the two contexts, it is just fine,
>>> since hctx won't be freed until queue is dead
>>>
>>> - we never see reports in this area without holding debugfs in
>>> blk_mq_update_nr_hw_queues()
>>>
>>> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>
>> Ming, thank you for this quick action. I applied this series on top of
>> v6.14-rc1 kernel and ran the block/002 test case. Unfortunately, still if fails
>> occasionally with the lockdep "WARNING: possible circular locking dependency
>> detected" below. Now debugfs_mutex is not reported as one of the dependent
>> locks, then I think this fix is working as expected. Instead, eq->sysfs_lock
>> creates similar dependency. My mere guess is that this patch avoids one
>> dependency, but still another dependency is left.
> 
> Indeed, this patch cuts dependency on both q->sysfs_lock and q->debugfs_lock,
Glad to see that with this patch we're able to cut the dependency between
q->sysfs_lock and q->debugfs_lock.

> but elevator ->sysfs_lock isn't covered, :-(
> 
I believe that shall be fixed with the current effort undergoing here:
https://lore.kernel.org/all/20250205144506.663819-1-nilay@linux.ibm.com/

Thanks,
--Nilay

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-10  7:01       ` Nilay Shroff
@ 2025-02-10  8:15         ` Ming Lei
  2025-02-10  8:25         ` Shinichiro Kawasaki
  1 sibling, 0 replies; 21+ messages in thread
From: Ming Lei @ 2025-02-10  8:15 UTC (permalink / raw)
  To: Nilay Shroff
  Cc: Shinichiro Kawasaki, Jens Axboe, linux-block@vger.kernel.org, hch,
	Ming Lei

On Mon, Feb 10, 2025 at 12:31:19PM +0530, Nilay Shroff wrote:
> 
> 
> On 2/10/25 7:12 AM, Ming Lei wrote:
> > On Mon, Feb 10, 2025 at 8:52 AM Shinichiro Kawasaki
> > <shinichiro.kawasaki@wdc.com> wrote:
> >>
> >> On Feb 09, 2025 / 20:20, Ming Lei wrote:
> >>> All block internal state for dealing with adding/removing debugfs entries
> >>> has been removed, and debugfs can sync everything for us in fs level,
> >>> so don't grab q->debugfs_mutex for adding/removing block internal debugfs
> >>> entries.
> >>>
> >>> Now q->debugfs_mutex is only used for blktrace, meantime move creating
> >>> queue debugfs dir code out of q->sysfs_lock. Both the two locks are
> >>> connected with queue freeze IO lock.  Then queue freeze IO lock chain
> >>> with debugfs lock is cut.
> >>>
> >>> The following lockdep report can be fixed:
> >>>
> >>> https://lore.kernel.org/linux-block/ougniadskhks7uyxguxihgeuh2pv4yaqv4q3emo4gwuolgzdt6@brotly74p6bs/
> >>>
> >>> Follows contexts which adds/removes debugfs entries:
> >>>
> >>> - update nr_hw_queues
> >>>
> >>> - add/remove disks
> >>>
> >>> - elevator switch
> >>>
> >>> - blktrace
> >>>
> >>> blktrace only adds entries under disk top directory, so we can ignore it,
> >>> because it can only work iff disk is added. Also nothing overlapped with
> >>> the other two contexts, blktrace context is fine.
> >>>
> >>> Elevator switch is only allowed after disk is added, so there isn't race
> >>> with add/remove disk. blk_mq_update_nr_hw_queues() always restores to
> >>> previous elevator, so no race between these two. Elevator switch context
> >>> is fine.
> >>>
> >>> So far blk_mq_update_nr_hw_queues() doesn't hold debugfs lock for
> >>> adding/removing hctx entries, there might be race with add/remove disk,
> >>> which is just fine in reality:
> >>>
> >>> - blk_mq_update_nr_hw_queues() is usually for error recovery, and disk
> >>> won't be added/removed at the same time
> >>>
> >>> - even though there is race between the two contexts, it is just fine,
> >>> since hctx won't be freed until queue is dead
> >>>
> >>> - we never see reports in this area without holding debugfs in
> >>> blk_mq_update_nr_hw_queues()
> >>>
> >>> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> >>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >>
> >> Ming, thank you for this quick action. I applied this series on top of
> >> v6.14-rc1 kernel and ran the block/002 test case. Unfortunately, still it fails
> >> occasionally with the lockdep "WARNING: possible circular locking dependency
> >> detected" below. Now debugfs_mutex is not reported as one of the dependent
> >> locks, then I think this fix is working as expected. Instead, eq->sysfs_lock
> >> creates similar dependency. My mere guess is that this patch avoids one
> >> dependency, but still another dependency is left.
> > 
> > Indeed, this patch cuts dependency on both q->sysfs_lock and q->debugfs_lock,
> Glad to see that with this patch we're able to cut the dependency between
> q->sysfs_lock and q->debugfs_lock.
> 
> > but elevator ->sysfs_lock isn't covered, :-(
> > 
> I believe that shall be fixed with the current effort undergoing here:
> https://lore.kernel.org/all/20250205144506.663819-1-nilay@linux.ibm.com/

I guess it isn't, your patches don't cover elevator_queue->sysfs_lock...


Thanks,
Ming


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-10  7:01       ` Nilay Shroff
  2025-02-10  8:15         ` Ming Lei
@ 2025-02-10  8:25         ` Shinichiro Kawasaki
  2025-02-10  8:39           ` Nilay Shroff
  1 sibling, 1 reply; 21+ messages in thread
From: Shinichiro Kawasaki @ 2025-02-10  8:25 UTC (permalink / raw)
  To: Nilay Shroff
  Cc: Ming Lei, Jens Axboe, linux-block@vger.kernel.org, hch, Ming Lei

On Feb 10, 2025 / 12:31, Nilay Shroff wrote:
> 
> 
> On 2/10/25 7:12 AM, Ming Lei wrote:
> > On Mon, Feb 10, 2025 at 8:52 AM Shinichiro Kawasaki
> > <shinichiro.kawasaki@wdc.com> wrote:
> >>
> >> On Feb 09, 2025 / 20:20, Ming Lei wrote:
> >>> All block internal state for dealing with adding/removing debugfs entries
> >>> have been removed, and debugfs can sync everything for us in fs level,
> >>> so don't grab q->debugfs_mutex for adding/removing block internal debugfs
> >>> entries.
> >>>
> >>> Now q->debugfs_mutex is only used for blktrace, meantime move creating
> >>> queue debugfs dir code out of q->sysfs_lock. Both the two locks are
> >>> connected with queue freeze IO lock.  Then queue freeze IO lock chain
> >>> with debugfs lock is cut.
> >>>
> >>> The following lockdep report can be fixed:
> >>>
> >>> https://lore.kernel.org/linux-block/ougniadskhks7uyxguxihgeuh2pv4yaqv4q3emo4gwuolgzdt6@brotly74p6bs/
> >>>
> >>> Follows contexts which adds/removes debugfs entries:
> >>>
> >>> - update nr_hw_queues
> >>>
> >>> - add/remove disks
> >>>
> >>> - elevator switch
> >>>
> >>> - blktrace
> >>>
> >>> blktrace only adds entries under disk top directory, so we can ignore it,
> >>> because it can only work iff disk is added. Also nothing overlapped with
> >>> the other two contexts, blktrace context is fine.
> >>>
> >>> Elevator switch is only allowed after disk is added, so there isn't race
> >>> with add/remove disk. blk_mq_update_nr_hw_queues() always restores to
> >>> previous elevator, so no race between these two. Elevator switch context
> >>> is fine.
> >>>
> >>> So far blk_mq_update_nr_hw_queues() doesn't hold debugfs lock for
> >>> adding/removing hctx entries, there might be race with add/remove disk,
> >>> which is just fine in reality:
> >>>
> >>> - blk_mq_update_nr_hw_queues() is usually for error recovery, and disk
> >>> won't be added/removed at the same time
> >>>
> >>> - even though there is race between the two contexts, it is just fine,
> >>> since hctx won't be freed until queue is dead
> >>>
> >>> - we never see reports in this area without holding debugfs in
> >>> blk_mq_update_nr_hw_queues()
> >>>
> >>> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> >>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >>
> >> Ming, thank you for this quick action. I applied this series on top of
> >> v6.14-rc1 kernel and ran the block/002 test case. Unfortunately, still it fails
> >> occasionally with the lockdep "WARNING: possible circular locking dependency
> >> detected" below. Now debugfs_mutex is not reported as one of the dependent
> >> locks, then I think this fix is working as expected. Instead, eq->sysfs_lock
> >> creates similar dependency. My mere guess is that this patch avoids one
> >> dependency, but still another dependency is left.
> > 
> > Indeed, this patch cuts dependency on both q->sysfs_lock and q->debugfs_lock,
> Glad to see that with this patch we're able to cut the dependency between
> q->sysfs_lock and q->debugfs_lock.
> 
> > but elevator ->sysfs_lock isn't covered, :-(
> > 
> I believe that shall be fixed with the current effort undergoing here:
> https://lore.kernel.org/all/20250205144506.663819-1-nilay@linux.ibm.com/

Thanks Nilay for the information. I applied your patch series on top of
v6.14-rc1 and Ming's. Then I ran block/002. Alas, still the test case fails with
the "WARNING: possible circular locking dependency detected". The lockdep splat
contents has changed as below:


[  222.549099] [   T1041] run blktests block/002 at 2025-02-10 17:13:18
[  222.887999] [   T1072] sd 9:0:0:0: [sdd] Synchronizing SCSI cache
[  223.172864] [   T1073] scsi_debug:sdebug_driver_probe: scsi_debug: trim poll_queues to 0. poll_q/nr_hw = (0/1)
[  223.173872] [   T1073] scsi host9: scsi_debug: version 0191 [20210520]
                            dev_size_mb=8, opts=0x0, submit_queues=1, statistics=0
[  223.177251] [   T1073] scsi 9:0:0:0: Direct-Access     Linux    scsi_debug       0191 PQ: 0 ANSI: 7
[  223.179393] [      C0] scsi 9:0:0:0: Power-on or device reset occurred
[  223.185351] [   T1073] sd 9:0:0:0: Attached scsi generic sg3 type 0
[  223.187514] [    T101] sd 9:0:0:0: [sdd] 16384 512-byte logical blocks: (8.39 MB/8.00 MiB)
[  223.189675] [    T101] sd 9:0:0:0: [sdd] Write Protect is off
[  223.190146] [    T101] sd 9:0:0:0: [sdd] Mode Sense: 73 00 10 08
[  223.192369] [    T101] sd 9:0:0:0: [sdd] Write cache: enabled, read cache: enabled, supports DPO and FUA
[  223.199999] [    T101] sd 9:0:0:0: [sdd] permanent stream count = 5
[  223.203950] [    T101] sd 9:0:0:0: [sdd] Preferred minimum I/O size 512 bytes
[  223.204568] [    T101] sd 9:0:0:0: [sdd] Optimal transfer size 524288 bytes
[  223.245371] [    T101] sd 9:0:0:0: [sdd] Attached SCSI disk

[  223.999340] [   T1097] ======================================================
[  224.000044] [   T1097] WARNING: possible circular locking dependency detected
[  224.000651] [   T1097] 6.14.0-rc1+ #255 Not tainted
[  224.001109] [   T1097] ------------------------------------------------------
[  224.001791] [   T1097] blktrace/1097 is trying to acquire lock:
[  224.002220] [   T1097] ffff88814c53bde0 (&mm->mmap_lock){++++}-{4:4}, at: __might_fault+0x99/0x120
[  224.002887] [   T1097] 
                          but task is already holding lock:
[  224.003423] [   T1097] ffff88810079dc68 (&sb->s_type->i_mutex_key#3){++++}-{4:4}, at: relay_file_read+0xa3/0x8a0
[  224.004173] [   T1097] 
                          which lock already depends on the new lock.

[  224.004934] [   T1097] 
                          the existing dependency chain (in reverse order) is:
[  224.007356] [   T1097] 
                          -> #6 (&sb->s_type->i_mutex_key#3){++++}-{4:4}:
[  224.009859] [   T1097]        down_write+0x8d/0x200
[  224.011109] [   T1097]        start_creating.part.0+0x82/0x230
[  224.012465] [   T1097]        debugfs_create_dir+0x3a/0x4c0
[  224.013793] [   T1097]        blk_mq_debugfs_register_rqos+0x1a5/0x3a0
[  224.015101] [   T1097]        rq_qos_add+0x2f6/0x450
[  224.016312] [   T1097]        wbt_init+0x359/0x490
[  224.017507] [   T1097]        blk_register_queue+0x1c0/0x330
[  224.018800] [   T1097]        add_disk_fwnode+0x6b1/0x1010
[  224.020047] [   T1097]        sd_probe+0x94e/0xf30
[  224.021235] [   T1097]        really_probe+0x1e3/0x8a0
[  224.022402] [   T1097]        __driver_probe_device+0x18c/0x370
[  224.023652] [   T1097]        driver_probe_device+0x4a/0x120
[  224.024816] [   T1097]        __device_attach_driver+0x15e/0x270
[  224.026038] [   T1097]        bus_for_each_drv+0x114/0x1a0
[  224.027212] [   T1097]        __device_attach_async_helper+0x19c/0x240
[  224.028476] [   T1097]        async_run_entry_fn+0x96/0x4f0
[  224.029664] [   T1097]        process_one_work+0x85a/0x1460
[  224.030798] [   T1097]        worker_thread+0x5e2/0xfc0
[  224.032037] [   T1097]        kthread+0x39d/0x750
[  224.033136] [   T1097]        ret_from_fork+0x30/0x70
[  224.034232] [   T1097]        ret_from_fork_asm+0x1a/0x30
[  224.035360] [   T1097] 
                          -> #5 (&q->rq_qos_mutex){+.+.}-{4:4}:
[  224.037338] [   T1097]        __mutex_lock+0x1aa/0x1360
[  224.038397] [   T1097]        wbt_init+0x343/0x490
[  224.039497] [   T1097]        blk_register_queue+0x1c0/0x330
[  224.040612] [   T1097]        add_disk_fwnode+0x6b1/0x1010
[  224.041675] [   T1097]        sd_probe+0x94e/0xf30
[  224.042662] [   T1097]        really_probe+0x1e3/0x8a0
[  224.043672] [   T1097]        __driver_probe_device+0x18c/0x370
[  224.044728] [   T1097]        driver_probe_device+0x4a/0x120
[  224.045761] [   T1097]        __device_attach_driver+0x15e/0x270
[  224.046818] [   T1097]        bus_for_each_drv+0x114/0x1a0
[  224.047842] [   T1097]        __device_attach_async_helper+0x19c/0x240
[  224.048876] [   T1097]        async_run_entry_fn+0x96/0x4f0
[  224.049840] [   T1097]        process_one_work+0x85a/0x1460
[  224.050794] [   T1097]        worker_thread+0x5e2/0xfc0
[  224.051718] [   T1097]        kthread+0x39d/0x750
[  224.052653] [   T1097]        ret_from_fork+0x30/0x70
[  224.053548] [   T1097]        ret_from_fork_asm+0x1a/0x30
[  224.054492] [   T1097] 
                          -> #4 (&q->sysfs_lock){+.+.}-{4:4}:
[  224.056073] [   T1097]        __mutex_lock+0x1aa/0x1360
[  224.056934] [   T1097]        elv_iosched_store+0xf2/0x500
[  224.057809] [   T1097]        queue_attr_store+0x35a/0x480
[  224.058680] [   T1097]        kernfs_fop_write_iter+0x39e/0x5a0
[  224.059593] [   T1097]        vfs_write+0x5f9/0xe90
[  224.060449] [   T1097]        ksys_write+0xf6/0x1c0
[  224.061294] [   T1097]        do_syscall_64+0x93/0x180
[  224.062153] [   T1097]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  224.063087] [   T1097] 
                          -> #3 (&q->q_usage_counter(io)#2){++++}-{0:0}:
[  224.064675] [   T1097]        blk_mq_submit_bio+0x19b8/0x2070
[  224.065562] [   T1097]        __submit_bio+0x36b/0x6d0
[  224.066428] [   T1097]        submit_bio_noacct_nocheck+0x542/0xca0
[  224.067371] [   T1097]        iomap_readahead+0x4c4/0x7e0
[  224.068254] [   T1097]        read_pages+0x198/0xaf0
[  224.069085] [   T1097]        page_cache_ra_order+0x4d3/0xb50
[  224.069976] [   T1097]        filemap_get_pages+0x2c7/0x1850
[  224.070862] [   T1097]        filemap_read+0x31d/0xc30
[  224.071707] [   T1097]        xfs_file_buffered_read+0x1e9/0x2a0 [xfs]
[  224.072927] [   T1097]        xfs_file_read_iter+0x25f/0x4a0 [xfs]
[  224.074093] [   T1097]        vfs_read+0x6bc/0xa20
[  224.074918] [   T1097]        ksys_read+0xf6/0x1c0
[  224.075735] [   T1097]        do_syscall_64+0x93/0x180
[  224.076576] [   T1097]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  224.077534] [   T1097] 
                          -> #2 (mapping.invalidate_lock#2){++++}-{4:4}:
[  224.079118] [   T1097]        down_read+0x9b/0x470
[  224.079950] [   T1097]        filemap_fault+0x231/0x2ac0
[  224.080818] [   T1097]        __do_fault+0xf4/0x5d0
[  224.081658] [   T1097]        do_fault+0x965/0x11d0
[  224.082495] [   T1097]        __handle_mm_fault+0x1060/0x1f40
[  224.083408] [   T1097]        handle_mm_fault+0x21a/0x6b0
[  224.084297] [   T1097]        do_user_addr_fault+0x322/0xaa0
[  224.085205] [   T1097]        exc_page_fault+0x7a/0x110
[  224.086061] [   T1097]        asm_exc_page_fault+0x22/0x30
[  224.086941] [   T1097] 
                          -> #1 (&vma->vm_lock->lock){++++}-{4:4}:
[  224.088495] [   T1097]        down_write+0x8d/0x200
[  224.089353] [   T1097]        vma_link+0x1ff/0x590
[  224.090200] [   T1097]        insert_vm_struct+0x15a/0x340
[  224.091072] [   T1097]        alloc_bprm+0x626/0xbf0
[  224.091909] [   T1097]        kernel_execve+0x85/0x2f0
[  224.092754] [   T1097]        call_usermodehelper_exec_async+0x21b/0x430
[  224.093738] [   T1097]        ret_from_fork+0x30/0x70
[  224.094587] [   T1097]        ret_from_fork_asm+0x1a/0x30
[  224.095541] [   T1097] 
                          -> #0 (&mm->mmap_lock){++++}-{4:4}:
[  224.097077] [   T1097]        __lock_acquire+0x2f52/0x5ea0
[  224.097963] [   T1097]        lock_acquire+0x1b1/0x540
[  224.098818] [   T1097]        __might_fault+0xb9/0x120
[  224.099672] [   T1097]        _copy_to_user+0x1e/0x80
[  224.100519] [   T1097]        relay_file_read+0x149/0x8a0
[  224.101408] [   T1097]        full_proxy_read+0x110/0x1d0
[  224.102297] [   T1097]        vfs_read+0x1bb/0xa20
[  224.103115] [   T1097]        ksys_read+0xf6/0x1c0
[  224.103937] [   T1097]        do_syscall_64+0x93/0x180
[  224.104782] [   T1097]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  224.105726] [   T1097] 
                          other info that might help us debug this:

[  224.107884] [   T1097] Chain exists of:
                            &mm->mmap_lock --> &q->rq_qos_mutex --> &sb->s_type->i_mutex_key#3

[  224.110335] [   T1097]  Possible unsafe locking scenario:

[  224.111856] [   T1097]        CPU0                    CPU1
[  224.112723] [   T1097]        ----                    ----
[  224.113588] [   T1097]   lock(&sb->s_type->i_mutex_key#3);
[  224.114492] [   T1097]                                lock(&q->rq_qos_mutex);
[  224.115499] [   T1097]                                lock(&sb->s_type->i_mutex_key#3);
[  224.116609] [   T1097]   rlock(&mm->mmap_lock);
[  224.117440] [   T1097] 
                           *** DEADLOCK ***

[  224.119479] [   T1097] 2 locks held by blktrace/1097:
[  224.120325] [   T1097]  #0: ffff88812e5b80f8 (&f->f_pos_lock){+.+.}-{4:4}, at: fdget_pos+0x233/0x2f0
[  224.121474] [   T1097]  #1: ffff88810079dc68 (&sb->s_type->i_mutex_key#3){++++}-{4:4}, at: relay_file_read+0xa3/0x8a0
[  224.122741] [   T1097] 
                          stack backtrace:
[  224.124186] [   T1097] CPU: 0 UID: 0 PID: 1097 Comm: blktrace Not tainted 6.14.0-rc1+ #255
[  224.124189] [   T1097] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014
[  224.124193] [   T1097] Call Trace:
[  224.124195] [   T1097]  <TASK>
[  224.124197] [   T1097]  dump_stack_lvl+0x6a/0x90
[  224.124201] [   T1097]  print_circular_bug.cold+0x1e0/0x274
[  224.124205] [   T1097]  check_noncircular+0x306/0x3f0
[  224.124209] [   T1097]  ? __pfx_check_noncircular+0x10/0x10
[  224.124213] [   T1097]  ? lockdep_lock+0xca/0x1c0
[  224.124215] [   T1097]  ? __pfx_lockdep_lock+0x10/0x10
[  224.124219] [   T1097]  __lock_acquire+0x2f52/0x5ea0
[  224.124224] [   T1097]  ? find_held_lock+0x2d/0x110
[  224.124226] [   T1097]  ? __pfx___lock_acquire+0x10/0x10
[  224.124230] [   T1097]  ? lock_acquire+0x1b1/0x540
[  224.124233] [   T1097]  lock_acquire+0x1b1/0x540
[  224.124236] [   T1097]  ? __might_fault+0x99/0x120
[  224.124239] [   T1097]  ? __pfx_lock_acquire+0x10/0x10
[  224.124242] [   T1097]  ? lock_is_held_type+0xd5/0x130
[  224.124245] [   T1097]  ? __pfx___might_resched+0x10/0x10
[  224.124248] [   T1097]  ? _raw_spin_unlock+0x29/0x50
[  224.124250] [   T1097]  ? __might_fault+0x99/0x120
[  224.124252] [   T1097]  __might_fault+0xb9/0x120
[  224.124254] [   T1097]  ? __might_fault+0x99/0x120
[  224.124256] [   T1097]  ? __check_object_size+0x3f3/0x540
[  224.124258] [   T1097]  _copy_to_user+0x1e/0x80
[  224.124261] [   T1097]  relay_file_read+0x149/0x8a0
[  224.124266] [   T1097]  ? selinux_file_permission+0x36d/0x420
[  224.124270] [   T1097]  full_proxy_read+0x110/0x1d0
[  224.124272] [   T1097]  ? rw_verify_area+0x2f7/0x520
[  224.124274] [   T1097]  vfs_read+0x1bb/0xa20
[  224.124277] [   T1097]  ? __pfx___mutex_lock+0x10/0x10
[  224.124279] [   T1097]  ? __pfx_lock_release+0x10/0x10
[  224.124282] [   T1097]  ? __pfx_vfs_read+0x10/0x10
[  224.124286] [   T1097]  ? __fget_files+0x1ae/0x2e0
[  224.124290] [   T1097]  ksys_read+0xf6/0x1c0
[  224.124293] [   T1097]  ? __pfx_ksys_read+0x10/0x10
[  224.124297] [   T1097]  do_syscall_64+0x93/0x180
[  224.124299] [   T1097]  ? __pfx___rseq_handle_notify_resume+0x10/0x10
[  224.124301] [   T1097]  ? __pfx_blkcg_maybe_throttle_current+0x10/0x10
[  224.124305] [   T1097]  ? __pfx___rseq_handle_notify_resume+0x10/0x10
[  224.124306] [   T1097]  ? __pfx_blkcg_maybe_throttle_current+0x10/0x10
[  224.124311] [   T1097]  ? lockdep_hardirqs_on_prepare+0x16d/0x400
[  224.124313] [   T1097]  ? do_syscall_64+0x9f/0x180
[  224.124315] [   T1097]  ? lockdep_hardirqs_on+0x78/0x100
[  224.124318] [   T1097]  ? do_syscall_64+0x9f/0x180
[  224.124319] [   T1097]  ? lockdep_hardirqs_on_prepare+0x16d/0x400
[  224.124322] [   T1097]  ? do_syscall_64+0x9f/0x180
[  224.124324] [   T1097]  ? lockdep_hardirqs_on+0x78/0x100
[  224.124326] [   T1097]  ? do_syscall_64+0x9f/0x180
[  224.124328] [   T1097]  ? lockdep_hardirqs_on+0x78/0x100
[  224.124330] [   T1097]  ? do_syscall_64+0x9f/0x180
[  224.124333] [   T1097]  ? handle_mm_fault+0x39d/0x6b0
[  224.124337] [   T1097]  ? lockdep_hardirqs_on_prepare+0x16d/0x400
[  224.124341] [   T1097]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  224.124344] [   T1097] RIP: 0033:0x7fdbd297be4a
[  224.124348] [   T1097] Code: 55 48 89 e5 48 83 ec 20 48 89 55 e8 48 89 75 f0 89 7d f8 e8 28 58 f8 ff 48 8b 55 e8 48 8b 75 f0 41 89 c0 8b 7d f8 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 2e 44 89 c7 48 89 45 f8 e8 82 58 f8 ff 48 8b
[  224.124350] [   T1097] RSP: 002b:00007fdbd2869d80 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[  224.124354] [   T1097] RAX: ffffffffffffffda RBX: 00007fdbcc000bb0 RCX: 00007fdbd297be4a
[  224.124356] [   T1097] RDX: 0000000000080000 RSI: 00007fdbc3000000 RDI: 0000000000000004
[  224.124357] [   T1097] RBP: 00007fdbd2869da0 R08: 0000000000000000 R09: 0000000000000000
[  224.124358] [   T1097] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000000
[  224.124360] [   T1097] R13: 000055e34ca547e0 R14: 0000000000000000 R15: 00007fdbcc002c90
[  224.124364] [   T1097]  </TASK>
[  224.220675] [   T1041] sd 9:0:0:0: [sdd] Synchronizing SCSI cache

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-10  8:25         ` Shinichiro Kawasaki
@ 2025-02-10  8:39           ` Nilay Shroff
  0 siblings, 0 replies; 21+ messages in thread
From: Nilay Shroff @ 2025-02-10  8:39 UTC (permalink / raw)
  To: Shinichiro Kawasaki
  Cc: Ming Lei, Jens Axboe, linux-block@vger.kernel.org, hch, Ming Lei



On 2/10/25 1:55 PM, Shinichiro Kawasaki wrote:
> On Feb 10, 2025 / 12:31, Nilay Shroff wrote:
>>
>>
>> On 2/10/25 7:12 AM, Ming Lei wrote:
>>> On Mon, Feb 10, 2025 at 8:52 AM Shinichiro Kawasaki
>>> <shinichiro.kawasaki@wdc.com> wrote:
>>>>
>>>> On Feb 09, 2025 / 20:20, Ming Lei wrote:
>>>>> All block internal state for dealing with adding/removing debugfs entries
>>>>> has been removed, and debugfs can sync everything for us in fs level,
>>>>> so don't grab q->debugfs_mutex for adding/removing block internal debugfs
>>>>> entries.
>>>>>
>>>>> Now q->debugfs_mutex is only used for blktrace, meantime move creating
>>>>> queue debugfs dir code out of q->sysfs_lock. Both the two locks are
>>>>> connected with queue freeze IO lock.  Then queue freeze IO lock chain
>>>>> with debugfs lock is cut.
>>>>>
>>>>> The following lockdep report can be fixed:
>>>>>
>>>>> https://lore.kernel.org/linux-block/ougniadskhks7uyxguxihgeuh2pv4yaqv4q3emo4gwuolgzdt6@brotly74p6bs/
>>>>>
>>>>> Follows contexts which adds/removes debugfs entries:
>>>>>
>>>>> - update nr_hw_queues
>>>>>
>>>>> - add/remove disks
>>>>>
>>>>> - elevator switch
>>>>>
>>>>> - blktrace
>>>>>
>>>>> blktrace only adds entries under disk top directory, so we can ignore it,
>>>>> because it can only work iff disk is added. Also nothing overlapped with
>>>>> the other two contexts, blktrace context is fine.
>>>>>
>>>>> Elevator switch is only allowed after disk is added, so there isn't race
>>>>> with add/remove disk. blk_mq_update_nr_hw_queues() always restores to
>>>>> previous elevator, so no race between these two. Elevator switch context
>>>>> is fine.
>>>>>
>>>>> So far blk_mq_update_nr_hw_queues() doesn't hold debugfs lock for
>>>>> adding/removing hctx entries, there might be race with add/remove disk,
>>>>> which is just fine in reality:
>>>>>
>>>>> - blk_mq_update_nr_hw_queues() is usually for error recovery, and disk
>>>>> won't be added/removed at the same time
>>>>>
>>>>> - even though there is race between the two contexts, it is just fine,
>>>>> since hctx won't be freed until queue is dead
>>>>>
>>>>> - we never see reports in this area without holding debugfs in
>>>>> blk_mq_update_nr_hw_queues()
>>>>>
>>>>> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>>>
>>>> Ming, thank you for this quick action. I applied this series on top of
>>>> v6.14-rc1 kernel and ran the block/002 test case. Unfortunately, still it fails
>>>> occasionally with the lockdep "WARNING: possible circular locking dependency
>>>> detected" below. Now debugfs_mutex is not reported as one of the dependent
>>>> locks, then I think this fix is working as expected. Instead, eq->sysfs_lock
>>>> creates similar dependency. My mere guess is that this patch avoids one
>>>> dependency, but still another dependency is left.
>>>
>>> Indeed, this patch cuts dependency on both q->sysfs_lock and q->debugfs_lock,
>> Glad to see that with this patch we're able to cut the dependency between
>> q->sysfs_lock and q->debugfs_lock.
>>
>>> but elevator ->sysfs_lock isn't covered, :-(
>>>
>> I believe that shall be fixed with the current effort undergoing here:
>> https://lore.kernel.org/all/20250205144506.663819-1-nilay@linux.ibm.com/
> 
> Thanks Nilay for the information. I applied your patch series on top of
> v6.14-rc1 and Ming's. Then I ran block/002. Alas, still the test case fails with
> the "WARNING: possible circular locking dependency detected". The lockdep splat
> contents has changed as below:
> 
Thanks Shinichiro for trying out the patch!
However this patchset is not complete yet. I have to work on few suggestions
from Christoph and Ming and then I will re-post the patchset. We're planning 
to replace q->sysfs_lock with a new dedicated lock for elevator switch case 
and that should help cut the dependency between q->q_usage_counter(io) and 
q->sysfs_lock. So I think you need to wait for sometime before the patchest 
is ready.

Thanks,
--Nilay



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 7/7] block: don't grab q->debugfs_mutex
  2025-02-10  0:52   ` Shinichiro Kawasaki
  2025-02-10  1:42     ` Ming Lei
@ 2025-02-10 13:36     ` Nilay Shroff
  1 sibling, 0 replies; 21+ messages in thread
From: Nilay Shroff @ 2025-02-10 13:36 UTC (permalink / raw)
  To: Shinichiro Kawasaki, Ming Lei
  Cc: Jens Axboe, linux-block@vger.kernel.org, hch



On 2/10/25 6:22 AM, Shinichiro Kawasaki wrote:
> On Feb 09, 2025 / 20:20, Ming Lei wrote:
>> All block internal state for dealing with adding/removing debugfs entries
>> has been removed, and debugfs can sync everything for us in fs level,
>> so don't grab q->debugfs_mutex for adding/removing block internal debugfs
>> entries.
>>
>> Now q->debugfs_mutex is only used for blktrace, meantime move creating
>> queue debugfs dir code out of q->sysfs_lock. Both the two locks are
>> connected with queue freeze IO lock.  Then queue freeze IO lock chain
>> with debugfs lock is cut.
>>
>> The following lockdep report can be fixed:
>>
>> https://lore.kernel.org/linux-block/ougniadskhks7uyxguxihgeuh2pv4yaqv4q3emo4gwuolgzdt6@brotly74p6bs/
>>
>> Follows contexts which adds/removes debugfs entries:
>>
>> - update nr_hw_queues
>>
>> - add/remove disks
>>
>> - elevator switch
>>
>> - blktrace
>>
>> blktrace only adds entries under disk top directory, so we can ignore it,
>> because it can only work iff disk is added. Also nothing overlapped with
>> the other two contexts, blktrace context is fine.
>>
>> Elevator switch is only allowed after disk is added, so there isn't race
>> with add/remove disk. blk_mq_update_nr_hw_queues() always restores to
>> previous elevator, so no race between these two. Elevator switch context
>> is fine.
>>
>> So far blk_mq_update_nr_hw_queues() doesn't hold debugfs lock for
>> adding/removing hctx entries, there might be race with add/remove disk,
>> which is just fine in reality:
>>
>> - blk_mq_update_nr_hw_queues() is usually for error recovery, and disk
>> won't be added/removed at the same time
>>
>> - even though there is race between the two contexts, it is just fine,
>> since hctx won't be freed until queue is dead
>>
>> - we never see reports in this area without holding debugfs in
>> blk_mq_update_nr_hw_queues()
>>
>> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> 

[...]

> 
> [  115.085704] [   T1023] run blktests block/002 at 2025-02-10 09:22:22
> [  115.383653] [   T1054] sd 9:0:0:0: [sdd] Synchronizing SCSI cache
> [  115.641933] [   T1055] scsi_debug:sdebug_driver_probe: scsi_debug: trim poll_queues to 0. poll_q/nr_hw = (0/1)
> [  115.642961] [   T1055] scsi host9: scsi_debug: version 0191 [20210520]
>                             dev_size_mb=8, opts=0x0, submit_queues=1, statistics=0
> [  115.646207] [   T1055] scsi 9:0:0:0: Direct-Access     Linux    scsi_debug       0191 PQ: 0 ANSI: 7
> [  115.648225] [      C0] scsi 9:0:0:0: Power-on or device reset occurred
> [  115.654243] [   T1055] sd 9:0:0:0: Attached scsi generic sg3 type 0
> [  115.656248] [    T100] sd 9:0:0:0: [sdd] 16384 512-byte logical blocks: (8.39 MB/8.00 MiB)
> [  115.658403] [    T100] sd 9:0:0:0: [sdd] Write Protect is off
> [  115.659125] [    T100] sd 9:0:0:0: [sdd] Mode Sense: 73 00 10 08
> [  115.661621] [    T100] sd 9:0:0:0: [sdd] Write cache: enabled, read cache: enabled, supports DPO and FUA
> [  115.669276] [    T100] sd 9:0:0:0: [sdd] permanent stream count = 5
> [  115.673375] [    T100] sd 9:0:0:0: [sdd] Preferred minimum I/O size 512 bytes
> [  115.673974] [    T100] sd 9:0:0:0: [sdd] Optimal transfer size 524288 bytes
> [  115.710112] [    T100] sd 9:0:0:0: [sdd] Attached SCSI disk
> 
> [  116.464802] [   T1079] ======================================================
> [  116.465540] [   T1079] WARNING: possible circular locking dependency detected
> [  116.466107] [   T1079] 6.14.0-rc1+ #253 Not tainted
> [  116.466581] [   T1079] ------------------------------------------------------
> [  116.467141] [   T1079] blktrace/1079 is trying to acquire lock:
> [  116.467708] [   T1079] ffff88810539d1e0 (&mm->mmap_lock){++++}-{4:4}, at: __might_fault+0x99/0x120
> [  116.468439] [   T1079] 
>                           but task is already holding lock:
> [  116.469052] [   T1079] ffff88810a5fd758 (&sb->s_type->i_mutex_key#3){++++}-{4:4}, at: relay_file_read+0xa3/0x8a0
> [  116.469901] [   T1079] 
>                           which lock already depends on the new lock.
> 
> [  116.470762] [   T1079] 
>                           the existing dependency chain (in reverse order) is:
> [  116.473187] [   T1079] 
>                           -> #5 (&sb->s_type->i_mutex_key#3){++++}-{4:4}:
> [  116.475670] [   T1079]        down_read+0x9b/0x470
> [  116.477001] [   T1079]        lookup_one_unlocked+0xe9/0x120
> [  116.478333] [   T1079]        lookup_positive_unlocked+0x1d/0x90
> [  116.479648] [   T1079]        debugfs_lookup+0x47/0xa0
> [  116.480833] [   T1079]        blk_mq_debugfs_unregister_sched_hctx+0x23/0x50
> [  116.482215] [   T1079]        blk_mq_exit_sched+0xb6/0x2b0
> [  116.483466] [   T1079]        elevator_switch+0x12a/0x4b0
> [  116.484676] [   T1079]        elv_iosched_store+0x29f/0x380
> [  116.485841] [   T1079]        queue_attr_store+0x313/0x480
> [  116.487078] [   T1079]        kernfs_fop_write_iter+0x39e/0x5a0
> [  116.488358] [   T1079]        vfs_write+0x5f9/0xe90
> [  116.489460] [   T1079]        ksys_write+0xf6/0x1c0
> [  116.490582] [   T1079]        do_syscall_64+0x93/0x180
> [  116.491694] [   T1079]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [  116.492996] [   T1079] 
>                           -> #4 (&eq->sysfs_lock){+.+.}-{4:4}:
> [  116.495135] [   T1079]        __mutex_lock+0x1aa/0x1360
> [  116.496363] [   T1079]        elevator_switch+0x11f/0x4b0
> [  116.497499] [   T1079]        elv_iosched_store+0x29f/0x380
> [  116.498660] [   T1079]        queue_attr_store+0x313/0x480
> [  116.499752] [   T1079]        kernfs_fop_write_iter+0x39e/0x5a0
> [  116.500884] [   T1079]        vfs_write+0x5f9/0xe90
> [  116.501964] [   T1079]        ksys_write+0xf6/0x1c0
> [  116.503056] [   T1079]        do_syscall_64+0x93/0x180
> [  116.504194] [   T1079]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
> [  116.505356] [   T1079] 
In the above dependency chain, I see that thread #5 is waiting on &sb->s_type->i_mutex_key
after acquiring q->sysfs_lock and eq->sysfs_lock. And thread #4 would have acquired 
q->sysfs_lock and now pending on eq->sysfs_lock. But then how is it possible that two threads
are able to acquire q->sysfs_lock at the same time (assuming this is for the same request_queue). 
Is this a false-positive report from lockdep? Or am I missing something?

Thanks,
--Nilay



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/7] block: remove hctx->debugfs_dir
  2025-02-09 12:20 ` [PATCH 1/7] block: remove hctx->debugfs_dir Ming Lei
@ 2025-02-13  6:41   ` Christoph Hellwig
  2025-02-14  2:07     ` Ming Lei
  0 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2025-02-13  6:41 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Nilay Shroff,
	Shinichiro Kawasaki

On Sun, Feb 09, 2025 at 08:20:25PM +0800, Ming Lei wrote:
> For each hctx, its debugfs path is fixed, which can be queried from
> its parent dentry and hctx queue num, so it isn't necessary to cache
> it in hctx structure because it isn't used in fast path.

It's not needed, but it's also kinda convenient.  What is the argument
that speaks for recalculating the path?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/7] block: remove hctx->sched_debugfs_dir
  2025-02-09 12:20 ` [PATCH 2/7] block: remove hctx->sched_debugfs_dir Ming Lei
@ 2025-02-13  6:42   ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-02-13  6:42 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, Nilay Shroff, Shinichiro Kawasaki

On Sun, Feb 09, 2025 at 08:20:26PM +0800, Ming Lei wrote:
> For each hctx, its sched debugfs path is fixed, which can be queried from
> its parent dentry directly, so it isn't necessary to cache it in hctx
> instance because it isn't used in fast path.

Same comment as for the previous patch.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/7] block: remove hctx->debugfs_dir
  2025-02-13  6:41   ` Christoph Hellwig
@ 2025-02-14  2:07     ` Ming Lei
  2025-02-17  9:12       ` Christoph Hellwig
  0 siblings, 1 reply; 21+ messages in thread
From: Ming Lei @ 2025-02-14  2:07 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, linux-block, Nilay Shroff, Shinichiro Kawasaki

On Thu, Feb 13, 2025 at 07:41:46AM +0100, Christoph Hellwig wrote:
> On Sun, Feb 09, 2025 at 08:20:25PM +0800, Ming Lei wrote:
> > For each hctx, its debugfs path is fixed, which can be queried from
> > its parent dentry and hctx queue num, so it isn't necessary to cache
> > it in hctx structure because it isn't used in fast path.
> 
> It's not needed, but it's also kinda convenient.  What is the argument
> that speaks for recalculating the path?

q->debugfs_lock will not be necessary if these cached entries are removed;
please see the last patch.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 1/7] block: remove hctx->debugfs_dir
  2025-02-14  2:07     ` Ming Lei
@ 2025-02-17  9:12       ` Christoph Hellwig
  0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-02-17  9:12 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, linux-block, Nilay Shroff,
	Shinichiro Kawasaki

On Fri, Feb 14, 2025 at 10:07:23AM +0800, Ming Lei wrote:
> On Thu, Feb 13, 2025 at 07:41:46AM +0100, Christoph Hellwig wrote:
> > On Sun, Feb 09, 2025 at 08:20:25PM +0800, Ming Lei wrote:
> > > For each hctx, its debugfs path is fixed, which can be queried from
> > > its parent dentry and hctx queue num, so it isn't necessary to cache
> > > it in hctx structure because it isn't used in fast path.
> > 
> > It's not needed, but it's also kinda convenient.  What is the argument
> > that speaks for recalculating the path?
> 
> q->debugfs_lock will be not necessary if these cached entries are removed,
> please see the last patch.

Please document the reasoning (and tradeoffs, if there are any) in the
commit log.  No one will see your cover letter when bisecting a bug
down to this commit in the future.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex
  2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
                   ` (6 preceding siblings ...)
  2025-02-09 12:20 ` [PATCH 7/7] block: don't grab q->debugfs_mutex Ming Lei
@ 2025-02-17  9:41 ` Christoph Hellwig
  7 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-02-17  9:41 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Nilay Shroff,
	Shinichiro Kawasaki

On Sun, Feb 09, 2025 at 08:20:24PM +0800, Ming Lei wrote:
> Hi,
> 
> The 1st 6 patch removes all kinds of debugfs dir entries of block layer,
> instead always retrieves them debugfs, so avoid to maintain block
> internal debugfs state.

I'm still not sure this is a good idea.  Yes, unregistration is not
a fast path, but having to reconstruct path names to remove them
just creates annoying raciness.  Now we'd need to make sure no
one is creating the same names at the same time.  Which is probably
fine now, but something entirely non-obvious to keep in mind.  There's
also a reason why the debugfs API isn't built that way to start with.

> The 7 patch removes debugfs_mutex for adding/removing debugfs entry
> because we needn't the protection any more, then one lockdep warning
> can be fixed.

I don't see the lockdep dependency anywhere in this thread, but I
assume it's the mess with updating nr_hw_queues again?

In that case the fix is probably going to be the same
tag_set-wide lock Nilay is looking into for sysfs.

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2025-02-17  9:41 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-02-09 12:20 [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Ming Lei
2025-02-09 12:20 ` [PATCH 1/7] block: remove hctx->debugfs_dir Ming Lei
2025-02-13  6:41   ` Christoph Hellwig
2025-02-14  2:07     ` Ming Lei
2025-02-17  9:12       ` Christoph Hellwig
2025-02-09 12:20 ` [PATCH 2/7] block: remove hctx->sched_debugfs_dir Ming Lei
2025-02-13  6:42   ` Christoph Hellwig
2025-02-09 12:20 ` [PATCH 3/7] block: remove q->sched_debugfs_dir Ming Lei
2025-02-09 12:20 ` [PATCH 4/7] block: remove q->rqos_debugfs_dir Ming Lei
2025-02-09 12:20 ` [PATCH 5/7] block: remove rqos->debugfs_dir Ming Lei
2025-02-09 12:20 ` [PATCH 6/7] block: remove q->debugfs_dir Ming Lei
2025-02-09 12:20 ` [PATCH 7/7] block: don't grab q->debugfs_mutex Ming Lei
2025-02-09 16:54   ` kernel test robot
2025-02-10  0:52   ` Shinichiro Kawasaki
2025-02-10  1:42     ` Ming Lei
2025-02-10  7:01       ` Nilay Shroff
2025-02-10  8:15         ` Ming Lei
2025-02-10  8:25         ` Shinichiro Kawasaki
2025-02-10  8:39           ` Nilay Shroff
2025-02-10 13:36     ` Nilay Shroff
2025-02-17  9:41 ` [PATCH 0/7] block: remove all debugfs dir & don't grab debugfs_mutex Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).