[PATCH v6 00/13] blk-mq: fix possible deadlocks

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v6 00/13] blk-mq: fix possible deadlocks
@ 2025-12-25 10:32 Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
                   ` (12 more replies)
  0 siblings, 13 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

changes in v6:
 - rebase on the top of block-6.19;
 - add patch 6 to clean up blk_mq_debugfs_unregister_rqos();
 - change patch 8, from GFP_NOIO to blkg_conf_open_bdev_frozen();
changes in v5:
 - free rwb from wbt_init() caller in patch 2;
 - don't recheck rwb in patch 2 to make code cleaner, concurrent callers
   will fail from rq_qos_add();
 - add patch 7 to fix possible deadlock in blk-throttle;
changes in v4:
 - add patch 1,2 to fix a new deadlock;
changes in v3:
 - remove changes for blk-iolatency and blk-iocost in patch 2, since
   they don't have debugfs entries.
 - add patch 9 to fix lock order for blk-throttle.
changes in v2:
 - combine two sets into one;

Fix deadlocks:
 - patch 1-2, pcpu_alloc_mutex under q_usage_counter in blk-wbt;
 - patch 3-7, debugfs_mutex under q_usage_counter;
 - patch 8, fs_reclaim under rq_qos_mutex in blk-throttle;
 - patch 9-13, q_usage_counter under rq_qos_mutex;

Yu Kuai (13):
  blk-wbt: factor out a helper wbt_set_lat()
  blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under
    q_usage_counter
  blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
  blk-rq-qos: fix possible debugfs_mutex deadlock
  blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
  blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos()
  blk-mq-debugfs: warn about possible deadlock
  blk-throttle: fix possible deadlock for fs reclaim under rq_qos_mutex
  block/blk-rq-qos: add a new helper rq_qos_add_frozen()
  blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue
  blk-iocost: fix incorrect lock order for rq_qos_mutex and freeze queue
  blk-iolatency: fix incorrect lock order for rq_qos_mutex and freeze
    queue
  block/blk-rq-qos: cleanup rq_qos_add()

 block/blk-iocost.c     |  15 ++--
 block/blk-iolatency.c  |  11 +--
 block/blk-mq-debugfs.c |  66 +++++++++++-------
 block/blk-mq-debugfs.h |   8 +--
 block/blk-rq-qos.c     |  31 ++-------
 block/blk-sysfs.c      |  39 +----------
 block/blk-throttle.c   |  27 ++++----
 block/blk-wbt.c        | 153 +++++++++++++++++++++++++++++++----------
 block/blk-wbt.h        |   7 +-
 9 files changed, 194 insertions(+), 163 deletions(-)

-- 
2.51.0


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v6 01/13] blk-wbt: factor out a helper wbt_set_lat()
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Move the implementation details into blk-wbt.c, in preparation for fixing,
in the next patch, a possible deadlock caused by calling wbt_init() while
the queue is frozen.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-sysfs.c | 39 ++----------------------------------
 block/blk-wbt.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++---
 block/blk-wbt.h   |  7 ++-----
 3 files changed, 51 insertions(+), 45 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e0a70d26972b..a580688c3ad5 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -636,11 +636,8 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
 static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 				  size_t count)
 {
-	struct request_queue *q = disk->queue;
-	struct rq_qos *rqos;
 	ssize_t ret;
 	s64 val;
-	unsigned int memflags;
 
 	ret = queue_var_store64(&val, page);
 	if (ret < 0)
@@ -648,40 +645,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 	if (val < -1)
 		return -EINVAL;
 
-	/*
-	 * Ensure that the queue is idled, in case the latency update
-	 * ends up either enabling or disabling wbt completely. We can't
-	 * have IO inflight if that happens.
-	 */
-	memflags = blk_mq_freeze_queue(q);
-
-	rqos = wbt_rq_qos(q);
-	if (!rqos) {
-		ret = wbt_init(disk);
-		if (ret)
-			goto out;
-	}
-
-	ret = count;
-	if (val == -1)
-		val = wbt_default_latency_nsec(q);
-	else if (val >= 0)
-		val *= 1000ULL;
-
-	if (wbt_get_min_lat(q) == val)
-		goto out;
-
-	blk_mq_quiesce_queue(q);
-
-	mutex_lock(&disk->rqos_state_mutex);
-	wbt_set_min_lat(q, val);
-	mutex_unlock(&disk->rqos_state_mutex);
-
-	blk_mq_unquiesce_queue(q);
-out:
-	blk_mq_unfreeze_queue(q, memflags);
-
-	return ret;
+	ret = wbt_set_lat(disk, val);
+	return ret ? ret : count;
 }
 
 QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0974875f77bd..abc2190689bb 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -93,6 +93,8 @@ struct rq_wb {
 	struct rq_depth rq_depth;
 };
 
+static int wbt_init(struct gendisk *disk);
+
 static inline struct rq_wb *RQWB(struct rq_qos *rqos)
 {
 	return container_of(rqos, struct rq_wb, rqos);
@@ -506,7 +508,7 @@ u64 wbt_get_min_lat(struct request_queue *q)
 	return RQWB(rqos)->min_lat_nsec;
 }
 
-void wbt_set_min_lat(struct request_queue *q, u64 val)
+static void wbt_set_min_lat(struct request_queue *q, u64 val)
 {
 	struct rq_qos *rqos = wbt_rq_qos(q);
 	if (!rqos)
@@ -741,7 +743,7 @@ void wbt_init_enable_default(struct gendisk *disk)
 		WARN_ON_ONCE(wbt_init(disk));
 }
 
-u64 wbt_default_latency_nsec(struct request_queue *q)
+static u64 wbt_default_latency_nsec(struct request_queue *q)
 {
 	/*
 	 * We default to 2msec for non-rotational storage, and 75msec
@@ -902,7 +904,7 @@ static const struct rq_qos_ops wbt_rqos_ops = {
 #endif
 };
 
-int wbt_init(struct gendisk *disk)
+static int wbt_init(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct rq_wb *rwb;
@@ -949,3 +951,45 @@ int wbt_init(struct gendisk *disk)
 	return ret;
 
 }
+
+int wbt_set_lat(struct gendisk *disk, s64 val)
+{
+	struct request_queue *q = disk->queue;
+	unsigned int memflags;
+	struct rq_qos *rqos;
+	int ret = 0;
+
+	/*
+	 * Ensure that the queue is idled, in case the latency update
+	 * ends up either enabling or disabling wbt completely. We can't
+	 * have IO inflight if that happens.
+	 */
+	memflags = blk_mq_freeze_queue(q);
+
+	rqos = wbt_rq_qos(q);
+	if (!rqos) {
+		ret = wbt_init(disk);
+		if (ret)
+			goto out;
+	}
+
+	if (val == -1)
+		val = wbt_default_latency_nsec(q);
+	else if (val >= 0)
+		val *= 1000ULL;
+
+	if (wbt_get_min_lat(q) == val)
+		goto out;
+
+	blk_mq_quiesce_queue(q);
+
+	mutex_lock(&disk->rqos_state_mutex);
+	wbt_set_min_lat(q, val);
+	mutex_unlock(&disk->rqos_state_mutex);
+
+	blk_mq_unquiesce_queue(q);
+out:
+	blk_mq_unfreeze_queue(q, memflags);
+
+	return ret;
+}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 925f22475738..6e39da17218b 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -4,16 +4,13 @@
 
 #ifdef CONFIG_BLK_WBT
 
-int wbt_init(struct gendisk *disk);
 void wbt_init_enable_default(struct gendisk *disk);
 void wbt_disable_default(struct gendisk *disk);
 void wbt_enable_default(struct gendisk *disk);
 
 u64 wbt_get_min_lat(struct request_queue *q);
-void wbt_set_min_lat(struct request_queue *q, u64 val);
-bool wbt_disabled(struct request_queue *);
-
-u64 wbt_default_latency_nsec(struct request_queue *);
+bool wbt_disabled(struct request_queue *q);
+int wbt_set_lat(struct gendisk *disk, s64 val);
 
 #else
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v6 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-30  5:34   ` Nilay Shroff
  2025-12-25 10:32 ` [PATCH v6 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos Yu Kuai
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

If wbt is disabled by default and user configures wbt by sysfs, queue
will be frozen first and then pcpu_alloc_mutex will be held in
blk_stat_alloc_callback().

Fix this problem by allocating the memory before the queue is frozen.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-wbt.c | 108 ++++++++++++++++++++++++++++--------------------
 1 file changed, 63 insertions(+), 45 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index abc2190689bb..9bef71ec645d 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -93,7 +93,7 @@ struct rq_wb {
 	struct rq_depth rq_depth;
 };
 
-static int wbt_init(struct gendisk *disk);
+static int wbt_init(struct gendisk *disk, struct rq_wb *rwb);
 
 static inline struct rq_wb *RQWB(struct rq_qos *rqos)
 {
@@ -698,6 +698,41 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
 	}
 }
 
+static int wbt_data_dir(const struct request *rq)
+{
+	const enum req_op op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op_is_write(op))
+		return WRITE;
+
+	/* don't account */
+	return -1;
+}
+
+static struct rq_wb *wbt_alloc(void)
+{
+	struct rq_wb *rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+
+	if (!rwb)
+		return NULL;
+
+	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
+	if (!rwb->cb) {
+		kfree(rwb);
+		return NULL;
+	}
+
+	return rwb;
+}
+
+static void wbt_free(struct rq_wb *rwb)
+{
+	blk_stat_free_callback(rwb->cb);
+	kfree(rwb);
+}
+
 /*
  * Enable wbt if defaults are configured that way
  */
@@ -739,8 +774,17 @@ EXPORT_SYMBOL_GPL(wbt_enable_default);
 
 void wbt_init_enable_default(struct gendisk *disk)
 {
-	if (__wbt_enable_default(disk))
-		WARN_ON_ONCE(wbt_init(disk));
+	struct rq_wb *rwb;
+
+	if (!__wbt_enable_default(disk))
+		return;
+
+	rwb = wbt_alloc();
+	if (WARN_ON_ONCE(!rwb))
+		return;
+
+	if (WARN_ON_ONCE(wbt_init(disk, rwb)))
+		wbt_free(rwb);
 }
 
 static u64 wbt_default_latency_nsec(struct request_queue *q)
@@ -755,19 +799,6 @@ static u64 wbt_default_latency_nsec(struct request_queue *q)
 		return 75000000ULL;
 }
 
-static int wbt_data_dir(const struct request *rq)
-{
-	const enum req_op op = req_op(rq);
-
-	if (op == REQ_OP_READ)
-		return READ;
-	else if (op_is_write(op))
-		return WRITE;
-
-	/* don't account */
-	return -1;
-}
-
 static void wbt_queue_depth_changed(struct rq_qos *rqos)
 {
 	RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
@@ -779,8 +810,7 @@ static void wbt_exit(struct rq_qos *rqos)
 	struct rq_wb *rwb = RQWB(rqos);
 
 	blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
-	blk_stat_free_callback(rwb->cb);
-	kfree(rwb);
+	wbt_free(rwb);
 }
 
 /*
@@ -904,22 +934,11 @@ static const struct rq_qos_ops wbt_rqos_ops = {
 #endif
 };
 
-static int wbt_init(struct gendisk *disk)
+static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 {
 	struct request_queue *q = disk->queue;
-	struct rq_wb *rwb;
-	int i;
 	int ret;
-
-	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
-	if (!rwb)
-		return -ENOMEM;
-
-	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
-	if (!rwb->cb) {
-		kfree(rwb);
-		return -ENOMEM;
-	}
+	int i;
 
 	for (i = 0; i < WBT_NUM_RWQ; i++)
 		rq_wait_init(&rwb->rq_wait[i]);
@@ -939,38 +958,38 @@ static int wbt_init(struct gendisk *disk)
 	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
-		goto err_free;
+		return ret;
 
 	blk_stat_add_callback(q, rwb->cb);
-
 	return 0;
-
-err_free:
-	blk_stat_free_callback(rwb->cb);
-	kfree(rwb);
-	return ret;
-
 }
 
 int wbt_set_lat(struct gendisk *disk, s64 val)
 {
 	struct request_queue *q = disk->queue;
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	struct rq_wb *rwb = NULL;
 	unsigned int memflags;
-	struct rq_qos *rqos;
 	int ret = 0;
 
+	if (!rqos) {
+		rwb = wbt_alloc();
+		if (!rwb)
+			return -ENOMEM;
+	}
+
 	/*
 	 * Ensure that the queue is idled, in case the latency update
 	 * ends up either enabling or disabling wbt completely. We can't
 	 * have IO inflight if that happens.
 	 */
 	memflags = blk_mq_freeze_queue(q);
-
-	rqos = wbt_rq_qos(q);
 	if (!rqos) {
-		ret = wbt_init(disk);
-		if (ret)
+		ret = wbt_init(disk, rwb);
+		if (ret) {
+			wbt_free(rwb);
 			goto out;
+		}
 	}
 
 	if (val == -1)
@@ -990,6 +1009,5 @@ int wbt_set_lat(struct gendisk *disk, s64 val)
 	blk_mq_unquiesce_queue(q);
 out:
 	blk_mq_unfreeze_queue(q, memflags);
-
 	return ret;
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v6 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
  2025-12-25 10:32 ` [PATCH v6 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
@ 2025-12-30  5:34   ` Nilay Shroff
  0 siblings, 0 replies; 22+ messages in thread
From: Nilay Shroff @ 2025-12-30  5:34 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/25/25 4:02 PM, Yu Kuai wrote:
> If wbt is disabled by default and user configures wbt by sysfs, queue
> will be frozen first and then pcpu_alloc_mutex will be held in
> blk_stat_alloc_callback().
> 
> Fix this problem by allocating memory first before queue frozen.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v6 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

There is already a helper blk_mq_debugfs_register_rqos() to register
one rqos, however this helper is called synchronously when the rqos is
created with queue frozen.

Prepare to fix possible deadlock to create blk-mq debugfs entries while
queue is still frozen.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-mq-debugfs.c | 23 +++++++++++++++--------
 block/blk-mq-debugfs.h |  5 +++++
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4896525b1c05..4fe164b6d648 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -631,14 +631,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 			blk_mq_debugfs_register_hctx(q, hctx);
 	}
 
-	if (q->rq_qos) {
-		struct rq_qos *rqos = q->rq_qos;
-
-		while (rqos) {
-			blk_mq_debugfs_register_rqos(rqos);
-			rqos = rqos->next;
-		}
-	}
+	blk_mq_debugfs_register_rq_qos(q);
 }
 
 static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
@@ -769,6 +762,20 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
 }
 
+void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
+{
+	lockdep_assert_held(&q->debugfs_mutex);
+
+	if (q->rq_qos) {
+		struct rq_qos *rqos = q->rq_qos;
+
+		while (rqos) {
+			blk_mq_debugfs_register_rqos(rqos);
+			rqos = rqos->next;
+		}
+	}
+}
+
 void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 					struct blk_mq_hw_ctx *hctx)
 {
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index c80e453e3014..54948a266889 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -33,6 +33,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
+void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
 void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
 #else
@@ -78,6 +79,10 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 }
 
+static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
+{
+}
+
 static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 {
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v6 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (2 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-30  5:35   ` Nilay Shroff
  2025-12-25 10:32 ` [PATCH v6 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static Yu Kuai
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Currently rq-qos debugfs entries are created from rq_qos_add(), while
rq_qos_add() can be called while queue is still frozen. This can
deadlock because creating new entries can trigger fs reclaim.

Fix this problem by delaying creating rq-qos debugfs entries after queue
is unfrozen.

- For wbt, 1) it can be initialized by default, fix it by calling new
  helper after wbt_init() from wbt_init_enable_default(); 2) it can be
  initialized by sysfs, fix it by calling new helper after queue is
  unfrozen from wbt_set_lat().
- For iocost and iolatency, they can only be initialized by blkcg
  configuration, however, they don't have debugfs entries for now, hence
  they are not handled yet.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-rq-qos.c |  7 -------
 block/blk-wbt.c    | 13 ++++++++++++-
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 654478dfbc20..d7ce99ce2e80 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -347,13 +347,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
 
 	blk_mq_unfreeze_queue(q, memflags);
-
-	if (rqos->ops->debugfs_attrs) {
-		mutex_lock(&q->debugfs_mutex);
-		blk_mq_debugfs_register_rqos(rqos);
-		mutex_unlock(&q->debugfs_mutex);
-	}
-
 	return 0;
 ebusy:
 	blk_mq_unfreeze_queue(q, memflags);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 9bef71ec645d..de3528236545 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -774,6 +774,7 @@ EXPORT_SYMBOL_GPL(wbt_enable_default);
 
 void wbt_init_enable_default(struct gendisk *disk)
 {
+	struct request_queue *q = disk->queue;
 	struct rq_wb *rwb;
 
 	if (!__wbt_enable_default(disk))
@@ -783,8 +784,14 @@ void wbt_init_enable_default(struct gendisk *disk)
 	if (WARN_ON_ONCE(!rwb))
 		return;
 
-	if (WARN_ON_ONCE(wbt_init(disk, rwb)))
+	if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
 		wbt_free(rwb);
+		return;
+	}
+
+	mutex_lock(&q->debugfs_mutex);
+	blk_mq_debugfs_register_rq_qos(q);
+	mutex_unlock(&q->debugfs_mutex);
 }
 
 static u64 wbt_default_latency_nsec(struct request_queue *q)
@@ -1009,5 +1016,9 @@ int wbt_set_lat(struct gendisk *disk, s64 val)
 	blk_mq_unquiesce_queue(q);
 out:
 	blk_mq_unfreeze_queue(q, memflags);
+	mutex_lock(&q->debugfs_mutex);
+	blk_mq_debugfs_register_rq_qos(q);
+	mutex_unlock(&q->debugfs_mutex);
+
 	return ret;
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v6 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock
  2025-12-25 10:32 ` [PATCH v6 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
@ 2025-12-30  5:35   ` Nilay Shroff
  0 siblings, 0 replies; 22+ messages in thread
From: Nilay Shroff @ 2025-12-30  5:35 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/25/25 4:02 PM, Yu Kuai wrote:
> Currently rq-qos debugfs entries are created from rq_qos_add(), while
> rq_qos_add() can be called while queue is still frozen. This can
> deadlock because creating new entries can trigger fs reclaim.
> 
> Fix this problem by delaying creating rq-qos debugfs entries after queue
> is unfrozen.
> 
> - For wbt, 1) it can be initialized by default, fix it by calling new
>   helper after wbt_init() from wbt_init_enable_default(); 2) it can be
>   initialized by sysfs, fix it by calling new helper after queue is
>   unfrozen from wbt_set_lat().
> - For iocost and iolatency, they can only be initialized by blkcg
>   configuration, however, they don't have debugfs entries for now, hence
>   they are not handled yet.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v6 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (3 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 06/13] blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos() Yu Kuai
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Because it's only used inside blk-mq-debugfs.c now.

Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-mq-debugfs.c | 2 +-
 block/blk-mq-debugfs.h | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4fe164b6d648..11f00a868541 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -744,7 +744,7 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 	rqos->debugfs_dir = NULL;
 }
 
-void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
+static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
 	const char *dir_name = rq_qos_id_to_name(rqos->id);
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 54948a266889..d94daa66556b 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -34,7 +34,6 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
-void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
 #else
 static inline void blk_mq_debugfs_register(struct request_queue *q)
@@ -75,10 +74,6 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
 {
 }
 
-static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
-{
-}
-
 static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
 {
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v6 06/13] blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos()
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (4 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-30  5:33   ` Nilay Shroff
  2025-12-25 10:32 ` [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
                   ` (6 subsequent siblings)
  12 siblings, 1 reply; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Remove this helper because it is only used by iocost and iolatency, and
they don't have debugfs entries.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-mq-debugfs.c | 10 ----------
 block/blk-mq-debugfs.h |  4 ----
 block/blk-rq-qos.c     |  4 ----
 3 files changed, 18 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 11f00a868541..22c182b40bc3 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -734,16 +734,6 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
 	return "unknown";
 }
 
-void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
-{
-	lockdep_assert_held(&rqos->disk->queue->debugfs_mutex);
-
-	if (!rqos->disk->queue->debugfs_dir)
-		return;
-	debugfs_remove_recursive(rqos->debugfs_dir);
-	rqos->debugfs_dir = NULL;
-}
-
 static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index d94daa66556b..49bb1aaa83dc 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -34,7 +34,6 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
-void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
 #else
 static inline void blk_mq_debugfs_register(struct request_queue *q)
 {
@@ -78,9 +77,6 @@ static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
 {
 }
 
-static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
-{
-}
 #endif
 
 #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d7ce99ce2e80..85cf74402a09 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -371,8 +371,4 @@ void rq_qos_del(struct rq_qos *rqos)
 	if (!q->rq_qos)
 		blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
 	blk_mq_unfreeze_queue(q, memflags);
-
-	mutex_lock(&q->debugfs_mutex);
-	blk_mq_debugfs_unregister_rqos(rqos);
-	mutex_unlock(&q->debugfs_mutex);
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v6 06/13] blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos()
  2025-12-25 10:32 ` [PATCH v6 06/13] blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos() Yu Kuai
@ 2025-12-30  5:33   ` Nilay Shroff
  2025-12-31  5:38     ` Yu Kuai
  0 siblings, 1 reply; 22+ messages in thread
From: Nilay Shroff @ 2025-12-30  5:33 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei

On 12/25/25 4:02 PM, Yu Kuai wrote:
> diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
> index d7ce99ce2e80..85cf74402a09 100644
> --- a/block/blk-rq-qos.c
> +++ b/block/blk-rq-qos.c
> @@ -371,8 +371,4 @@ void rq_qos_del(struct rq_qos *rqos)
>  	if (!q->rq_qos)
>  		blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
>  	blk_mq_unfreeze_queue(q, memflags);
> -
> -	mutex_lock(&q->debugfs_mutex);
> -	blk_mq_debugfs_unregister_rqos(rqos);
> -	mutex_unlock(&q->debugfs_mutex);
>  }

This change looks good overall, but I have one comment:

Do we really need to freeze the queue in rq_qos_del() here? Currently,
rq_qos_del() is only called from blk_iocost_init() and blk_iolatency_init(),
both of which run while holding ->freeze_lock. Given this, it seems
unnecessary for rq_qos_del() to freeze and unfreeze the queue again.
Instead, we could remove the freeze/unfreeze logic from rq_qos_del() and
add a WARN_ON() or assertion to ensure that the caller has already frozen
the queue before invoking it.
Moreover, with the current logic, rq_qos_del() may reverse the locking
order between ->rq_qos_mutex and ->freeze_lock if the caller has not
already acquired ->freeze_lock, which could lead to potential lock
ordering issues.

Thanks,
--Nilay

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v6 06/13] blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos()
  2025-12-30  5:33   ` Nilay Shroff
@ 2025-12-31  5:38     ` Yu Kuai
  0 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-31  5:38 UTC (permalink / raw)
  To: Nilay Shroff, axboe, linux-block, tj, ming.lei, yukuai

Hi,

在 2025/12/30 13:33, Nilay Shroff 写道:
>
> On 12/25/25 4:02 PM, Yu Kuai wrote:
>> diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
>> index d7ce99ce2e80..85cf74402a09 100644
>> --- a/block/blk-rq-qos.c
>> +++ b/block/blk-rq-qos.c
>> @@ -371,8 +371,4 @@ void rq_qos_del(struct rq_qos *rqos)
>>   	if (!q->rq_qos)
>>   		blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
>>   	blk_mq_unfreeze_queue(q, memflags);
>> -
>> -	mutex_lock(&q->debugfs_mutex);
>> -	blk_mq_debugfs_unregister_rqos(rqos);
>> -	mutex_unlock(&q->debugfs_mutex);
>>   }
> This change looks good overall, but I have one comment:
>
> Do we really need to freeze the queue in rq_qos_del() here? Currently,
> rq_qos_del() is only called from blk_iocost_init() and blk_iolatency_init(),
> both of which run while holding ->freeze_lock. Given this, it seems
> unnecessary for rq_qos_del() to freeze and unfreeze the queue again.
> Instead, we could remove the freeze/unfreeze logic from rq_qos_del() and
> add a WARN_ON() or assertion to ensure that the caller has already frozen
> the queue before invoking it.

Sounds good.

> Moreover, with the current logic, rq_qos_del() may reverse the locking
> order between ->rq_qos_mutex and ->freeze_lock if the caller has not
> already acquired ->freeze_lock, which could lead to potential lock
> ordering issues.

Meanwhile, I see the same situation in blkcg_policy_register(), where
blk_mq_freeze_queue() is called with rq_qos_mutex already held. I'll
clean that up as well.

>
> Thanks,
> --Nilay
>
-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (5 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 06/13] blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos() Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-30  5:37   ` Nilay Shroff
                     ` (2 more replies)
  2025-12-25 10:32 ` [PATCH v6 08/13] blk-throttle: fix possible deadlock for fs reclaim under rq_qos_mutex Yu Kuai
                   ` (5 subsequent siblings)
  12 siblings, 3 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Creating new debugfs entries can trigger fs reclaim, hence we can't do
this with the queue frozen. Likewise, other locks that can be held while
the queue is frozen must not be held either.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-mq-debugfs.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 22c182b40bc3..1e63cfc82630 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -608,9 +608,23 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
 	{},
 };
 
-static void debugfs_create_files(struct dentry *parent, void *data,
+static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
+				 void *data,
 				 const struct blk_mq_debugfs_attr *attr)
 {
+	lockdep_assert_held(&q->debugfs_mutex);
+	/*
+	 * Creating new debugfs entries with queue freezed has the risk of
+	 * deadlock.
+	 */
+	WARN_ON_ONCE(q->mq_freeze_depth != 0);
+	/*
+	 * debugfs_mutex should not be nested under other locks that can be
+	 * grabbed while queue is frozen.
+	 */
+	lockdep_assert_not_held(&q->elevator_lock);
+	lockdep_assert_not_held(&q->rq_qos_mutex);
+
 	if (IS_ERR_OR_NULL(parent))
 		return;
 
@@ -624,7 +638,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
+	debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->debugfs_dir)
@@ -643,7 +657,8 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
 	snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
 	ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir);
 
-	debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs);
+	debugfs_create_files(hctx->queue, ctx_dir, ctx,
+			     blk_mq_debugfs_ctx_attrs);
 }
 
 void blk_mq_debugfs_register_hctx(struct request_queue *q,
@@ -659,7 +674,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
 	hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
 
-	debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs);
+	debugfs_create_files(q, hctx->debugfs_dir, hctx,
+			     blk_mq_debugfs_hctx_attrs);
 
 	hctx_for_each_ctx(hctx, ctx, i)
 		blk_mq_debugfs_register_ctx(hctx, ctx);
@@ -710,7 +726,7 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
 
 	q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir);
 
-	debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
+	debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
 }
 
 void blk_mq_debugfs_unregister_sched(struct request_queue *q)
@@ -749,7 +765,8 @@ static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 							 q->debugfs_dir);
 
 	rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir);
-	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
+	debugfs_create_files(q, rqos->debugfs_dir, rqos,
+			     rqos->ops->debugfs_attrs);
 }
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
@@ -786,7 +803,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 
 	hctx->sched_debugfs_dir = debugfs_create_dir("sched",
 						     hctx->debugfs_dir);
-	debugfs_create_files(hctx->sched_debugfs_dir, hctx,
+	debugfs_create_files(q, hctx->sched_debugfs_dir, hctx,
 			     e->hctx_debugfs_attrs);
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock
  2025-12-25 10:32 ` [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
@ 2025-12-30  5:37   ` Nilay Shroff
  2025-12-30  6:04   ` kernel test robot
  2025-12-30 13:05   ` Nilay Shroff
  2 siblings, 0 replies; 22+ messages in thread
From: Nilay Shroff @ 2025-12-30  5:37 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/25/25 4:02 PM, Yu Kuai wrote:
> Creating new debugfs entries can trigger fs reclaim, hence we can't do
> this with queue frozen, meanwhile, other locks that can be held while
> queue is frozen should not be held as well.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock
  2025-12-25 10:32 ` [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
  2025-12-30  5:37   ` Nilay Shroff
@ 2025-12-30  6:04   ` kernel test robot
  2025-12-30 13:05   ` Nilay Shroff
  2 siblings, 0 replies; 22+ messages in thread
From: kernel test robot @ 2025-12-30  6:04 UTC (permalink / raw)
  To: Yu Kuai
  Cc: oe-lkp, lkp, linux-block, axboe, tj, nilay, ming.lei, yukuai,
	oliver.sang



Hello,

kernel test robot noticed "RIP:debugfs_create_files" on:

commit: 492a1c791dd61f6b2abfc86a4a85acf5db1d0e32 ("[PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock")
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/blk-wbt-factor-out-a-helper-wbt_set_lat/20251225-183443
base: https://git.kernel.org/cgit/linux/kernel/git/axboe/linux.git for-next
patch link: https://lore.kernel.org/all/20251225103248.1303397-8-yukuai@fnnas.com/
patch subject: [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock

in testcase: blktests
version: blktests-x86_64-b1b99d1-1_20251223
with following parameters:

	disk: 1SSD
	test: nvme-005
	nvme_trtype: rdma



config: x86_64-rhel-9.4-func
compiler: gcc-14
test machine: 224 threads 2 sockets Intel(R) Xeon(R) Platinum 8480+ (Sapphire Rapids) with 256G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202512301342.35385eee-lkp@intel.com


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20251230/202512301342.35385eee-lkp@intel.com


[  162.300625][ T1400] nvme nvme2: creating 128 I/O queues.
[  162.876161][ T1400] nvme nvme2: mapped 128/0/0 default/read/poll queues.
[  162.901778][ T1400] ------------[ cut here ]------------
[  162.908519][ T1400] WARNING: block/blk-mq-debugfs.c:620 at debugfs_create_files+0xb8/0xe0, CPU#72: kworker/u898:10/1400
[  162.922316][ T1400] Modules linked in: siw ib_uverbs nvmet_rdma nvmet nvme_auth hkdf nvme_rdma nvme_fabrics rdma_cm iw_cm ib_cm ib_core loop f2fs binfmt_misc intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common intel_ifs i10nm_edac skx_edac_common nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp btrfs blake2b libblake2b xor zstd_compress kvm_intel raid6_pq kvm irqbypass dax_hmem ghash_clmulni_intel ast rapl cxl_acpi snd_pcm pmt_telemetry drm_client_lib spi_nor nvme iaa_crypto qat_4xxx intel_cstate cxl_port mei_me snd_timer pmt_discovery drm_shmem_helper pmt_class intel_sdsi mtd snd intel_qat ipmi_ssif intel_th_gth isst_if_mmio isst_if_mbox_pci i40e cxl_core idxd soundcore intel_th_pci i2c_i801 spi_intel_pci crc8 libie intel_uncore nvme_core einj intel_vsec cdc_ether acpi_power_meter mei drm_kms_helper pcspkr i2c_ismt intel_th wmi spi_intel isst_if_common libie_adminq i2c_smbus idxd_bus authenc ipmi_si acpi_ipmi ipmi_devintf ipmi_msghandler acpi_pad pinctrl_emmitsburg pfr_telemetry
[  162.922410][ T1400]  pfr_update drm fuse nfnetlink
[  163.034763][ T1400] CPU: 72 UID: 0 PID: 1400 Comm: kworker/u898:10 Tainted: G S                  6.19.0-rc1-00238-g492a1c791dd6 #1 PREEMPT(voluntary) 
[  163.050931][ T1400] Tainted: [S]=CPU_OUT_OF_SPEC
[  163.056656][ T1400] Hardware name: Intel Corporation EAGLESTREAM/EAGLESTREAM, BIOS SE5C7411.86B.8118.D04.2206151341 06/15/2022
[  163.070319][ T1400] Workqueue: nvme-reset-wq nvme_rdma_reset_ctrl_work [nvme_rdma]
[  163.079319][ T1400] RIP: 0010:debugfs_create_files+0xb8/0xe0
[  163.086849][ T1400] Code: 89 ef e8 1b 1f d0 ff 48 89 d8 48 c1 e8 03 42 80 3c 20 00 75 23 48 8b 2b 48 85 ed 75 b2 5b 5d 41 5c 41 5d 41 5e c3 cc cc cc cc <0f> 0b e9 5f ff ff ff e8 9c a6 66 ff eb af 48 89 df e8 d2 a6 66 ff
[  163.109632][ T1400] RSP: 0018:ffa00000136c78c0 EFLAGS: 00010202
[  163.116916][ T1400] RAX: 0000000000000007 RBX: ffffffff845523a0 RCX: ffffffff845523a0
[  163.126203][ T1400] RDX: ff110022742fcc00 RSI: ff110020dbcfa400 RDI: 0000000000000001
[  163.135503][ T1400] RBP: ffa00000136c7958 R08: 0000000000000001 R09: fff3fc00026d8f05
[  163.144962][ T1400] R10: ffa00000136c782f R11: 00000000ffffffff R12: ff110022742fcc00
[  163.154244][ T1400] R13: ff110020dbcfa400 R14: ff110022742fcc00 R15: ff110022742fccfe
[  163.163587][ T1400] FS:  0000000000000000(0000) GS:ff11003fd9cf5000(0000) knlGS:0000000000000000
[  163.173957][ T1400] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  163.181790][ T1400] CR2: 00005577f0a48e88 CR3: 000000405ca70004 CR4: 0000000000f73ef0
[  163.191083][ T1400] PKRU: 55555554
[  163.195396][ T1400] Call Trace:
[  163.199405][ T1400]  <TASK>
[  163.203027][ T1400]  blk_mq_debugfs_register_hctx+0x17a/0x440
[  163.210760][ T1400]  ? kobject_add+0x116/0x180
[  163.216245][ T1400]  ? __pfx_blk_mq_debugfs_register_hctx+0x10/0x10
[  163.224443][ T1400]  ? __pfx_mutex_unlock+0x10/0x10
[  163.230393][ T1400]  ? blk_mq_register_hctx+0x1ea/0x420
[  163.236887][ T1400]  blk_mq_debugfs_register_hctxs+0xe6/0x160
[  163.243816][ T1400]  __blk_mq_update_nr_hw_queues+0x544/0xab0
[  163.250866][ T1400]  ? __pfx___blk_mq_update_nr_hw_queues+0x10/0x10
[  163.258381][ T1400]  ? mutex_lock+0x91/0xf0
[  163.263590][ T1400]  ? __pfx_mutex_lock+0x10/0x10
[  163.269330][ T1400]  ? blk_mq_run_hw_queues+0xe1/0x400
[  163.275597][ T1400]  blk_mq_update_nr_hw_queues+0x35/0x50
[  163.282091][ T1400]  nvme_rdma_configure_io_queues.cold+0x3ff/0x72f [nvme_rdma]
[  163.290878][ T1400]  ? __pfx_nvme_rdma_configure_io_queues+0x10/0x10 [nvme_rdma]
[  163.299714][ T1400]  ? nvme_rdma_configure_admin_queue+0x3d4/0x750 [nvme_rdma]
[  163.308274][ T1400]  nvme_rdma_setup_ctrl+0x252/0x4e0 [nvme_rdma]
[  163.315608][ T1400]  ? nvme_change_ctrl_state+0x1a1/0x2e0 [nvme_core]
[  163.323275][ T1400]  nvme_rdma_reset_ctrl_work+0xa7/0x170 [nvme_rdma]
[  163.330935][ T1400]  process_one_work+0x668/0xec0
[  163.336719][ T1400]  worker_thread+0x629/0x10a0
[  163.342203][ T1400]  ? __pfx_worker_thread+0x10/0x10
[  163.348169][ T1400]  kthread+0x39b/0x750
[  163.352977][ T1400]  ? __pfx_kthread+0x10/0x10
[  163.358344][ T1400]  ? __pfx__raw_spin_lock_irq+0x10/0x10
[  163.364774][ T1400]  ? __pfx_kthread+0x10/0x10
[  163.370133][ T1400]  ? __pfx_kthread+0x10/0x10
[  163.375530][ T1400]  ret_from_fork+0x2aa/0x490
[  163.380889][ T1400]  ? __pfx_ret_from_fork+0x10/0x10
[  163.386821][ T1400]  ? switch_fpu+0x13/0x1a0
[  163.391971][ T1400]  ? __switch_to+0x4cd/0xe70
[  163.397293][ T1400]  ? __pfx_kthread+0x10/0x10
[  163.402712][ T1400]  ret_from_fork_asm+0x1a/0x30
[  163.408231][ T1400]  </TASK>
[  163.411794][ T1400] ---[ end trace 0000000000000000 ]---
[  163.447563][ T3933] nvme nvme2: Removing ctrl: NQN "blktests-subsystem-1"
[  163.471043][ T3458] block nvme2n1: no available path - failing I/O
[  163.479008][ T3458] block nvme2n1: no available path - failing I/O
[  163.487171][ T3458] Buffer I/O error on dev nvme2n1, logical block 262142, async page read
[  164.210460][ T3990] SoftiWARP detached

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock
  2025-12-25 10:32 ` [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
  2025-12-30  5:37   ` Nilay Shroff
  2025-12-30  6:04   ` kernel test robot
@ 2025-12-30 13:05   ` Nilay Shroff
  2025-12-31  5:59     ` Yu Kuai
  2 siblings, 1 reply; 22+ messages in thread
From: Nilay Shroff @ 2025-12-30 13:05 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/25/25 4:02 PM, Yu Kuai wrote:
> -static void debugfs_create_files(struct dentry *parent, void *data,
> +static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
> +				 void *data,
>  				 const struct blk_mq_debugfs_attr *attr)
>  {
> +	lockdep_assert_held(&q->debugfs_mutex);
> +	/*
> +	 * Creating new debugfs entries with queue freezed has the risk of
> +	 * deadlock.
> +	 */
> +	WARN_ON_ONCE(q->mq_freeze_depth != 0);
> +	/*
> +	 * debugfs_mutex should not be nested under other locks that can be
> +	 * grabbed while queue is frozen.
> +	 */
> +	lockdep_assert_not_held(&q->elevator_lock);
> +	lockdep_assert_not_held(&q->rq_qos_mutex);
> +
>  	if (IS_ERR_OR_NULL(parent))
>  		return;

I just saw that we have an nr_hw_queue update code path which calls
into the above function while registering debugfs entries for the
hctx, but it doesn't acquire ->debugfs_mutex, so that triggers the
lockdep warning. We need to fix the nr_hw_queue update path so that it
enters the above function only after acquiring ->debugfs_mutex. In
fact, the kernel test robot also complains about the same thing.

Thanks,
--Nilay

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock
  2025-12-30 13:05   ` Nilay Shroff
@ 2025-12-31  5:59     ` Yu Kuai
  0 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-31  5:59 UTC (permalink / raw)
  To: Nilay Shroff, axboe, linux-block, tj, ming.lei, yukuai

Hi,

在 2025/12/30 21:05, Nilay Shroff 写道:
>
> On 12/25/25 4:02 PM, Yu Kuai wrote:
>> -static void debugfs_create_files(struct dentry *parent, void *data,
>> +static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
>> +				 void *data,
>>   				 const struct blk_mq_debugfs_attr *attr)
>>   {
>> +	lockdep_assert_held(&q->debugfs_mutex);
>> +	/*
>> +	 * Creating new debugfs entries with queue freezed has the risk of
>> +	 * deadlock.
>> +	 */
>> +	WARN_ON_ONCE(q->mq_freeze_depth != 0);
>> +	/*
>> +	 * debugfs_mutex should not be nested under other locks that can be
>> +	 * grabbed while queue is frozen.
>> +	 */
>> +	lockdep_assert_not_held(&q->elevator_lock);
>> +	lockdep_assert_not_held(&q->rq_qos_mutex);
>> +
>>   	if (IS_ERR_OR_NULL(parent))
>>   		return;
> I just saw that we've nr_hw_queue update code path which is
> calling into the above function while registering debugfs
> entries for the hctx but it doesn't acquire ->debugfs_mutex.
> So that triggers the lockdep warning. We need to fix nr_hw_queue
> update path so that it enters into the above function after
> acquiring ->debugfs_mutex. In fact, kernel test robot also
> complains about the same.

Sure, I missed that. I'll fix it in the next version.

>
> Thanks,
> --Nilay
>
-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH v6 08/13] blk-throttle: fix possible deadlock for fs reclaim under rq_qos_mutex
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (6 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 09/13] block/blk-rq-qos: add a new helper rq_qos_add_frozen() Yu Kuai
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

blk_throtl_init() can be called with rq_qos_mutex held from blkcg
configuration, and fs reclaim can be triggered because GFP_KERNEL is used
to allocate memory. This can deadlock because rq_qos_mutex can be held
with queue frozen.

Fix the problem by using blkg_conf_open_bdev_frozen(), and also remove
the now-unnecessary queue freeze from blk_throtl_init().

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-throttle.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 97188a795848..6b9e76c6a24b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1310,7 +1310,6 @@ static int blk_throtl_init(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct throtl_data *td;
-	unsigned int memflags;
 	int ret;
 
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
@@ -1319,8 +1318,6 @@ static int blk_throtl_init(struct gendisk *disk)
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
-
-	memflags = blk_mq_freeze_queue(disk->queue);
 	blk_mq_quiesce_queue(disk->queue);
 
 	q->td = td;
@@ -1334,8 +1331,6 @@ static int blk_throtl_init(struct gendisk *disk)
 	}
 
 	blk_mq_unquiesce_queue(disk->queue);
-	blk_mq_unfreeze_queue(disk->queue, memflags);
-
 	return ret;
 }
 
@@ -1345,15 +1340,18 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 {
 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
+	unsigned long memflags;
 	struct throtl_grp *tg;
-	int ret;
+	int ret = 0;
 	u64 v;
 
 	blkg_conf_init(&ctx, buf);
 
-	ret = blkg_conf_open_bdev(&ctx);
-	if (ret)
+	memflags = blkg_conf_open_bdev_frozen(&ctx);
+	if (IS_ERR_VALUE(memflags)) {
+		ret = memflags;
 		goto out_finish;
+	}
 
 	if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
 		ret = blk_throtl_init(ctx.bdev->bd_disk);
@@ -1382,7 +1380,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	tg_conf_updated(tg, false);
 	ret = 0;
 out_finish:
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return ret ?: nbytes;
 }
 
@@ -1529,15 +1527,18 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 {
 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
+	unsigned long memflags;
 	struct throtl_grp *tg;
+	int ret = 0;
 	u64 v[4];
-	int ret;
 
 	blkg_conf_init(&ctx, buf);
 
-	ret = blkg_conf_open_bdev(&ctx);
-	if (ret)
+	memflags = blkg_conf_open_bdev_frozen(&ctx);
+	if (IS_ERR_VALUE(memflags)) {
+		ret = memflags;
 		goto out_finish;
+	}
 
 	if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
 		ret = blk_throtl_init(ctx.bdev->bd_disk);
@@ -1600,7 +1601,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	tg_conf_updated(tg, false);
 	ret = 0;
 out_finish:
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return ret ?: nbytes;
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v6 09/13] block/blk-rq-qos: add a new helper rq_qos_add_frozen()
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (7 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 08/13] blk-throttle: fix possible deadlock for fs reclaim under rq_qos_mutex Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 10/13] blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue Yu Kuai
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

The queue should not be frozen while rq_qos_mutex is held; see the
example in commit 9730763f4756 ("block: correct locking order for
protecting blk-wbt parameters"). This means the current implementation
of rq_qos_add() is problematic. Add a new helper in preparation for
fixing this problem in the following patches.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-rq-qos.c | 21 +++++++++++++++++++++
 block/blk-rq-qos.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 85cf74402a09..b8f163827477 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -322,6 +322,27 @@ void rq_qos_exit(struct request_queue *q)
 	mutex_unlock(&q->rq_qos_mutex);
 }
 
+int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
+		      enum rq_qos_id id, const struct rq_qos_ops *ops)
+{
+	struct request_queue *q = disk->queue;
+
+	WARN_ON_ONCE(q->mq_freeze_depth == 0);
+	lockdep_assert_held(&q->rq_qos_mutex);
+
+	if (rq_qos_id(q, id))
+		return -EBUSY;
+
+	rqos->disk = disk;
+	rqos->id = id;
+	rqos->ops = ops;
+	rqos->next = q->rq_qos;
+	q->rq_qos = rqos;
+	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
+
+	return 0;
+}
+
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 		const struct rq_qos_ops *ops)
 {
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index b538f2c0febc..8d9fb10ae526 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -87,6 +87,8 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
 
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 		const struct rq_qos_ops *ops);
+int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
+		      enum rq_qos_id id, const struct rq_qos_ops *ops);
 void rq_qos_del(struct rq_qos *rqos);
 
 typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v6 10/13] blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (8 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 09/13] block/blk-rq-qos: add a new helper rq_qos_add_frozen() Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 11/13] blk-iocost: " Yu Kuai
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

wbt_init() can be called from the sysfs attribute and from
wbt_init_enable_default(). However, queue_wb_lat_store() can freeze the
queue first, and wbt_init() will then take rq_qos_mutex.

Fix this problem by converting to use new helper rq_qos_add_frozen() in
wbt_init(), and freeze queue before calling wbt_init() from
wbt_init_enable_default().

Fixes: a13bd91be223 ("block/rq_qos: protect rq_qos apis with a new lock")
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-wbt.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index de3528236545..ed8231b6b6e9 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -775,6 +775,7 @@ EXPORT_SYMBOL_GPL(wbt_enable_default);
 void wbt_init_enable_default(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
+	unsigned int memflags;
 	struct rq_wb *rwb;
 
 	if (!__wbt_enable_default(disk))
@@ -784,10 +785,13 @@ void wbt_init_enable_default(struct gendisk *disk)
 	if (WARN_ON_ONCE(!rwb))
 		return;
 
+	memflags = blk_mq_freeze_queue(q);
 	if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
+		blk_mq_unfreeze_queue(q, memflags);
 		wbt_free(rwb);
 		return;
 	}
+	blk_mq_unfreeze_queue(q, memflags);
 
 	mutex_lock(&q->debugfs_mutex);
 	blk_mq_debugfs_register_rq_qos(q);
@@ -962,7 +966,7 @@ static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 	 * Assign rwb and add the stats callback.
 	 */
 	mutex_lock(&q->rq_qos_mutex);
-	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+	ret = rq_qos_add_frozen(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
 		return ret;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v6 11/13] blk-iocost: fix incorrect lock order for rq_qos_mutex and freeze queue
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (9 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 10/13] blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 12/13] blk-iolatency: " Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 13/13] block/blk-rq-qos: cleanup rq_qos_add() Yu Kuai
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Like wbt, rq_qos_add() can be called from two paths, and their lock
orders are inverted:

- From ioc_qos_write(), queue is already frozen before rq_qos_add();
- From ioc_cost_model_write(), rq_qos_add() is called directly;

Fix this problem by converting to use blkg_conf_open_bdev_frozen()
from ioc_cost_model_write(), then since all rq_qos_add() callers
already freeze queue, convert to use rq_qos_add_frozen().

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-iocost.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index a0416927d33d..929fc1421d7e 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2925,7 +2925,7 @@ static int blk_iocost_init(struct gendisk *disk)
 	 * called before policy activation completion, can't assume that the
 	 * target bio has an iocg associated and need to test for NULL iocg.
 	 */
-	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
+	ret = rq_qos_add_frozen(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
 	if (ret)
 		goto err_free_ioc;
 
@@ -3408,7 +3408,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 {
 	struct blkg_conf_ctx ctx;
 	struct request_queue *q;
-	unsigned int memflags;
+	unsigned long memflags;
 	struct ioc *ioc;
 	u64 u[NR_I_LCOEFS];
 	bool user;
@@ -3417,9 +3417,11 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 
 	blkg_conf_init(&ctx, input);
 
-	ret = blkg_conf_open_bdev(&ctx);
-	if (ret)
+	memflags = blkg_conf_open_bdev_frozen(&ctx);
+	if (IS_ERR_VALUE(memflags)) {
+		ret = memflags;
 		goto err;
+	}
 
 	body = ctx.body;
 	q = bdev_get_queue(ctx.bdev);
@@ -3436,7 +3438,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 		ioc = q_to_ioc(q);
 	}
 
-	memflags = blk_mq_freeze_queue(q);
 	blk_mq_quiesce_queue(q);
 
 	spin_lock_irq(&ioc->lock);
@@ -3488,20 +3489,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	spin_unlock_irq(&ioc->lock);
 
 	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return nbytes;
 
 einval:
 	spin_unlock_irq(&ioc->lock);
 
 	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
 	ret = -EINVAL;
 err:
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return ret;
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v6 12/13] blk-iolatency: fix incorrect lock order for rq_qos_mutex and freeze queue
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (10 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 11/13] blk-iocost: " Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  2025-12-25 10:32 ` [PATCH v6 13/13] block/blk-rq-qos: cleanup rq_qos_add() Yu Kuai
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Currently blk-iolatency takes rq_qos_mutex first and then calls
rq_qos_add(), which freezes the queue.

Fix this problem by converting to use blkg_conf_open_bdev_frozen()
from iolatency_set_limit(), and convert to use rq_qos_add_frozen().

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-iolatency.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 45bd18f68541..1558afbf517b 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -764,8 +764,8 @@ static int blk_iolatency_init(struct gendisk *disk)
 	if (!blkiolat)
 		return -ENOMEM;
 
-	ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
-			 &blkcg_iolatency_ops);
+	ret = rq_qos_add_frozen(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
+				&blkcg_iolatency_ops);
 	if (ret)
 		goto err_free;
 	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
@@ -831,16 +831,19 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	struct blkcg_gq *blkg;
 	struct blkg_conf_ctx ctx;
 	struct iolatency_grp *iolat;
+	unsigned long memflags;
 	char *p, *tok;
 	u64 lat_val = 0;
 	u64 oldval;
-	int ret;
+	int ret = 0;
 
 	blkg_conf_init(&ctx, buf);
 
-	ret = blkg_conf_open_bdev(&ctx);
-	if (ret)
+	memflags = blkg_conf_open_bdev_frozen(&ctx);
+	if (IS_ERR_VALUE(memflags)) {
+		ret = memflags;
 		goto out;
+	}
 
 	/*
 	 * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
@@ -890,7 +893,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 		iolatency_clear_scaling(blkg);
 	ret = 0;
 out:
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return ret ?: nbytes;
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v6 13/13] block/blk-rq-qos: cleanup rq_qos_add()
  2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (11 preceding siblings ...)
  2025-12-25 10:32 ` [PATCH v6 12/13] blk-iolatency: " Yu Kuai
@ 2025-12-25 10:32 ` Yu Kuai
  12 siblings, 0 replies; 22+ messages in thread
From: Yu Kuai @ 2025-12-25 10:32 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Now that there is no caller of rq_qos_add(), remove it, and also rename
rq_qos_add_frozen() back to rq_qos_add().

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-iocost.c    |  2 +-
 block/blk-iolatency.c |  4 ++--
 block/blk-rq-qos.c    | 35 ++---------------------------------
 block/blk-rq-qos.h    |  2 --
 block/blk-wbt.c       |  2 +-
 5 files changed, 6 insertions(+), 39 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 929fc1421d7e..0359a5b65202 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2925,7 +2925,7 @@ static int blk_iocost_init(struct gendisk *disk)
 	 * called before policy activation completion, can't assume that the
 	 * target bio has an iocg associated and need to test for NULL iocg.
 	 */
-	ret = rq_qos_add_frozen(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
+	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
 	if (ret)
 		goto err_free_ioc;
 
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 1558afbf517b..5b18125e21c9 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -764,8 +764,8 @@ static int blk_iolatency_init(struct gendisk *disk)
 	if (!blkiolat)
 		return -ENOMEM;
 
-	ret = rq_qos_add_frozen(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
-				&blkcg_iolatency_ops);
+	ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
+			 &blkcg_iolatency_ops);
 	if (ret)
 		goto err_free;
 	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index b8f163827477..20d8e53f063e 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -322,8 +322,8 @@ void rq_qos_exit(struct request_queue *q)
 	mutex_unlock(&q->rq_qos_mutex);
 }
 
-int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
-		      enum rq_qos_id id, const struct rq_qos_ops *ops)
+int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
+	       const struct rq_qos_ops *ops)
 {
 	struct request_queue *q = disk->queue;
 
@@ -343,37 +343,6 @@ int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
 	return 0;
 }
 
-int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
-		const struct rq_qos_ops *ops)
-{
-	struct request_queue *q = disk->queue;
-	unsigned int memflags;
-
-	lockdep_assert_held(&q->rq_qos_mutex);
-
-	rqos->disk = disk;
-	rqos->id = id;
-	rqos->ops = ops;
-
-	/*
-	 * No IO can be in-flight when adding rqos, so freeze queue, which
-	 * is fine since we only support rq_qos for blk-mq queue.
-	 */
-	memflags = blk_mq_freeze_queue(q);
-
-	if (rq_qos_id(q, rqos->id))
-		goto ebusy;
-	rqos->next = q->rq_qos;
-	q->rq_qos = rqos;
-	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
-
-	blk_mq_unfreeze_queue(q, memflags);
-	return 0;
-ebusy:
-	blk_mq_unfreeze_queue(q, memflags);
-	return -EBUSY;
-}
-
 void rq_qos_del(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 8d9fb10ae526..b538f2c0febc 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -87,8 +87,6 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
 
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 		const struct rq_qos_ops *ops);
-int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
-		      enum rq_qos_id id, const struct rq_qos_ops *ops);
 void rq_qos_del(struct rq_qos *rqos);
 
 typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index ed8231b6b6e9..7c3e0d324ab9 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -966,7 +966,7 @@ static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 	 * Assign rwb and add the stats callback.
 	 */
 	mutex_lock(&q->rq_qos_mutex);
-	ret = rq_qos_add_frozen(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
 		return ret;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2025-12-31  5:59 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-25 10:32 [PATCH v6 00/13] blk-mq: fix possible deadlocks Yu Kuai
2025-12-25 10:32 ` [PATCH v6 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
2025-12-25 10:32 ` [PATCH v6 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
2025-12-30  5:34   ` Nilay Shroff
2025-12-25 10:32 ` [PATCH v6 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos Yu Kuai
2025-12-25 10:32 ` [PATCH v6 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
2025-12-30  5:35   ` Nilay Shroff
2025-12-25 10:32 ` [PATCH v6 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static Yu Kuai
2025-12-25 10:32 ` [PATCH v6 06/13] blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos() Yu Kuai
2025-12-30  5:33   ` Nilay Shroff
2025-12-31  5:38     ` Yu Kuai
2025-12-25 10:32 ` [PATCH v6 07/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
2025-12-30  5:37   ` Nilay Shroff
2025-12-30  6:04   ` kernel test robot
2025-12-30 13:05   ` Nilay Shroff
2025-12-31  5:59     ` Yu Kuai
2025-12-25 10:32 ` [PATCH v6 08/13] blk-throttle: fix possible deadlock for fs reclaim under rq_qos_mutex Yu Kuai
2025-12-25 10:32 ` [PATCH v6 09/13] block/blk-rq-qos: add a new helper rq_qos_add_frozen() Yu Kuai
2025-12-25 10:32 ` [PATCH v6 10/13] blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue Yu Kuai
2025-12-25 10:32 ` [PATCH v6 11/13] blk-iocost: " Yu Kuai
2025-12-25 10:32 ` [PATCH v6 12/13] blk-iolatency: " Yu Kuai
2025-12-25 10:32 ` [PATCH v6 13/13] block/blk-rq-qos: cleanup rq_qos_add() Yu Kuai

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.