linux-block.vger.kernel.org archive mirror
* [PATCH v5 00/13] blk-mq: fix possible deadlocks
@ 2025-12-14 10:13 Yu Kuai
  2025-12-14 10:13 ` [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
                   ` (12 more replies)
  0 siblings, 13 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:13 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

changes in v5:
 - free rwb from the wbt_init() caller in patch 2;
 - don't recheck rwb in patch 2 to keep the code cleaner; concurrent
   callers will fail in rq_qos_add();
 - add patch 7 to fix a possible deadlock in blk-throttle;
changes in v4:
 - add patch 1,2 to fix a new deadlock;
changes in v3:
 - remove changes for blk-iolatency and blk-iocost in patch 2, since
   they don't have debugfs entries.
 - add patch 9 to fix lock order for blk-throttle.
changes in v2:
 - combine the two patch sets into one;

Fix deadlocks (a rough sketch of the last class follows the list):
 - patch 1-2, pcpu_alloc_mutex under q_usage_counter;
 - patch 3-6, debugfs_mutex under q_usage_counter;
 - patch 7, fs_reclaim under q_usage_counter;
 - patch 8-13, q_usage_counter under rq_qos_mutex;
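
The last class is the classic inverted nesting; a rough sketch, for
illustration only (the other classes instead nest an allocation or
debugfs_mutex under the freeze):

	/* one path: freeze the queue, then take rq_qos_mutex */
	memflags = blk_mq_freeze_queue(q);	/* holds q_usage_counter */
	mutex_lock(&q->rq_qos_mutex);

	/* another path: take rq_qos_mutex, then freeze the queue */
	mutex_lock(&q->rq_qos_mutex);
	memflags = blk_mq_freeze_queue(q);	/* inverted nesting */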

Yu Kuai (13):
  blk-wbt: factor out a helper wbt_set_lat()
  blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under
    q_usage_counter
  blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
  blk-rq-qos: fix possible debugfs_mutex deadlock
  blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
  blk-mq-debugfs: warn about possible deadlock
  blk-throttle: convert to GFP_NOIO in blk_throtl_init()
  block/blk-rq-qos: add a new helper rq_qos_add_frozen()
  blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue
  blk-iocost: fix incorrect lock order for rq_qos_mutex and freeze queue
  blk-iolatency: fix incorrect lock order for rq_qos_mutex and freeze
    queue
  blk-throttle: remove useless queue frozen
  block/blk-rq-qos: cleanup rq_qos_add()

 block/blk-iocost.c     |  15 ++--
 block/blk-iolatency.c  |  11 +--
 block/blk-mq-debugfs.c |  57 +++++++++++-----
 block/blk-mq-debugfs.h |   4 +-
 block/blk-rq-qos.c     |  27 ++------
 block/blk-sysfs.c      |  39 +----------
 block/blk-throttle.c   |  13 +---
 block/blk-throttle.h   |   3 +-
 block/blk-wbt.c        | 151 ++++++++++++++++++++++++++++++-----------
 block/blk-wbt.h        |   8 +--
 10 files changed, 184 insertions(+), 144 deletions(-)

-- 
2.51.0


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat()
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
@ 2025-12-14 10:13 ` Yu Kuai
  2025-12-16 10:19   ` Ming Lei
  2025-12-18 14:18   ` Nilay Shroff
  2025-12-14 10:13 ` [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
                   ` (11 subsequent siblings)
  12 siblings, 2 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:13 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Move implementation details inside blk-wbt.c, preparing to fix a
possible deadlock in the next patch, where wbt_init() is called while
the queue is frozen.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-sysfs.c | 39 ++----------------------------------
 block/blk-wbt.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++---
 block/blk-wbt.h   |  8 ++------
 3 files changed, 51 insertions(+), 46 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8684c57498cc..96f10e1bcbd1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -636,11 +636,8 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
 static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 				  size_t count)
 {
-	struct request_queue *q = disk->queue;
-	struct rq_qos *rqos;
 	ssize_t ret;
 	s64 val;
-	unsigned int memflags;
 
 	ret = queue_var_store64(&val, page);
 	if (ret < 0)
@@ -648,40 +645,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 	if (val < -1)
 		return -EINVAL;
 
-	/*
-	 * Ensure that the queue is idled, in case the latency update
-	 * ends up either enabling or disabling wbt completely. We can't
-	 * have IO inflight if that happens.
-	 */
-	memflags = blk_mq_freeze_queue(q);
-
-	rqos = wbt_rq_qos(q);
-	if (!rqos) {
-		ret = wbt_init(disk);
-		if (ret)
-			goto out;
-	}
-
-	ret = count;
-	if (val == -1)
-		val = wbt_default_latency_nsec(q);
-	else if (val >= 0)
-		val *= 1000ULL;
-
-	if (wbt_get_min_lat(q) == val)
-		goto out;
-
-	blk_mq_quiesce_queue(q);
-
-	mutex_lock(&disk->rqos_state_mutex);
-	wbt_set_min_lat(q, val);
-	mutex_unlock(&disk->rqos_state_mutex);
-
-	blk_mq_unquiesce_queue(q);
-out:
-	blk_mq_unfreeze_queue(q, memflags);
-
-	return ret;
+	ret = wbt_set_lat(disk, val);
+	return ret ? ret : count;
 }
 
 QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index eb8037bae0bd..ba807649b29a 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -93,6 +93,8 @@ struct rq_wb {
 	struct rq_depth rq_depth;
 };
 
+static int wbt_init(struct gendisk *disk);
+
 static inline struct rq_wb *RQWB(struct rq_qos *rqos)
 {
 	return container_of(rqos, struct rq_wb, rqos);
@@ -506,7 +508,7 @@ u64 wbt_get_min_lat(struct request_queue *q)
 	return RQWB(rqos)->min_lat_nsec;
 }
 
-void wbt_set_min_lat(struct request_queue *q, u64 val)
+static void wbt_set_min_lat(struct request_queue *q, u64 val)
 {
 	struct rq_qos *rqos = wbt_rq_qos(q);
 	if (!rqos)
@@ -729,7 +731,7 @@ void wbt_enable_default(struct gendisk *disk)
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);
 
-u64 wbt_default_latency_nsec(struct request_queue *q)
+static u64 wbt_default_latency_nsec(struct request_queue *q)
 {
 	/*
 	 * We default to 2msec for non-rotational storage, and 75msec
@@ -890,7 +892,7 @@ static const struct rq_qos_ops wbt_rqos_ops = {
 #endif
 };
 
-int wbt_init(struct gendisk *disk)
+static int wbt_init(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct rq_wb *rwb;
@@ -937,3 +939,45 @@ int wbt_init(struct gendisk *disk)
 	return ret;
 
 }
+
+int wbt_set_lat(struct gendisk *disk, s64 val)
+{
+	struct request_queue *q = disk->queue;
+	unsigned int memflags;
+	struct rq_qos *rqos;
+	int ret = 0;
+
+	/*
+	 * Ensure that the queue is idled, in case the latency update
+	 * ends up either enabling or disabling wbt completely. We can't
+	 * have IO inflight if that happens.
+	 */
+	memflags = blk_mq_freeze_queue(q);
+
+	rqos = wbt_rq_qos(q);
+	if (!rqos) {
+		ret = wbt_init(disk);
+		if (ret)
+			goto out;
+	}
+
+	if (val == -1)
+		val = wbt_default_latency_nsec(q);
+	else if (val >= 0)
+		val *= 1000ULL;
+
+	if (wbt_get_min_lat(q) == val)
+		goto out;
+
+	blk_mq_quiesce_queue(q);
+
+	mutex_lock(&disk->rqos_state_mutex);
+	wbt_set_min_lat(q, val);
+	mutex_unlock(&disk->rqos_state_mutex);
+
+	blk_mq_unquiesce_queue(q);
+out:
+	blk_mq_unfreeze_queue(q, memflags);
+
+	return ret;
+}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index e5fc653b9b76..414cf1c34212 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -4,15 +4,11 @@
 
 #ifdef CONFIG_BLK_WBT
 
-int wbt_init(struct gendisk *disk);
 void wbt_disable_default(struct gendisk *disk);
 void wbt_enable_default(struct gendisk *disk);
-
-u64 wbt_get_min_lat(struct request_queue *q);
-void wbt_set_min_lat(struct request_queue *q, u64 val);
 bool wbt_disabled(struct request_queue *);
-
-u64 wbt_default_latency_nsec(struct request_queue *);
+u64 wbt_get_min_lat(struct request_queue *q);
+int wbt_set_lat(struct gendisk *disk, s64 val);
 
 #else
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
  2025-12-14 10:13 ` [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
@ 2025-12-14 10:13 ` Yu Kuai
  2025-12-16 11:03   ` Ming Lei
  2025-12-18 14:20   ` Nilay Shroff
  2025-12-14 10:13 ` [PATCH v5 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos Yu Kuai
                   ` (10 subsequent siblings)
  12 siblings, 2 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:13 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

If wbt is disabled by default and the user configures wbt via sysfs,
the queue will be frozen first and then pcpu_alloc_mutex will be held
in blk_stat_alloc_callback().

Fix this problem by allocating memory before the queue is frozen.
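
Roughly the resulting order in the sysfs path, as a sketch only
(wbt_alloc() and wbt_free() are the helpers added by this patch):

	rwb = wbt_alloc();		/* may take pcpu_alloc_mutex, queue not frozen */
	if (!rwb)
		return -ENOMEM;

	memflags = blk_mq_freeze_queue(q);
	ret = wbt_init(disk, rwb);	/* no allocation under q_usage_counter */
	if (ret)
		wbt_free(rwb);
	blk_mq_unfreeze_queue(q, memflags);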

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-wbt.c | 106 ++++++++++++++++++++++++++++--------------------
 1 file changed, 61 insertions(+), 45 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index ba807649b29a..696baa681717 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -93,7 +93,7 @@ struct rq_wb {
 	struct rq_depth rq_depth;
 };
 
-static int wbt_init(struct gendisk *disk);
+static int wbt_init(struct gendisk *disk, struct rq_wb *rwb);
 
 static inline struct rq_wb *RQWB(struct rq_qos *rqos)
 {
@@ -698,6 +698,41 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
 	}
 }
 
+static int wbt_data_dir(const struct request *rq)
+{
+	const enum req_op op = req_op(rq);
+
+	if (op == REQ_OP_READ)
+		return READ;
+	else if (op_is_write(op))
+		return WRITE;
+
+	/* don't account */
+	return -1;
+}
+
+static struct rq_wb *wbt_alloc(void)
+{
+	struct rq_wb *rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+
+	if (!rwb)
+		return NULL;
+
+	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
+	if (!rwb->cb) {
+		kfree(rwb);
+		return NULL;
+	}
+
+	return rwb;
+}
+
+static void wbt_free(struct rq_wb *rwb)
+{
+	blk_stat_free_callback(rwb->cb);
+	kfree(rwb);
+}
+
 /*
  * Enable wbt if defaults are configured that way
  */
@@ -726,8 +761,15 @@ void wbt_enable_default(struct gendisk *disk)
 	if (!blk_queue_registered(q))
 		return;
 
-	if (queue_is_mq(q) && enable)
-		wbt_init(disk);
+	if (queue_is_mq(q) && enable) {
+		struct rq_wb *rwb = wbt_alloc();
+
+		if (WARN_ON_ONCE(!rwb))
+			return;
+
+		if (WARN_ON_ONCE(wbt_init(disk, rwb)))
+			wbt_free(rwb);
+	}
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);
 
@@ -743,19 +785,6 @@ static u64 wbt_default_latency_nsec(struct request_queue *q)
 		return 75000000ULL;
 }
 
-static int wbt_data_dir(const struct request *rq)
-{
-	const enum req_op op = req_op(rq);
-
-	if (op == REQ_OP_READ)
-		return READ;
-	else if (op_is_write(op))
-		return WRITE;
-
-	/* don't account */
-	return -1;
-}
-
 static void wbt_queue_depth_changed(struct rq_qos *rqos)
 {
 	RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
@@ -767,8 +796,7 @@ static void wbt_exit(struct rq_qos *rqos)
 	struct rq_wb *rwb = RQWB(rqos);
 
 	blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
-	blk_stat_free_callback(rwb->cb);
-	kfree(rwb);
+	wbt_free(rwb);
 }
 
 /*
@@ -892,22 +920,11 @@ static const struct rq_qos_ops wbt_rqos_ops = {
 #endif
 };
 
-static int wbt_init(struct gendisk *disk)
+static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 {
 	struct request_queue *q = disk->queue;
-	struct rq_wb *rwb;
-	int i;
 	int ret;
-
-	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
-	if (!rwb)
-		return -ENOMEM;
-
-	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
-	if (!rwb->cb) {
-		kfree(rwb);
-		return -ENOMEM;
-	}
+	int i;
 
 	for (i = 0; i < WBT_NUM_RWQ; i++)
 		rq_wait_init(&rwb->rq_wait[i]);
@@ -927,38 +944,38 @@ static int wbt_init(struct gendisk *disk)
 	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
-		goto err_free;
+		return ret;
 
 	blk_stat_add_callback(q, rwb->cb);
-
 	return 0;
-
-err_free:
-	blk_stat_free_callback(rwb->cb);
-	kfree(rwb);
-	return ret;
-
 }
 
 int wbt_set_lat(struct gendisk *disk, s64 val)
 {
 	struct request_queue *q = disk->queue;
+	struct rq_qos *rqos = wbt_rq_qos(q);
+	struct rq_wb *rwb = NULL;
 	unsigned int memflags;
-	struct rq_qos *rqos;
 	int ret = 0;
 
+	if (!rqos) {
+		rwb = wbt_alloc();
+		if (!rwb)
+			return -ENOMEM;
+	}
+
 	/*
 	 * Ensure that the queue is idled, in case the latency update
 	 * ends up either enabling or disabling wbt completely. We can't
 	 * have IO inflight if that happens.
 	 */
 	memflags = blk_mq_freeze_queue(q);
-
-	rqos = wbt_rq_qos(q);
 	if (!rqos) {
-		ret = wbt_init(disk);
-		if (ret)
+		ret = wbt_init(disk, rwb);
+		if (ret) {
+			wbt_free(rwb);
 			goto out;
+		}
 	}
 
 	if (val == -1)
@@ -978,6 +995,5 @@ int wbt_set_lat(struct gendisk *disk, s64 val)
 	blk_mq_unquiesce_queue(q);
 out:
 	blk_mq_unfreeze_queue(q, memflags);
-
 	return ret;
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
  2025-12-14 10:13 ` [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
  2025-12-14 10:13 ` [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
@ 2025-12-14 10:13 ` Yu Kuai
  2025-12-18 11:06   ` Ming Lei
  2025-12-18 14:28   ` Nilay Shroff
  2025-12-14 10:13 ` [PATCH v5 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
                   ` (9 subsequent siblings)
  12 siblings, 2 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:13 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

There is already a helper blk_mq_debugfs_register_rqos() to register a
single rqos, however that helper is called synchronously when the rqos
is created, while the queue is frozen.

Prepare to fix a possible deadlock caused by creating blk-mq debugfs
entries while the queue is still frozen.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-mq-debugfs.c | 23 +++++++++++++++--------
 block/blk-mq-debugfs.h |  5 +++++
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4896525b1c05..128d2aa6a20d 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -619,6 +619,20 @@ static void debugfs_create_files(struct dentry *parent, void *data,
 				    (void *)attr, data, &blk_mq_debugfs_fops);
 }
 
+void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
+{
+	lockdep_assert_held(&q->debugfs_mutex);
+
+	if (q->rq_qos) {
+		struct rq_qos *rqos = q->rq_qos;
+
+		while (rqos) {
+			blk_mq_debugfs_register_rqos(rqos);
+			rqos = rqos->next;
+		}
+	}
+}
+
 void blk_mq_debugfs_register(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
@@ -631,14 +645,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 			blk_mq_debugfs_register_hctx(q, hctx);
 	}
 
-	if (q->rq_qos) {
-		struct rq_qos *rqos = q->rq_qos;
-
-		while (rqos) {
-			blk_mq_debugfs_register_rqos(rqos);
-			rqos = rqos->next;
-		}
-	}
+	blk_mq_debugfs_register_rq_qos(q);
 }
 
 static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index c80e453e3014..54948a266889 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -33,6 +33,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
+void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
 void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
 #else
@@ -78,6 +79,10 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 }
 
+static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
+{
+}
+
 static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 {
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (2 preceding siblings ...)
  2025-12-14 10:13 ` [PATCH v5 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos Yu Kuai
@ 2025-12-14 10:13 ` Yu Kuai
  2025-12-18 11:11   ` Ming Lei
  2025-12-18 14:33   ` Nilay Shroff
  2025-12-14 10:14 ` [PATCH v5 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static Yu Kuai
                   ` (8 subsequent siblings)
  12 siblings, 2 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:13 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Currently rq-qos debugfs entries are created from rq_qos_add(), but
rq_qos_add() can be called while the queue is still frozen. This can
deadlock because creating new entries can trigger fs reclaim.

Fix this problem by delaying creation of the rq-qos debugfs entries
until after the queue is unfrozen.

- For wbt, 1) it can be initialized by default, fix it by calling new
  helper after wbt_init() from wbt_enable_default; 2) it can be
  initialized by sysfs, fix it by calling new helper after queue is
  unfrozen from queue_wb_lat_store().
- For iocost and iolatency, they can only be initialized by blkcg
  configuration, however, they don't have debugfs entries for now, hence
  they are not handled yet.
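
With this patch the order in wbt_set_lat() becomes, roughly (a sketch;
the full change is in the diff below):

	memflags = blk_mq_freeze_queue(q);
	/* wbt_init()/rq_qos_add(): no debugfs entries created while frozen */
	blk_mq_unfreeze_queue(q, memflags);

	mutex_lock(&q->debugfs_mutex);
	blk_mq_debugfs_register_rq_qos(q);	/* only after the queue is unfrozen */
	mutex_unlock(&q->debugfs_mutex);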

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-rq-qos.c |  7 -------
 block/blk-wbt.c    | 12 +++++++++++-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 654478dfbc20..d7ce99ce2e80 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -347,13 +347,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
 
 	blk_mq_unfreeze_queue(q, memflags);
-
-	if (rqos->ops->debugfs_attrs) {
-		mutex_lock(&q->debugfs_mutex);
-		blk_mq_debugfs_register_rqos(rqos);
-		mutex_unlock(&q->debugfs_mutex);
-	}
-
 	return 0;
 ebusy:
 	blk_mq_unfreeze_queue(q, memflags);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 696baa681717..4bf6f42bef2e 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -767,8 +767,14 @@ void wbt_enable_default(struct gendisk *disk)
 		if (WARN_ON_ONCE(!rwb))
 			return;
 
-		if (WARN_ON_ONCE(wbt_init(disk, rwb)))
+		if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
 			wbt_free(rwb);
+			return;
+		}
+
+		mutex_lock(&q->debugfs_mutex);
+		blk_mq_debugfs_register_rq_qos(q);
+		mutex_unlock(&q->debugfs_mutex);
 	}
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);
@@ -995,5 +1001,9 @@ int wbt_set_lat(struct gendisk *disk, s64 val)
 	blk_mq_unquiesce_queue(q);
 out:
 	blk_mq_unfreeze_queue(q, memflags);
+	mutex_lock(&q->debugfs_mutex);
+	blk_mq_debugfs_register_rq_qos(q);
+	mutex_unlock(&q->debugfs_mutex);
+
 	return ret;
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (3 preceding siblings ...)
  2025-12-14 10:13 ` [PATCH v5 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  2025-12-18 14:52   ` Nilay Shroff
  2025-12-14 10:14 ` [PATCH v5 06/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
                   ` (7 subsequent siblings)
  12 siblings, 1 reply; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Because it's only used inside blk-mq-debugfs.c now.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-mq-debugfs.c | 4 +++-
 block/blk-mq-debugfs.h | 5 -----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 128d2aa6a20d..99466595c0a4 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -14,6 +14,8 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
+static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
+
 static int queue_poll_stat_show(void *data, struct seq_file *m)
 {
 	return 0;
@@ -758,7 +760,7 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 	rqos->debugfs_dir = NULL;
 }
 
-void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
+static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
 	const char *dir_name = rq_qos_id_to_name(rqos->id);
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 54948a266889..d94daa66556b 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -34,7 +34,6 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
-void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
 void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
 #else
 static inline void blk_mq_debugfs_register(struct request_queue *q)
@@ -75,10 +74,6 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
 {
 }
 
-static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
-{
-}
-
 static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
 {
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 06/13] blk-mq-debugfs: warn about possible deadlock
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (4 preceding siblings ...)
  2025-12-14 10:14 ` [PATCH v5 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  2025-12-18 18:26   ` Nilay Shroff
  2025-12-14 10:14 ` [PATCH v5 07/13] blk-throttle: convert to GFP_NOIO in blk_throtl_init() Yu Kuai
                   ` (6 subsequent siblings)
  12 siblings, 1 reply; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Creating new debugfs entries can trigger fs reclaim, hence we can't do
this with the queue frozen; likewise, other locks that can be held
while the queue is frozen must not be held here either.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-mq-debugfs.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 99466595c0a4..d54f8c29d2f4 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -610,9 +610,22 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
 	{},
 };
 
-static void debugfs_create_files(struct dentry *parent, void *data,
+static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
+				 void *data,
 				 const struct blk_mq_debugfs_attr *attr)
 {
+	/*
+	 * Creating new debugfs entries with the queue frozen has the risk of
+	 * deadlock.
+	 */
+	WARN_ON_ONCE(q->mq_freeze_depth != 0);
+	/*
+	 * debugfs_mutex should not be nested under other locks that can be
+	 * grabbed while queue is frozen.
+	 */
+	lockdep_assert_not_held(&q->elevator_lock);
+	lockdep_assert_not_held(&q->rq_qos_mutex);
+
 	if (IS_ERR_OR_NULL(parent))
 		return;
 
@@ -640,7 +653,7 @@ void blk_mq_debugfs_register(struct request_queue *q)
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long i;
 
-	debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
+	debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->debugfs_dir)
@@ -659,7 +672,8 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
 	snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
 	ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir);
 
-	debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs);
+	debugfs_create_files(hctx->queue, ctx_dir, ctx,
+			     blk_mq_debugfs_ctx_attrs);
 }
 
 void blk_mq_debugfs_register_hctx(struct request_queue *q,
@@ -675,7 +689,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
 	snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
 	hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
 
-	debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs);
+	debugfs_create_files(q, hctx->debugfs_dir, hctx,
+			     blk_mq_debugfs_hctx_attrs);
 
 	hctx_for_each_ctx(hctx, ctx, i)
 		blk_mq_debugfs_register_ctx(hctx, ctx);
@@ -726,7 +741,7 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
 
 	q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir);
 
-	debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
+	debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
 }
 
 void blk_mq_debugfs_unregister_sched(struct request_queue *q)
@@ -775,7 +790,8 @@ static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 							 q->debugfs_dir);
 
 	rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir);
-	debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
+	debugfs_create_files(q, rqos->debugfs_dir, rqos,
+			     rqos->ops->debugfs_attrs);
 }
 
 void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
@@ -798,7 +814,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 
 	hctx->sched_debugfs_dir = debugfs_create_dir("sched",
 						     hctx->debugfs_dir);
-	debugfs_create_files(hctx->sched_debugfs_dir, hctx,
+	debugfs_create_files(q, hctx->sched_debugfs_dir, hctx,
 			     e->hctx_debugfs_attrs);
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 07/13] blk-throttle: convert to GFP_NOIO in blk_throtl_init()
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (5 preceding siblings ...)
  2025-12-14 10:14 ` [PATCH v5 06/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  2025-12-14 10:14 ` [PATCH v5 08/13] block/blk-rq-qos: add a new helper rq_qos_add_frozen() Yu Kuai
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

blk_throtl_init() can be called with rq_qos_mutex held from blkcg
configuration, hence triggering fs reclaim there can deadlock, because
rq_qos_mutex can be held while the queue is frozen.
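
A rough sketch of the dependency chain being avoided (illustrative
only):

	/*
	 * blkcg config path: rq_qos_mutex -> GFP_KERNEL alloc -> fs reclaim -> IO
	 * another path:      freeze queue (q_usage_counter) -> rq_qos_mutex
	 *
	 * Reclaim IO can't make progress while the queue is frozen, so the
	 * allocation must not enter reclaim that issues IO:
	 */
	td = kzalloc_node(sizeof(*td), GFP_NOIO, q->node);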

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-throttle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 97188a795848..5530a9bb0620 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1313,7 +1313,7 @@ static int blk_throtl_init(struct gendisk *disk)
 	unsigned int memflags;
 	int ret;
 
-	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+	td = kzalloc_node(sizeof(*td), GFP_NOIO, q->node);
 	if (!td)
 		return -ENOMEM;
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 08/13] block/blk-rq-qos: add a new helper rq_qos_add_frozen()
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (6 preceding siblings ...)
  2025-12-14 10:14 ` [PATCH v5 07/13] blk-throttle: convert to GFP_NOIO in blk_throtl_init() Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  2025-12-14 10:14 ` [PATCH v5 09/13] blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue Yu Kuai
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

The queue should not be frozen under rq_qos_mutex, see the example from
commit 9730763f4756 ("block: correct locking order for protecting blk-wbt
parameters"), which means the current implementation of rq_qos_add() is
problematic. Add a new helper to prepare for fixing this problem in the
following patches.
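
The expected calling convention for the new helper, roughly (a sketch;
the corresponding asserts are added in the diff below):

	memflags = blk_mq_freeze_queue(q);
	mutex_lock(&q->rq_qos_mutex);
	ret = rq_qos_add_frozen(rqos, disk, id, ops);
	mutex_unlock(&q->rq_qos_mutex);
	blk_mq_unfreeze_queue(q, memflags);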

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-rq-qos.c | 21 +++++++++++++++++++++
 block/blk-rq-qos.h |  2 ++
 2 files changed, 23 insertions(+)

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d7ce99ce2e80..c1a94c2a9742 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -322,6 +322,27 @@ void rq_qos_exit(struct request_queue *q)
 	mutex_unlock(&q->rq_qos_mutex);
 }
 
+int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
+		      enum rq_qos_id id, const struct rq_qos_ops *ops)
+{
+	struct request_queue *q = disk->queue;
+
+	WARN_ON_ONCE(q->mq_freeze_depth == 0);
+	lockdep_assert_held(&q->rq_qos_mutex);
+
+	if (rq_qos_id(q, id))
+		return -EBUSY;
+
+	rqos->disk = disk;
+	rqos->id = id;
+	rqos->ops = ops;
+	rqos->next = q->rq_qos;
+	q->rq_qos = rqos;
+	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
+
+	return 0;
+}
+
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 		const struct rq_qos_ops *ops)
 {
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index b538f2c0febc..8d9fb10ae526 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -87,6 +87,8 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
 
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 		const struct rq_qos_ops *ops);
+int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
+		      enum rq_qos_id id, const struct rq_qos_ops *ops);
 void rq_qos_del(struct rq_qos *rqos);
 
 typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 09/13] blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (7 preceding siblings ...)
  2025-12-14 10:14 ` [PATCH v5 08/13] block/blk-rq-qos: add a new helper rq_qos_add_frozen() Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  2025-12-14 10:14 ` [PATCH v5 10/13] blk-iocost: " Yu Kuai
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

wbt_init() can be called from the sysfs attribute and from
wbt_enable_default(), however the lock orders are inverted:

- queue_wb_lat_store() freezes the queue first, and then wbt_init()
  holds rq_qos_mutex. In this case the queue is frozen again inside
  rq_qos_add(), although the recursive freeze there has no real effect;
- wbt_enable_default() from an elevator switch holds rq_qos_mutex
  first, and then rq_qos_add() freezes the queue;

Fix this problem by converting wbt_init() to use the new helper
rq_qos_add_frozen(), and, for wbt_enable_default(), by freezing the
queue before calling wbt_init().

Fixes: a13bd91be223 ("block/rq_qos: protect rq_qos apis with a new lock")
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-wbt.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 4bf6f42bef2e..d852317c9cb1 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -19,6 +19,7 @@
  * Copyright (C) 2016 Jens Axboe
  *
  */
+#include <linux/blk-mq.h>
 #include <linux/kernel.h>
 #include <linux/blk_types.h>
 #include <linux/slab.h>
@@ -763,14 +764,18 @@ void wbt_enable_default(struct gendisk *disk)
 
 	if (queue_is_mq(q) && enable) {
 		struct rq_wb *rwb = wbt_alloc();
+		unsigned int memflags;
 
 		if (WARN_ON_ONCE(!rwb))
 			return;
 
+		memflags = blk_mq_freeze_queue(q);
 		if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
+			blk_mq_unfreeze_queue(q, memflags);
 			wbt_free(rwb);
 			return;
 		}
+		blk_mq_unfreeze_queue(q, memflags);
 
 		mutex_lock(&q->debugfs_mutex);
 		blk_mq_debugfs_register_rq_qos(q);
@@ -947,7 +952,7 @@ static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 	 * Assign rwb and add the stats callback.
 	 */
 	mutex_lock(&q->rq_qos_mutex);
-	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+	ret = rq_qos_add_frozen(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
 		return ret;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 10/13] blk-iocost: fix incorrect lock order for rq_qos_mutex and freeze queue
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (8 preceding siblings ...)
  2025-12-14 10:14 ` [PATCH v5 09/13] blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  2025-12-14 10:14 ` [PATCH v5 11/13] blk-iolatency: " Yu Kuai
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Like wbt, rq_qos_add() can be called from two paths and the lock orders
are inverted:

- From ioc_qos_write(), the queue is already frozen before rq_qos_add();
- From ioc_cost_model_write(), rq_qos_add() is called directly;

Fix this problem by converting ioc_cost_model_write() to use
blkg_conf_open_bdev_frozen(); then, since all rq_qos_add() callers
already freeze the queue, convert to rq_qos_add_frozen().

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-iocost.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index a0416927d33d..929fc1421d7e 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2925,7 +2925,7 @@ static int blk_iocost_init(struct gendisk *disk)
 	 * called before policy activation completion, can't assume that the
 	 * target bio has an iocg associated and need to test for NULL iocg.
 	 */
-	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
+	ret = rq_qos_add_frozen(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
 	if (ret)
 		goto err_free_ioc;
 
@@ -3408,7 +3408,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 {
 	struct blkg_conf_ctx ctx;
 	struct request_queue *q;
-	unsigned int memflags;
+	unsigned long memflags;
 	struct ioc *ioc;
 	u64 u[NR_I_LCOEFS];
 	bool user;
@@ -3417,9 +3417,11 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 
 	blkg_conf_init(&ctx, input);
 
-	ret = blkg_conf_open_bdev(&ctx);
-	if (ret)
+	memflags = blkg_conf_open_bdev_frozen(&ctx);
+	if (IS_ERR_VALUE(memflags)) {
+		ret = memflags;
 		goto err;
+	}
 
 	body = ctx.body;
 	q = bdev_get_queue(ctx.bdev);
@@ -3436,7 +3438,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 		ioc = q_to_ioc(q);
 	}
 
-	memflags = blk_mq_freeze_queue(q);
 	blk_mq_quiesce_queue(q);
 
 	spin_lock_irq(&ioc->lock);
@@ -3488,20 +3489,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
 	spin_unlock_irq(&ioc->lock);
 
 	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return nbytes;
 
 einval:
 	spin_unlock_irq(&ioc->lock);
 
 	blk_mq_unquiesce_queue(q);
-	blk_mq_unfreeze_queue(q, memflags);
 
 	ret = -EINVAL;
 err:
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return ret;
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 11/13] blk-iolatency: fix incorrect lock order for rq_qos_mutex and freeze queue
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (9 preceding siblings ...)
  2025-12-14 10:14 ` [PATCH v5 10/13] blk-iocost: " Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  2025-12-14 10:14 ` [PATCH v5 12/13] blk-throttle: remove useless queue frozen Yu Kuai
  2025-12-14 10:14 ` [PATCH v5 13/13] block/blk-rq-qos: cleanup rq_qos_add() Yu Kuai
  12 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Currently blk-iolatency holds rq_qos_mutex first and then calls
rq_qos_add(), which freezes the queue.

Fix this problem by converting iolatency_set_limit() to use
blkg_conf_open_bdev_frozen(), and convert to rq_qos_add_frozen().

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-iolatency.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 45bd18f68541..1558afbf517b 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -764,8 +764,8 @@ static int blk_iolatency_init(struct gendisk *disk)
 	if (!blkiolat)
 		return -ENOMEM;
 
-	ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
-			 &blkcg_iolatency_ops);
+	ret = rq_qos_add_frozen(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
+				&blkcg_iolatency_ops);
 	if (ret)
 		goto err_free;
 	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
@@ -831,16 +831,19 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	struct blkcg_gq *blkg;
 	struct blkg_conf_ctx ctx;
 	struct iolatency_grp *iolat;
+	unsigned long memflags;
 	char *p, *tok;
 	u64 lat_val = 0;
 	u64 oldval;
-	int ret;
+	int ret = 0;
 
 	blkg_conf_init(&ctx, buf);
 
-	ret = blkg_conf_open_bdev(&ctx);
-	if (ret)
+	memflags = blkg_conf_open_bdev_frozen(&ctx);
+	if (IS_ERR_VALUE(memflags)) {
+		ret = memflags;
 		goto out;
+	}
 
 	/*
 	 * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
@@ -890,7 +893,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 		iolatency_clear_scaling(blkg);
 	ret = 0;
 out:
-	blkg_conf_exit(&ctx);
+	blkg_conf_exit_frozen(&ctx, memflags);
 	return ret ?: nbytes;
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 12/13] blk-throttle: remove useless queue frozen
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (10 preceding siblings ...)
  2025-12-14 10:14 ` [PATCH v5 11/13] blk-iolatency: " Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  2025-12-14 10:14 ` [PATCH v5 13/13] block/blk-rq-qos: cleanup rq_qos_add() Yu Kuai
  12 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

blk-throttle still holds rq_qos_mutex before freezing the queue from
blk_throtl_init().

However, blk_throtl_bio() can be called before q_usage_counter is
grabbed, hence freezing the queue really doesn't stop new IO from being
issued to blk-throttle.

Also use READ_ONCE() and WRITE_ONCE() for q->td because blk_throtl_init()
can run concurrently with IO submission.
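
The pairing this relies on, roughly (sketch):

	/* blk_throtl_init(), writer side */
	WRITE_ONCE(q->td, td);

	/* blk_throtl_activated(), read from the IO submission path */
	return READ_ONCE(q->td) &&
	       blkcg_policy_enabled(q, &blkcg_policy_throtl);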

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-throttle.c | 11 ++---------
 block/blk-throttle.h |  3 ++-
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 5530a9bb0620..516481722a98 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1310,7 +1310,6 @@ static int blk_throtl_init(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct throtl_data *td;
-	unsigned int memflags;
 	int ret;
 
 	td = kzalloc_node(sizeof(*td), GFP_NOIO, q->node);
@@ -1320,22 +1319,16 @@ static int blk_throtl_init(struct gendisk *disk)
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
 
-	memflags = blk_mq_freeze_queue(disk->queue);
-	blk_mq_quiesce_queue(disk->queue);
-
-	q->td = td;
+	WRITE_ONCE(q->td, td);
 	td->queue = q;
 
 	/* activate policy, blk_throtl_activated() will return true */
 	ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
 	if (ret) {
-		q->td = NULL;
+		WRITE_ONCE(q->td, NULL);
 		kfree(td);
 	}
 
-	blk_mq_unquiesce_queue(disk->queue);
-	blk_mq_unfreeze_queue(disk->queue, memflags);
-
 	return ret;
 }
 
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 9d7a42c039a1..3d177b20f9e1 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -162,7 +162,8 @@ static inline bool blk_throtl_activated(struct request_queue *q)
 	 * blkcg_policy_enabled() guarantees that the policy is activated
 	 * in the request_queue.
 	 */
-	return q->td != NULL && blkcg_policy_enabled(q, &blkcg_policy_throtl);
+	return READ_ONCE(q->td) &&
+	       blkcg_policy_enabled(q, &blkcg_policy_throtl);
 }
 
 static inline bool blk_should_throtl(struct bio *bio)
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v5 13/13] block/blk-rq-qos: cleanup rq_qos_add()
  2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
                   ` (11 preceding siblings ...)
  2025-12-14 10:14 ` [PATCH v5 12/13] blk-throttle: remove useless queue frozen Yu Kuai
@ 2025-12-14 10:14 ` Yu Kuai
  12 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-14 10:14 UTC (permalink / raw)
  To: axboe, linux-block, tj, nilay, ming.lei; +Cc: yukuai

Now that there is no caller of rq_qos_add(), remove it, and also rename
rq_qos_add_frozen() back to rq_qos_add().

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 block/blk-iocost.c    |  2 +-
 block/blk-iolatency.c |  4 ++--
 block/blk-rq-qos.c    | 35 ++---------------------------------
 block/blk-rq-qos.h    |  2 --
 block/blk-wbt.c       |  2 +-
 5 files changed, 6 insertions(+), 39 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 929fc1421d7e..0359a5b65202 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2925,7 +2925,7 @@ static int blk_iocost_init(struct gendisk *disk)
 	 * called before policy activation completion, can't assume that the
 	 * target bio has an iocg associated and need to test for NULL iocg.
 	 */
-	ret = rq_qos_add_frozen(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
+	ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
 	if (ret)
 		goto err_free_ioc;
 
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 1558afbf517b..5b18125e21c9 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -764,8 +764,8 @@ static int blk_iolatency_init(struct gendisk *disk)
 	if (!blkiolat)
 		return -ENOMEM;
 
-	ret = rq_qos_add_frozen(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
-				&blkcg_iolatency_ops);
+	ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
+			 &blkcg_iolatency_ops);
 	if (ret)
 		goto err_free;
 	ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index c1a94c2a9742..8de7dae3273e 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -322,8 +322,8 @@ void rq_qos_exit(struct request_queue *q)
 	mutex_unlock(&q->rq_qos_mutex);
 }
 
-int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
-		      enum rq_qos_id id, const struct rq_qos_ops *ops)
+int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
+	       const struct rq_qos_ops *ops)
 {
 	struct request_queue *q = disk->queue;
 
@@ -343,37 +343,6 @@ int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
 	return 0;
 }
 
-int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
-		const struct rq_qos_ops *ops)
-{
-	struct request_queue *q = disk->queue;
-	unsigned int memflags;
-
-	lockdep_assert_held(&q->rq_qos_mutex);
-
-	rqos->disk = disk;
-	rqos->id = id;
-	rqos->ops = ops;
-
-	/*
-	 * No IO can be in-flight when adding rqos, so freeze queue, which
-	 * is fine since we only support rq_qos for blk-mq queue.
-	 */
-	memflags = blk_mq_freeze_queue(q);
-
-	if (rq_qos_id(q, rqos->id))
-		goto ebusy;
-	rqos->next = q->rq_qos;
-	q->rq_qos = rqos;
-	blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
-
-	blk_mq_unfreeze_queue(q, memflags);
-	return 0;
-ebusy:
-	blk_mq_unfreeze_queue(q, memflags);
-	return -EBUSY;
-}
-
 void rq_qos_del(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->disk->queue;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 8d9fb10ae526..b538f2c0febc 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -87,8 +87,6 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
 
 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
 		const struct rq_qos_ops *ops);
-int rq_qos_add_frozen(struct rq_qos *rqos, struct gendisk *disk,
-		      enum rq_qos_id id, const struct rq_qos_ops *ops);
 void rq_qos_del(struct rq_qos *rqos);
 
 typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index d852317c9cb1..4645e8fbca64 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -952,7 +952,7 @@ static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
 	 * Assign rwb and add the stats callback.
 	 */
 	mutex_lock(&q->rq_qos_mutex);
-	ret = rq_qos_add_frozen(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
 	mutex_unlock(&q->rq_qos_mutex);
 	if (ret)
 		return ret;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat()
  2025-12-14 10:13 ` [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
@ 2025-12-16 10:19   ` Ming Lei
  2025-12-18 14:18   ` Nilay Shroff
  1 sibling, 0 replies; 27+ messages in thread
From: Ming Lei @ 2025-12-16 10:19 UTC (permalink / raw)
  To: Yu Kuai; +Cc: axboe, linux-block, tj, nilay

On Sun, Dec 14, 2025 at 06:13:56PM +0800, Yu Kuai wrote:
> Move implementation details inside blk-wbt.c, preparing to fix a
> possible deadlock in the next patch, where wbt_init() is called while
> the queue is frozen.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

Reviewed-by: Ming Lei <ming.lei@redhat.com>


Thanks,
Ming


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
  2025-12-14 10:13 ` [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
@ 2025-12-16 11:03   ` Ming Lei
  2025-12-16 11:29     ` Yu Kuai
  2025-12-18 14:20   ` Nilay Shroff
  1 sibling, 1 reply; 27+ messages in thread
From: Ming Lei @ 2025-12-16 11:03 UTC (permalink / raw)
  To: Yu Kuai; +Cc: axboe, linux-block, tj, nilay

On Sun, Dec 14, 2025 at 06:13:57PM +0800, Yu Kuai wrote:
> If wbt is disabled by default and the user configures wbt via sysfs,
> the queue will be frozen first and then pcpu_alloc_mutex will be held
> in blk_stat_alloc_callback().
> 
> Fix this problem by allocating memory before the queue is frozen.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
> ---
>  block/blk-wbt.c | 106 ++++++++++++++++++++++++++++--------------------
>  1 file changed, 61 insertions(+), 45 deletions(-)
> 
> diff --git a/block/blk-wbt.c b/block/blk-wbt.c
> index ba807649b29a..696baa681717 100644
> --- a/block/blk-wbt.c
> +++ b/block/blk-wbt.c
> @@ -93,7 +93,7 @@ struct rq_wb {
>  	struct rq_depth rq_depth;
>  };
>  
> -static int wbt_init(struct gendisk *disk);
> +static int wbt_init(struct gendisk *disk, struct rq_wb *rwb);
>  
>  static inline struct rq_wb *RQWB(struct rq_qos *rqos)
>  {
> @@ -698,6 +698,41 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
>  	}
>  }
>  
> +static int wbt_data_dir(const struct request *rq)
> +{
> +	const enum req_op op = req_op(rq);
> +
> +	if (op == REQ_OP_READ)
> +		return READ;
> +	else if (op_is_write(op))
> +		return WRITE;
> +
> +	/* don't account */
> +	return -1;
> +}
> +
> +static struct rq_wb *wbt_alloc(void)
> +{
> +	struct rq_wb *rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
> +
> +	if (!rwb)
> +		return NULL;
> +
> +	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
> +	if (!rwb->cb) {
> +		kfree(rwb);
> +		return NULL;
> +	}
> +
> +	return rwb;
> +}
> +
> +static void wbt_free(struct rq_wb *rwb)
> +{
> +	blk_stat_free_callback(rwb->cb);
> +	kfree(rwb);
> +}
> +
>  /*
>   * Enable wbt if defaults are configured that way
>   */
> @@ -726,8 +761,15 @@ void wbt_enable_default(struct gendisk *disk)
>  	if (!blk_queue_registered(q))
>  		return;
>  
> -	if (queue_is_mq(q) && enable)
> -		wbt_init(disk);
> +	if (queue_is_mq(q) && enable) {
> +		struct rq_wb *rwb = wbt_alloc();
> +
> +		if (WARN_ON_ONCE(!rwb))
> +			return;
> +
> +		if (WARN_ON_ONCE(wbt_init(disk, rwb)))
> +			wbt_free(rwb);
> +	}
>  }
>  EXPORT_SYMBOL_GPL(wbt_enable_default);
>  
> @@ -743,19 +785,6 @@ static u64 wbt_default_latency_nsec(struct request_queue *q)
>  		return 75000000ULL;
>  }
>  
> -static int wbt_data_dir(const struct request *rq)
> -{
> -	const enum req_op op = req_op(rq);
> -
> -	if (op == REQ_OP_READ)
> -		return READ;
> -	else if (op_is_write(op))
> -		return WRITE;
> -
> -	/* don't account */
> -	return -1;
> -}
> -
>  static void wbt_queue_depth_changed(struct rq_qos *rqos)
>  {
>  	RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
> @@ -767,8 +796,7 @@ static void wbt_exit(struct rq_qos *rqos)
>  	struct rq_wb *rwb = RQWB(rqos);
>  
>  	blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
> -	blk_stat_free_callback(rwb->cb);
> -	kfree(rwb);
> +	wbt_free(rwb);
>  }
>  
>  /*
> @@ -892,22 +920,11 @@ static const struct rq_qos_ops wbt_rqos_ops = {
>  #endif
>  };
>  
> -static int wbt_init(struct gendisk *disk)
> +static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
>  {
>  	struct request_queue *q = disk->queue;
> -	struct rq_wb *rwb;
> -	int i;
>  	int ret;
> -
> -	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
> -	if (!rwb)
> -		return -ENOMEM;
> -
> -	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
> -	if (!rwb->cb) {
> -		kfree(rwb);
> -		return -ENOMEM;
> -	}
> +	int i;
>  
>  	for (i = 0; i < WBT_NUM_RWQ; i++)
>  		rq_wait_init(&rwb->rq_wait[i]);
> @@ -927,38 +944,38 @@ static int wbt_init(struct gendisk *disk)
>  	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
>  	mutex_unlock(&q->rq_qos_mutex);
>  	if (ret)
> -		goto err_free;
> +		return ret;
>  
>  	blk_stat_add_callback(q, rwb->cb);
> -
>  	return 0;
> -
> -err_free:
> -	blk_stat_free_callback(rwb->cb);
> -	kfree(rwb);
> -	return ret;
> -
>  }
>  
>  int wbt_set_lat(struct gendisk *disk, s64 val)
>  {
>  	struct request_queue *q = disk->queue;
> +	struct rq_qos *rqos = wbt_rq_qos(q);
> +	struct rq_wb *rwb = NULL;
>  	unsigned int memflags;
> -	struct rq_qos *rqos;
>  	int ret = 0;
>  
> +	if (!rqos) {
> +		rwb = wbt_alloc();
> +		if (!rwb)
> +			return -ENOMEM;
> +	}
> +
>  	/*
>  	 * Ensure that the queue is idled, in case the latency update
>  	 * ends up either enabling or disabling wbt completely. We can't
>  	 * have IO inflight if that happens.
>  	 */
>  	memflags = blk_mq_freeze_queue(q);
> -
> -	rqos = wbt_rq_qos(q);
>  	if (!rqos) {
> -		ret = wbt_init(disk);
> -		if (ret)
> +		ret = wbt_init(disk, rwb);
> +		if (ret) {
> +			wbt_free(rwb);
>  			goto out;
> +		}

Here it actually depends on the patch "block: fix race between wbt_enable_default and IO submission",
which kills wbt_init() from the bfq & iocost code paths; otherwise you may
have to handle -EBUSY from wbt_init().

With the mentioned patch, this fix looks fine:

Reviewed-by: Ming Lei <ming.lei@redhat.com>


Thanks,
Ming


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
  2025-12-16 11:03   ` Ming Lei
@ 2025-12-16 11:29     ` Yu Kuai
  0 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-16 11:29 UTC (permalink / raw)
  To: Ming Lei; +Cc: axboe, linux-block, tj, nilay, yukuai

Hi,

On 2025/12/16 19:03, Ming Lei wrote:
>>   	if (!rqos) {
>> -		ret = wbt_init(disk);
>> -		if (ret)
>> +		ret = wbt_init(disk, rwb);
>> +		if (ret) {
>> +			wbt_free(rwb);
>>   			goto out;
>> +		}
> Here it actually depends on patch "block: fix race between wbt_enable_default and IO submission"
> which kills wbt_init() from bfq & iocost code path, otherwise you may have
> to handle -EBUSY from wbt_init().

Sure, I'll rebase after your patch is applied.

>
> With the mentioned patch, this fix looks fine:

-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
  2025-12-14 10:13 ` [PATCH v5 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos Yu Kuai
@ 2025-12-18 11:06   ` Ming Lei
  2025-12-18 14:28   ` Nilay Shroff
  1 sibling, 0 replies; 27+ messages in thread
From: Ming Lei @ 2025-12-18 11:06 UTC (permalink / raw)
  To: Yu Kuai; +Cc: axboe, linux-block, tj, nilay

On Sun, Dec 14, 2025 at 06:13:58PM +0800, Yu Kuai wrote:
> There is already a helper blk_mq_debugfs_register_rqos() to register a
> single rqos, however that helper is called synchronously when the rqos
> is created, while the queue is frozen.
> 
> Prepare to fix a possible deadlock caused by creating blk-mq debugfs
> entries while the queue is still frozen.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

Reviewed-by: Ming Lei <ming.lei@redhat.com>


Thanks,
Ming


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock
  2025-12-14 10:13 ` [PATCH v5 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
@ 2025-12-18 11:11   ` Ming Lei
  2025-12-18 14:33   ` Nilay Shroff
  1 sibling, 0 replies; 27+ messages in thread
From: Ming Lei @ 2025-12-18 11:11 UTC (permalink / raw)
  To: Yu Kuai; +Cc: axboe, linux-block, tj, nilay

On Sun, Dec 14, 2025 at 06:13:59PM +0800, Yu Kuai wrote:
> Currently rq-qos debugfs entries are created from rq_qos_add(), but
> rq_qos_add() can be called while the queue is still frozen. This can
> deadlock because creating new entries can trigger fs reclaim.
> 
> Fix this problem by delaying creation of the rq-qos debugfs entries
> until after the queue is unfrozen.
> 
> - For wbt, 1) it can be initialized by default, fix it by calling new
>   helper after wbt_init() from wbt_enable_default; 2) it can be
>   initialized by sysfs, fix it by calling new helper after queue is
>   unfrozen from queue_wb_lat_store().
> - For iocost and iolatency, they can only be initialized by blkcg
>   configuration, however, they don't have debugfs entries for now, hence
>   they are not handled yet.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

This patch needs to be rebased on the following -rc2.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat()
  2025-12-14 10:13 ` [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
  2025-12-16 10:19   ` Ming Lei
@ 2025-12-18 14:18   ` Nilay Shroff
  1 sibling, 0 replies; 27+ messages in thread
From: Nilay Shroff @ 2025-12-18 14:18 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/14/25 3:43 PM, Yu Kuai wrote:
> Move implementation details inside blk-wbt.c, preparing to fix a
> possible deadlock in the next patch, where wbt_init() is called while
> the queue is frozen.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
  2025-12-14 10:13 ` [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
  2025-12-16 11:03   ` Ming Lei
@ 2025-12-18 14:20   ` Nilay Shroff
  1 sibling, 0 replies; 27+ messages in thread
From: Nilay Shroff @ 2025-12-18 14:20 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/14/25 3:43 PM, Yu Kuai wrote:
> If wbt is disabled by default and the user configures wbt via sysfs,
> the queue will be frozen first and then pcpu_alloc_mutex will be held
> in blk_stat_alloc_callback().
> 
> Fix this problem by allocating memory before the queue is frozen.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
  2025-12-14 10:13 ` [PATCH v5 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos Yu Kuai
  2025-12-18 11:06   ` Ming Lei
@ 2025-12-18 14:28   ` Nilay Shroff
  1 sibling, 0 replies; 27+ messages in thread
From: Nilay Shroff @ 2025-12-18 14:28 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/14/25 3:43 PM, Yu Kuai wrote:
> There is already a helper blk_mq_debugfs_register_rqos() to register a
> single rqos, however that helper is called synchronously when the rqos
> is created, while the queue is frozen.
> 
> Prepare to fix a possible deadlock caused by creating blk-mq debugfs
> entries while the queue is still frozen.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
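
A rough sketch of what such a "register them all" helper can look like (not
necessarily the exact implementation in this patch):

	void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
	{
		struct rq_qos *rqos;

		mutex_lock(&q->debugfs_mutex);
		for (rqos = q->rq_qos; rqos; rqos = rqos->next)
			blk_mq_debugfs_register_rqos(rqos);
		mutex_unlock(&q->debugfs_mutex);
	}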

Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock
  2025-12-14 10:13 ` [PATCH v5 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
  2025-12-18 11:11   ` Ming Lei
@ 2025-12-18 14:33   ` Nilay Shroff
  1 sibling, 0 replies; 27+ messages in thread
From: Nilay Shroff @ 2025-12-18 14:33 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/14/25 3:43 PM, Yu Kuai wrote:
> Currently rq-qos debugfs entries are created from rq_qos_add(), while
> rq_qos_add() can be called while queue is still frozen. This can
> deadlock because creating new entries can trigger fs reclaim.
> 
> Fix this problem by delaying creating rq-qos debugfs entries after queue
> is unfrozen.
> 
> - For wbt, 1) it can be initialized by default, fix it by calling new
>   helper after wbt_init() from wbt_enable_default; 2) it can be
>   initialized by sysfs, fix it by calling new helper after queue is
>   unfrozen from queue_wb_lat_store().
> - For iocost and iolatency, they can only be initialized by blkcg
>   configuration, however, they don't have debugfs entries for now, hence
>   they are not handled yet.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>

Looks good to me:
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
  2025-12-14 10:14 ` [PATCH v5 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static Yu Kuai
@ 2025-12-18 14:52   ` Nilay Shroff
  2025-12-25  9:30     ` Yu Kuai
  0 siblings, 1 reply; 27+ messages in thread
From: Nilay Shroff @ 2025-12-18 14:52 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/14/25 3:44 PM, Yu Kuai wrote:
> Because it's only used inside blk-mq-debugfs.c now.
> 
> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
> ---
>  block/blk-mq-debugfs.c | 4 +++-
>  block/blk-mq-debugfs.h | 5 -----
>  2 files changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 128d2aa6a20d..99466595c0a4 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -14,6 +14,8 @@
>  #include "blk-mq-sched.h"
>  #include "blk-rq-qos.h"
>  
> +static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
> +

The only caller of blk_mq_debugfs_register_rqos() is blk_mq_debugfs_register_rq_qos()
in this file. So if you move the definition of blk_mq_debugfs_register_rq_qos() to just
after the definition of blk_mq_debugfs_register_rqos(), you can avoid the static
declaration above. Otherwise this looks good to me. With that change,
you may add:

Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
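
In other words, the suggested layout would be roughly (sketch only, bodies
elided):

	static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
	{
		/* ... existing body, unchanged ... */
	}

	void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
	{
		/* ... existing body, which calls blk_mq_debugfs_register_rqos() ... */
	}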





^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 06/13] blk-mq-debugfs: warn about possible deadlock
  2025-12-14 10:14 ` [PATCH v5 06/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
@ 2025-12-18 18:26   ` Nilay Shroff
  2025-12-25  9:37     ` Yu Kuai
  0 siblings, 1 reply; 27+ messages in thread
From: Nilay Shroff @ 2025-12-18 18:26 UTC (permalink / raw)
  To: Yu Kuai, axboe, linux-block, tj, ming.lei



On 12/14/25 3:44 PM, Yu Kuai wrote:
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index 99466595c0a4..d54f8c29d2f4 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -610,9 +610,22 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
>  	{},
>  };
>  
> -static void debugfs_create_files(struct dentry *parent, void *data,
> +static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
> +				 void *data,
>  				 const struct blk_mq_debugfs_attr *attr)
>  {
> +	/*
> +	 * Creating new debugfs entries while the queue is frozen has the risk of
> +	 * deadlock.
> +	 */
> +	WARN_ON_ONCE(q->mq_freeze_depth != 0);
> +	/*
> +	 * debugfs_mutex should not be nested under other locks that can be
> +	 * grabbed while queue is frozen.
> +	 */
> +	lockdep_assert_not_held(&q->elevator_lock);
> +	lockdep_assert_not_held(&q->rq_qos_mutex);
> +

Shouldn't we also ensure that ->debugfs_mutex is already held by the caller when
the above function is invoked? That said, I also found that we need to fix the
locking order in rq_qos_del(), which is invoked just before cleaning up the debugfs
entries.

Thanks,
--Nilay

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
  2025-12-18 14:52   ` Nilay Shroff
@ 2025-12-25  9:30     ` Yu Kuai
  0 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-25  9:30 UTC (permalink / raw)
  To: Nilay Shroff, axboe, linux-block, tj, ming.lei, yukuai

Hi,

On 2025/12/18 22:52, Nilay Shroff wrote:
>
> On 12/14/25 3:44 PM, Yu Kuai wrote:
>> Because it's only used inside blk-mq-debugfs.c now.
>>
>> Signed-off-by: Yu Kuai <yukuai@fnnas.com>
>> ---
>>   block/blk-mq-debugfs.c | 4 +++-
>>   block/blk-mq-debugfs.h | 5 -----
>>   2 files changed, 3 insertions(+), 6 deletions(-)
>>
>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
>> index 128d2aa6a20d..99466595c0a4 100644
>> --- a/block/blk-mq-debugfs.c
>> +++ b/block/blk-mq-debugfs.c
>> @@ -14,6 +14,8 @@
>>   #include "blk-mq-sched.h"
>>   #include "blk-rq-qos.h"
>>   
>> +static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
>> +
> The only caller of blk_mq_debugfs_register_rqos() is blk_mq_debugfs_register_rq_qos()
> in this file. So if you can move the definition of blk_mq_debugfs_register_rq_qos() just
> after the definition of blk_mq_debugfs_register_rqos() in this file then you may avoid
> above static declaration. Otherwise this looks good to me. So with that change,

Yes, this looks better; I'll do that in patch 3.

> you may add:
>
> Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
>
>
>
>
>
-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v5 06/13] blk-mq-debugfs: warn about possible deadlock
  2025-12-18 18:26   ` Nilay Shroff
@ 2025-12-25  9:37     ` Yu Kuai
  0 siblings, 0 replies; 27+ messages in thread
From: Yu Kuai @ 2025-12-25  9:37 UTC (permalink / raw)
  To: Nilay Shroff, axboe, linux-block, tj, ming.lei, yukuai

Hi,

On 2025/12/19 2:26, Nilay Shroff wrote:
>
> On 12/14/25 3:44 PM, Yu Kuai wrote:
>> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
>> index 99466595c0a4..d54f8c29d2f4 100644
>> --- a/block/blk-mq-debugfs.c
>> +++ b/block/blk-mq-debugfs.c
>> @@ -610,9 +610,22 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
>>   	{},
>>   };
>>   
>> -static void debugfs_create_files(struct dentry *parent, void *data,
>> +static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
>> +				 void *data,
>>   				 const struct blk_mq_debugfs_attr *attr)
>>   {
>> +	/*
>> +	 * Creating new debugfs entries with queue freezed has the risk of
>> +	 * Creating new debugfs entries while the queue is frozen has the risk of
>> +	 */
>> +	WARN_ON_ONCE(q->mq_freeze_depth != 0);
>> +	/*
>> +	 * debugfs_mutex should not be nested under other locks that can be
>> +	 * grabbed while queue is frozen.
>> +	 */
>> +	lockdep_assert_not_held(&q->elevator_lock);
>> +	lockdep_assert_not_held(&q->rq_qos_mutex);
>> +
> Shouldn't we also ensure that ->debugfs_mutex is already held by the caller while
> the above function is invoked? That said, I also found that we need to fix the
> locking order in rq_qos_del() which is invoked just before cleaning up debugfs
> entries.

I can add the assert for debugfs_mutex. However, for rq_qos_del(), I'd prefer to
delete blk_mq_debugfs_unregister_rqos() directly, since it's only called by iocost
and iolatency, which don't have debugfs entries.
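
For the debugfs_mutex part, something like the below on top of the existing
checks (sketch only):

	static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
					 void *data,
					 const struct blk_mq_debugfs_attr *attr)
	{
		WARN_ON_ONCE(q->mq_freeze_depth != 0);
		lockdep_assert_not_held(&q->elevator_lock);
		lockdep_assert_not_held(&q->rq_qos_mutex);
		/* new: callers must already hold debugfs_mutex */
		lockdep_assert_held(&q->debugfs_mutex);
		...
	}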

>
> Thanks,
> --Nilay
>
-- 
Thanks,
Kuai

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2025-12-25  9:44 UTC | newest]

Thread overview: 27+ messages
2025-12-14 10:13 [PATCH v5 00/13] blk-mq: fix possible deadlocks Yu Kuai
2025-12-14 10:13 ` [PATCH v5 01/13] blk-wbt: factor out a helper wbt_set_lat() Yu Kuai
2025-12-16 10:19   ` Ming Lei
2025-12-18 14:18   ` Nilay Shroff
2025-12-14 10:13 ` [PATCH v5 02/13] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter Yu Kuai
2025-12-16 11:03   ` Ming Lei
2025-12-16 11:29     ` Yu Kuai
2025-12-18 14:20   ` Nilay Shroff
2025-12-14 10:13 ` [PATCH v5 03/13] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos Yu Kuai
2025-12-18 11:06   ` Ming Lei
2025-12-18 14:28   ` Nilay Shroff
2025-12-14 10:13 ` [PATCH v5 04/13] blk-rq-qos: fix possible debugfs_mutex deadlock Yu Kuai
2025-12-18 11:11   ` Ming Lei
2025-12-18 14:33   ` Nilay Shroff
2025-12-14 10:14 ` [PATCH v5 05/13] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static Yu Kuai
2025-12-18 14:52   ` Nilay Shroff
2025-12-25  9:30     ` Yu Kuai
2025-12-14 10:14 ` [PATCH v5 06/13] blk-mq-debugfs: warn about possible deadlock Yu Kuai
2025-12-18 18:26   ` Nilay Shroff
2025-12-25  9:37     ` Yu Kuai
2025-12-14 10:14 ` [PATCH v5 07/13] blk-throttle: convert to GFP_NOIO in blk_throtl_init() Yu Kuai
2025-12-14 10:14 ` [PATCH v5 08/13] block/blk-rq-qos: add a new helper rq_qos_add_frozen() Yu Kuai
2025-12-14 10:14 ` [PATCH v5 09/13] blk-wbt: fix incorrect lock order for rq_qos_mutex and freeze queue Yu Kuai
2025-12-14 10:14 ` [PATCH v5 10/13] blk-iocost: " Yu Kuai
2025-12-14 10:14 ` [PATCH v5 11/13] blk-iolatency: " Yu Kuai
2025-12-14 10:14 ` [PATCH v5 12/13] blk-throttle: remove useless queue frozen Yu Kuai
2025-12-14 10:14 ` [PATCH v5 13/13] block/blk-rq-qos: cleanup rq_qos_add() Yu Kuai
