All of lore.kernel.org
 help / color / mirror / Atom feed
From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-block@vger.kernel.org
Cc: hch@lst.de, ming.lei@redhat.com, dlemoal@kernel.org,
	axboe@kernel.dk, gjoyce@ibm.com
Subject: [PATCHv2 3/6] block: Introduce a dedicated lock for protecting queue elevator updates
Date: Tue, 18 Feb 2025 13:58:56 +0530	[thread overview]
Message-ID: <20250218082908.265283-4-nilay@linux.ibm.com> (raw)
In-Reply-To: <20250218082908.265283-1-nilay@linux.ibm.com>

A queue's elevator can be updated either when modifying nr_hw_queues
or through the sysfs scheduler attribute. To prevent simultaneous
updates from causing race conditions, introduce a dedicated lock
q->elevator_lock. Currently, elevator switching/updating is protected
using q->sysfs_lock, but this has led to lockdep splats[1] due to
inconsistent lock ordering between q->sysfs_lock and the freeze-lock
in multiple block layer call sites.

As the scope of q->sysfs_lock is not well-defined, its misuse has
resulted in numerous lockdep warnings. To resolve this, replace q->
sysfs_lock with a new dedicated q->elevator_lock, which will be
exclusively used to protect elevator switching and updates.

[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/

Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
 block/blk-core.c       |   1 +
 block/blk-mq.c         |  12 ++--
 block/blk-sysfs.c      | 133 ++++++++++++++++++++++++++++-------------
 block/elevator.c       |  18 ++++--
 block/genhd.c          |   9 ++-
 include/linux/blkdev.h |   1 +
 6 files changed, 119 insertions(+), 55 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d6c4fa3943b5..222cdcb662c2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -431,6 +431,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
 	mutex_init(&q->debugfs_mutex);
 	mutex_init(&q->sysfs_lock);
 	mutex_init(&q->limits_lock);
+	mutex_init(&q->elevator_lock);
 	mutex_init(&q->rq_qos_mutex);
 	spin_lock_init(&q->queue_lock);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 40490ac88045..f58e11dee8a0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4467,7 +4467,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	unsigned long i, j;
 
 	/* protect against switching io scheduler  */
-	mutex_lock(&q->sysfs_lock);
+	mutex_lock(&q->elevator_lock);
 	for (i = 0; i < set->nr_hw_queues; i++) {
 		int old_node;
 		int node = blk_mq_get_hctx_node(set, i);
@@ -4500,7 +4500,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 
 	xa_for_each_start(&q->hctx_table, j, hctx, j)
 		blk_mq_exit_hctx(q, set, hctx, j);
-	mutex_unlock(&q->sysfs_lock);
+	mutex_unlock(&q->elevator_lock);
 
 	/* unregister cpuhp callbacks for exited hctxs */
 	blk_mq_remove_hw_queues_cpuhp(q);
@@ -4934,7 +4934,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
 		return false;
 
 	/* q->elevator needs protection from ->sysfs_lock */
-	mutex_lock(&q->sysfs_lock);
+	mutex_lock(&q->elevator_lock);
 
 	/* the check has to be done with holding sysfs_lock */
 	if (!q->elevator) {
@@ -4950,7 +4950,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
 	list_add(&qe->node, head);
 	elevator_disable(q);
 unlock:
-	mutex_unlock(&q->sysfs_lock);
+	mutex_unlock(&q->elevator_lock);
 
 	return true;
 }
@@ -4980,11 +4980,11 @@ static void blk_mq_elv_switch_back(struct list_head *head,
 	list_del(&qe->node);
 	kfree(qe);
 
-	mutex_lock(&q->sysfs_lock);
+	mutex_lock(&q->elevator_lock);
 	elevator_switch(q, t);
 	/* drop the reference acquired in blk_mq_elv_switch_none */
 	elevator_put(t);
-	mutex_unlock(&q->sysfs_lock);
+	mutex_unlock(&q->elevator_lock);
 }
 
 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7e22ec96f2b3..355dfb514712 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -58,7 +58,13 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
 
 static ssize_t queue_requests_show(struct gendisk *disk, char *page)
 {
-	return queue_var_show(disk->queue->nr_requests, page);
+	int ret;
+
+	mutex_lock(&disk->queue->sysfs_lock);
+	ret = queue_var_show(disk->queue->nr_requests, page);
+	mutex_unlock(&disk->queue->sysfs_lock);
+
+	return ret;
 }
 
 static ssize_t
@@ -66,27 +72,42 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
 {
 	unsigned long nr;
 	int ret, err;
+	unsigned int memflags;
+	struct request_queue *q = disk->queue;
 
-	if (!queue_is_mq(disk->queue))
-		return -EINVAL;
+	mutex_lock(&q->sysfs_lock);
+	memflags = blk_mq_freeze_queue(q);
+
+	if (!queue_is_mq(disk->queue)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	ret = queue_var_store(&nr, page, count);
 	if (ret < 0)
-		return ret;
+		goto out;
 
 	if (nr < BLKDEV_MIN_RQ)
 		nr = BLKDEV_MIN_RQ;
 
 	err = blk_mq_update_nr_requests(disk->queue, nr);
 	if (err)
-		return err;
-
+		ret = err;
+out:
+	blk_mq_unfreeze_queue(q, memflags);
+	mutex_unlock(&q->sysfs_lock);
 	return ret;
 }
 
 static ssize_t queue_ra_show(struct gendisk *disk, char *page)
 {
-	return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+	int ret;
+
+	mutex_lock(&disk->queue->sysfs_lock);
+	ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+	mutex_unlock(&disk->queue->sysfs_lock);
+
+	return ret;
 }
 
 static ssize_t
@@ -94,11 +115,19 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count)
 {
 	unsigned long ra_kb;
 	ssize_t ret;
+	unsigned int memflags;
+	struct request_queue *q = disk->queue;
+
+	mutex_lock(&q->sysfs_lock);
+	memflags = blk_mq_freeze_queue(q);
 
 	ret = queue_var_store(&ra_kb, page, count);
 	if (ret < 0)
-		return ret;
+		goto out;
 	disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
+out:
+	blk_mq_unfreeze_queue(q, memflags);
+	mutex_unlock(&q->sysfs_lock);
 	return ret;
 }
 
@@ -534,14 +563,24 @@ static ssize_t queue_var_store64(s64 *var, const char *page)
 
 static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
 {
-	if (!wbt_rq_qos(disk->queue))
-		return -EINVAL;
+	int ret;
+
+	mutex_lock(&disk->queue->sysfs_lock);
+
+	if (!wbt_rq_qos(disk->queue)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	if (wbt_disabled(disk->queue))
-		return sysfs_emit(page, "0\n");
+		ret = sysfs_emit(page, "0\n");
+	else
+		ret = sysfs_emit(page, "%llu\n",
+				div_u64(wbt_get_min_lat(disk->queue), 1000));
 
-	return sysfs_emit(page, "%llu\n",
-		div_u64(wbt_get_min_lat(disk->queue), 1000));
+out:
+	mutex_unlock(&disk->queue->sysfs_lock);
+	return ret;
 }
 
 static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
@@ -551,18 +590,24 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 	struct rq_qos *rqos;
 	ssize_t ret;
 	s64 val;
+	unsigned int memflags;
+
+	mutex_lock(&q->sysfs_lock);
+	memflags = blk_mq_freeze_queue(q);
 
 	ret = queue_var_store64(&val, page);
 	if (ret < 0)
-		return ret;
-	if (val < -1)
-		return -EINVAL;
+		goto out;
+	if (val < -1) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	rqos = wbt_rq_qos(q);
 	if (!rqos) {
 		ret = wbt_init(disk);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	if (val == -1)
@@ -570,8 +615,10 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 	else if (val >= 0)
 		val *= 1000ULL;
 
-	if (wbt_get_min_lat(q) == val)
-		return count;
+	if (wbt_get_min_lat(q) == val) {
+		ret = count;
+		goto out;
+	}
 
 	/*
 	 * Ensure that the queue is idled, in case the latency update
@@ -584,7 +631,10 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
 
 	blk_mq_unquiesce_queue(q);
 
-	return count;
+out:
+	blk_mq_unfreeze_queue(q, memflags);
+	mutex_unlock(&q->sysfs_lock);
+	return ret;
 }
 
 QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
@@ -593,7 +643,7 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
 /* Common attributes for bio-based and request-based queues. */
 static struct attribute *queue_attrs[] = {
 	/*
-	 * attributes protected with q->sysfs_lock
+	 * attributes which require some form of locking
 	 */
 	&queue_ra_entry.attr,
 
@@ -652,10 +702,10 @@ static struct attribute *queue_attrs[] = {
 /* Request-based queue attributes that are not relevant for bio-based queues. */
 static struct attribute *blk_mq_queue_attrs[] = {
 	/*
-	 * attributes protected with q->sysfs_lock
+	 * attributes which require some form of locking
 	 */
-	&queue_requests_entry.attr,
 	&elv_iosched_entry.attr,
+	&queue_requests_entry.attr,
 #ifdef CONFIG_BLK_WBT
 	&queue_wb_lat_entry.attr,
 #endif
@@ -729,10 +779,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 		return res;
 	}
 
-	mutex_lock(&disk->queue->sysfs_lock);
-	res = entry->show(disk, page);
-	mutex_unlock(&disk->queue->sysfs_lock);
-	return res;
+	return entry->show(disk, page);
 }
 
 static ssize_t
@@ -778,12 +825,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
 		return length;
 	}
 
-	mutex_lock(&q->sysfs_lock);
-	memflags = blk_mq_freeze_queue(q);
-	res = entry->store(disk, page, length);
-	blk_mq_unfreeze_queue(q, memflags);
-	mutex_unlock(&q->sysfs_lock);
-	return res;
+	return entry->store(disk, page, length);
 }
 
 static const struct sysfs_ops queue_sysfs_ops = {
@@ -852,15 +894,19 @@ int blk_register_queue(struct gendisk *disk)
 	if (ret)
 		goto out_debugfs_remove;
 
+	ret = blk_crypto_sysfs_register(disk);
+	if (ret)
+		goto out_unregister_ia_ranges;
+
+	mutex_lock(&q->elevator_lock);
 	if (q->elevator) {
 		ret = elv_register_queue(q, false);
-		if (ret)
-			goto out_unregister_ia_ranges;
+		if (ret) {
+			mutex_unlock(&q->elevator_lock);
+			goto out_crypto_sysfs_unregister;
+		}
 	}
-
-	ret = blk_crypto_sysfs_register(disk);
-	if (ret)
-		goto out_elv_unregister;
+	mutex_unlock(&q->elevator_lock);
 
 	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 	wbt_enable_default(disk);
@@ -885,8 +931,8 @@ int blk_register_queue(struct gendisk *disk)
 
 	return ret;
 
-out_elv_unregister:
-	elv_unregister_queue(q);
+out_crypto_sysfs_unregister:
+	blk_crypto_sysfs_unregister(disk);
 out_unregister_ia_ranges:
 	disk_unregister_independent_access_ranges(disk);
 out_debugfs_remove:
@@ -932,8 +978,11 @@ void blk_unregister_queue(struct gendisk *disk)
 		blk_mq_sysfs_unregister(disk);
 	blk_crypto_sysfs_unregister(disk);
 
-	mutex_lock(&q->sysfs_lock);
+	mutex_lock(&q->elevator_lock);
 	elv_unregister_queue(q);
+	mutex_unlock(&q->elevator_lock);
+
+	mutex_lock(&q->sysfs_lock);
 	disk_unregister_independent_access_ranges(disk);
 	mutex_unlock(&q->sysfs_lock);
 
diff --git a/block/elevator.c b/block/elevator.c
index cd2ce4921601..6ee372f0220c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -457,7 +457,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
 	struct elevator_queue *e = q->elevator;
 	int error;
 
-	lockdep_assert_held(&q->sysfs_lock);
+	lockdep_assert_held(&q->elevator_lock);
 
 	error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
 	if (!error) {
@@ -481,7 +481,7 @@ void elv_unregister_queue(struct request_queue *q)
 {
 	struct elevator_queue *e = q->elevator;
 
-	lockdep_assert_held(&q->sysfs_lock);
+	lockdep_assert_held(&q->elevator_lock);
 
 	if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
 		kobject_uevent(&e->kobj, KOBJ_REMOVE);
@@ -618,7 +618,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	unsigned int memflags;
 	int ret;
 
-	lockdep_assert_held(&q->sysfs_lock);
+	lockdep_assert_held(&q->elevator_lock);
 
 	memflags = blk_mq_freeze_queue(q);
 	blk_mq_quiesce_queue(q);
@@ -655,7 +655,7 @@ void elevator_disable(struct request_queue *q)
 {
 	unsigned int memflags;
 
-	lockdep_assert_held(&q->sysfs_lock);
+	lockdep_assert_held(&q->elevator_lock);
 
 	memflags = blk_mq_freeze_queue(q);
 	blk_mq_quiesce_queue(q);
@@ -723,11 +723,19 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
 {
 	char elevator_name[ELV_NAME_MAX];
 	int ret;
+	unsigned int memflags;
+	struct request_queue *q = disk->queue;
 
 	strscpy(elevator_name, buf, sizeof(elevator_name));
+
+	memflags = blk_mq_freeze_queue(q);
+	mutex_lock(&q->elevator_lock);
 	ret = elevator_change(disk->queue, strstrip(elevator_name));
+	mutex_unlock(&q->elevator_lock);
+	blk_mq_unfreeze_queue(q, memflags);
 	if (!ret)
 		return count;
+
 	return ret;
 }
 
@@ -738,6 +746,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
 	struct elevator_type *cur = NULL, *e;
 	int len = 0;
 
+	mutex_lock(&q->elevator_lock);
 	if (!q->elevator) {
 		len += sprintf(name+len, "[none] ");
 	} else {
@@ -755,6 +764,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
 	spin_unlock(&elv_list_lock);
 
 	len += sprintf(name+len, "\n");
+	mutex_unlock(&q->elevator_lock);
 	return len;
 }
 
diff --git a/block/genhd.c b/block/genhd.c
index e9375e20d866..c2bd86cd09de 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -565,8 +565,11 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
 	if (disk->major == BLOCK_EXT_MAJOR)
 		blk_free_ext_minor(disk->first_minor);
 out_exit_elevator:
-	if (disk->queue->elevator)
+	if (disk->queue->elevator) {
+		mutex_lock(&disk->queue->elevator_lock);
 		elevator_exit(disk->queue);
+		mutex_unlock(&disk->queue->elevator_lock);
+	}
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_disk_fwnode);
@@ -742,9 +745,9 @@ void del_gendisk(struct gendisk *disk)
 
 	blk_mq_quiesce_queue(q);
 	if (q->elevator) {
-		mutex_lock(&q->sysfs_lock);
+		mutex_lock(&q->elevator_lock);
 		elevator_exit(q);
-		mutex_unlock(&q->sysfs_lock);
+		mutex_unlock(&q->elevator_lock);
 	}
 	rq_qos_exit(q);
 	blk_mq_unquiesce_queue(q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 248416ecd01c..5690d2f3588e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -562,6 +562,7 @@ struct request_queue {
 
 	struct mutex		sysfs_lock;
 	struct mutex		limits_lock;
+	struct mutex		elevator_lock;
 
 	/*
 	 * for reusing dead hctx instance in case of updating
-- 
2.47.1


  parent reply	other threads:[~2025-02-18  8:29 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-18  8:28 [PATCHv2 0/6] block: fix lock order and remove redundant locking Nilay Shroff
2025-02-18  8:28 ` [PATCHv2 1/6] blk-sysfs: remove q->sysfs_lock for attributes which don't need it Nilay Shroff
2025-02-18  8:46   ` Christoph Hellwig
2025-02-18 11:26     ` Nilay Shroff
2025-02-21 14:02       ` Nilay Shroff
2025-02-22 12:44         ` Ming Lei
2025-02-24 13:09           ` Nilay Shroff
2025-02-24 14:49           ` Christoph Hellwig
2025-02-26 12:09             ` Nilay Shroff
2025-02-24  8:41         ` Hannes Reinecke
2025-02-24 13:12           ` Nilay Shroff
2025-02-18 12:10   ` Ming Lei
2025-02-18 13:11     ` Nilay Shroff
2025-02-18 13:45       ` Ming Lei
2025-02-18 16:29         ` Christoph Hellwig
2025-02-19  3:24           ` Ming Lei
2025-02-19  5:42             ` Christoph Hellwig
2025-02-19  8:34             ` Nilay Shroff
2025-02-19  8:56               ` Nilay Shroff
2025-02-19  9:20                 ` Ming Lei
2025-02-18  8:28 ` [PATCHv2 2/6] blk-sysfs: acquire q->limits_lock while reading attributes Nilay Shroff
2025-02-18  8:46   ` Christoph Hellwig
2025-02-18  8:28 ` Nilay Shroff [this message]
2025-02-18  9:05   ` [PATCHv2 3/6] block: Introduce a dedicated lock for protecting queue elevator updates Christoph Hellwig
2025-02-18 11:14     ` Nilay Shroff
2025-02-18 16:32       ` Christoph Hellwig
2025-02-19  8:41         ` Nilay Shroff
2025-02-18  8:28 ` [PATCHv2 4/6] blk-sysfs: protect nr_requests update using q->elevator_lock Nilay Shroff
2025-02-18  8:28 ` [PATCHv2 5/6] blk-sysfs: protect wbt_lat_usec " Nilay Shroff
2025-02-18  8:28 ` [PATCHv2 6/6] blk-sysfs: protect read_ahead_kb using q->limits_lock Nilay Shroff
2025-02-18  9:12   ` Christoph Hellwig
2025-02-18 11:27     ` Nilay Shroff
2025-02-18  9:21 ` [PATCHv2 0/6] block: fix lock order and remove redundant locking Christoph Hellwig
2025-02-18 12:09   ` Nilay Shroff

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250218082908.265283-4-nilay@linux.ibm.com \
    --to=nilay@linux.ibm.com \
    --cc=axboe@kernel.dk \
    --cc=dlemoal@kernel.org \
    --cc=gjoyce@ibm.com \
    --cc=hch@lst.de \
    --cc=linux-block@vger.kernel.org \
    --cc=ming.lei@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.