From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-block@vger.kernel.org
Cc: hch@lst.de, ming.lei@redhat.com, dlemoal@kernel.org,
axboe@kernel.dk, gjoyce@ibm.com
Subject: [PATCH 1/2] block: fix lock ordering between the queue ->sysfs_lock and freeze-lock
Date: Wed, 5 Feb 2025 20:14:47 +0530 [thread overview]
Message-ID: <20250205144506.663819-2-nilay@linux.ibm.com> (raw)
In-Reply-To: <20250205144506.663819-1-nilay@linux.ibm.com>
Lockdep reports [1] have identified inconsistent lock ordering between
q->sysfs_lock and freeze-lock at several call sites in the block layer.
This patch resolves the issue by enforcing a consistent lock acquisition
order: q->sysfs_lock is always acquired before freeze-lock. This change
eliminates the observed lockdep splats caused by the inconsistent
ordering.
Additionally, while rearranging the locking order, we ensure that no new
lock ordering issues are introduced between the global CPU hotplug (cpuhp)
lock and q->sysfs_lock, as previously reported [2]. To address this,
blk_mq_add_hw_queues_cpuhp() and blk_mq_remove_hw_queues_cpuhp() are now
called outside the critical section protected by q->sysfs_lock.
Since blk_mq_add_hw_queues_cpuhp() and blk_mq_remove_hw_queues_cpuhp()
were invoked during hardware context allocation from
blk_mq_realloc_hw_ctxs(), which now runs with q->sysfs_lock held by its
caller, we've relocated the add/remove cpuhp function calls to
__blk_mq_update_nr_hw_queues() and blk_mq_init_allocated_queue(), where
they run after q->sysfs_lock is released. This ensures proper lock
ordering without introducing regressions.
[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/
[2] https://lore.kernel.org/all/20241206082202.949142-1-ming.lei@redhat.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
block/blk-mq.c | 49 ++++++++++++++++++++++++++++++++----------------
block/elevator.c | 9 +++++++++
2 files changed, 42 insertions(+), 16 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 40490ac88045..87200539b3cc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4467,7 +4467,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
unsigned long i, j;
/* protect against switching io scheduler */
- mutex_lock(&q->sysfs_lock);
+ lockdep_assert_held(&q->sysfs_lock);
+
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
@@ -4500,13 +4501,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
xa_for_each_start(&q->hctx_table, j, hctx, j)
blk_mq_exit_hctx(q, set, hctx, j);
- mutex_unlock(&q->sysfs_lock);
-
- /* unregister cpuhp callbacks for exited hctxs */
- blk_mq_remove_hw_queues_cpuhp(q);
-
- /* register cpuhp for new initialized hctxs */
- blk_mq_add_hw_queues_cpuhp(q);
}
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
@@ -4532,10 +4526,19 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
xa_init(&q->hctx_table);
+ mutex_lock(&q->sysfs_lock);
blk_mq_realloc_hw_ctxs(set, q);
+ mutex_unlock(&q->sysfs_lock);
if (!q->nr_hw_queues)
goto err_hctxs;
+ /*
+ * Register cpuhp for new initialized hctxs and ensure that the cpuhp
+ * registration happens outside of q->sysfs_lock to avoid any lock
+ * ordering issue between q->sysfs_lock and global cpuhp lock.
+ */
+ blk_mq_add_hw_queues_cpuhp(q);
+
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
@@ -4934,12 +4937,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
return false;
/* q->elevator needs protection from ->sysfs_lock */
- mutex_lock(&q->sysfs_lock);
+ lockdep_assert_held(&q->sysfs_lock);
/* the check has to be done with holding sysfs_lock */
if (!q->elevator) {
kfree(qe);
- goto unlock;
+ goto out;
}
INIT_LIST_HEAD(&qe->node);
@@ -4949,8 +4952,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
__elevator_get(qe->type);
list_add(&qe->node, head);
elevator_disable(q);
-unlock:
- mutex_unlock(&q->sysfs_lock);
+out:
return true;
}
@@ -4973,6 +4975,8 @@ static void blk_mq_elv_switch_back(struct list_head *head,
struct blk_mq_qe_pair *qe;
struct elevator_type *t;
+ lockdep_assert_held(&q->sysfs_lock);
+
qe = blk_lookup_qe_pair(head, q);
if (!qe)
return;
@@ -4980,11 +4984,9 @@ static void blk_mq_elv_switch_back(struct list_head *head,
list_del(&qe->node);
kfree(qe);
- mutex_lock(&q->sysfs_lock);
elevator_switch(q, t);
/* drop the reference acquired in blk_mq_elv_switch_none */
elevator_put(t);
- mutex_unlock(&q->sysfs_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
@@ -5006,8 +5008,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
- list_for_each_entry(q, &set->tag_list, tag_set_list)
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ mutex_lock(&q->sysfs_lock);
blk_mq_freeze_queue_nomemsave(q);
+ }
/*
* Switch IO scheduler to 'none', cleaning up the data associated
@@ -5055,8 +5059,21 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_elv_switch_back(&head, q);
- list_for_each_entry(q, &set->tag_list, tag_set_list)
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ mutex_unlock(&q->sysfs_lock);
+
+ /*
+ * Unregister cpuhp callbacks for exited hctxs and register
+ * cpuhp for new initialized hctxs. Ensure that unregister/
+ * register cpuhp is called outside of q->sysfs_lock to avoid
+ * lock ordering issue between q->sysfs_lock and global cpuhp
+ * lock.
+ */
+ blk_mq_remove_hw_queues_cpuhp(q);
+ blk_mq_add_hw_queues_cpuhp(q);
+
blk_mq_unfreeze_queue_nomemrestore(q);
+ }
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
diff --git a/block/elevator.c b/block/elevator.c
index cd2ce4921601..596eb5c0219f 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -725,7 +725,16 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
int ret;
strscpy(elevator_name, buf, sizeof(elevator_name));
+
+ /*
+ * The elevator change/switch code expects that the q->sysfs_lock
+ * is held while we update the iosched to protect against the
+ * simultaneous hctx update.
+ */
+ mutex_lock(&disk->queue->sysfs_lock);
ret = elevator_change(disk->queue, strstrip(elevator_name));
+ mutex_unlock(&disk->queue->sysfs_lock);
+
if (!ret)
return count;
return ret;
--
2.47.1
next prev parent reply other threads:[~2025-02-05 14:45 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-05 14:44 [PATCH 0/2] block: fix lock order and remove redundant locking Nilay Shroff
2025-02-05 14:44 ` Nilay Shroff [this message]
2025-02-05 15:59 ` [PATCH 1/2] block: fix lock ordering between the queue ->sysfs_lock and freeze-lock Christoph Hellwig
2025-02-06 13:22 ` Nilay Shroff
2025-02-06 14:15 ` Christoph Hellwig
2025-02-07 11:59 ` Ming Lei
2025-02-07 18:02 ` Nilay Shroff
2025-02-08 8:30 ` Ming Lei
2025-02-08 13:18 ` Nilay Shroff
2025-02-05 14:44 ` [PATCH 2/2] block: avoid acquiring q->sysfs_lock while accessing sysfs attributes Nilay Shroff
2025-02-05 15:53 ` Christoph Hellwig
2025-02-06 13:54 ` Nilay Shroff
2025-02-06 14:07 ` Christoph Hellwig
2025-02-07 11:03 ` Nilay Shroff
2025-02-08 10:41 ` Ming Lei
2025-02-08 12:56 ` Nilay Shroff
2025-02-09 11:41 ` Ming Lei
2025-02-09 13:41 ` Nilay Shroff
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250205144506.663819-2-nilay@linux.ibm.com \
--to=nilay@linux.ibm.com \
--cc=axboe@kernel.dk \
--cc=dlemoal@kernel.org \
--cc=gjoyce@ibm.com \
--cc=hch@lst.de \
--cc=linux-block@vger.kernel.org \
--cc=ming.lei@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.