From: Ming Lei <ming.lei@redhat.com>
To: Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org
Cc: "Nilay Shroff" <nilay@linux.ibm.com>,
"Shinichiro Kawasaki" <shinichiro.kawasaki@wdc.com>,
"Thomas Hellström" <thomas.hellstrom@linux.intel.com>,
"Christoph Hellwig" <hch@lst.de>,
"Ming Lei" <ming.lei@redhat.com>,
"Hannes Reinecke" <hare@suse.de>
Subject: [PATCH V5 16/25] block: unifying elevator change
Date: Mon, 5 May 2025 22:17:54 +0800 [thread overview]
Message-ID: <20250505141805.2751237-17-ming.lei@redhat.com> (raw)
In-Reply-To: <20250505141805.2751237-1-ming.lei@redhat.com>
Elevator change is one well-define behavior:
- tear down current elevator if it exists
- setup new elevator
It is supposed to cover any case for changing elevator by single
internal API, typically the following cases:
- setup default elevator in add_disk()
- switch to none in del_disk()
- reset elevator in blk_mq_update_nr_hw_queues()
- switch elevator in sysfs `store` elevator attribute
This patch uses elevator_change() to cover all above cases:
- every elevator switch is serialized with each other: add_disk/del_disk/
store elevator is serialized already, blk_mq_update_nr_hw_queues() uses
srcu for syncing with the other three cases
- for both add_disk()/del_disk(), queue freeze works at atomic mode
or has been froze, so the freeze in elevator_change() won't add extra
delay
- `struct elev_change_ctx` instance holds any info for changing elevator
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-sysfs.c | 19 +++-----
block/blk.h | 5 +-
block/elevator.c | 116 +++++++++++++++++++++-------------------------
block/genhd.c | 28 ++---------
4 files changed, 67 insertions(+), 101 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1f9b45b0b9ee..741e607dfab6 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -869,14 +869,9 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
goto out_unregister_ia_ranges;
+ if (queue_is_mq(q))
+ elevator_set_default(q);
mutex_lock(&q->elevator_lock);
- if (q->elevator) {
- ret = elv_register_queue(q, false);
- if (ret) {
- mutex_unlock(&q->elevator_lock);
- goto out_crypto_sysfs_unregister;
- }
- }
wbt_enable_default(disk);
mutex_unlock(&q->elevator_lock);
@@ -902,8 +897,6 @@ int blk_register_queue(struct gendisk *disk)
return ret;
-out_crypto_sysfs_unregister:
- blk_crypto_sysfs_unregister(disk);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
@@ -951,9 +944,11 @@ void blk_unregister_queue(struct gendisk *disk)
blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(disk);
- mutex_lock(&q->elevator_lock);
- elv_unregister_queue(q);
- mutex_unlock(&q->elevator_lock);
+ if (queue_is_mq(q)) {
+ blk_mq_quiesce_queue(q);
+ elevator_set_none(q);
+ blk_mq_unquiesce_queue(q);
+ }
mutex_lock(&q->sysfs_lock);
disk_unregister_independent_access_ranges(disk);
diff --git a/block/blk.h b/block/blk.h
index 98fd3a6f5ec9..23b6ed27e5d1 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -323,9 +323,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
void elv_update_nr_hw_queues(struct request_queue *q);
-void elevator_exit(struct request_queue *q);
-int elv_register_queue(struct request_queue *q, bool uevent);
-void elv_unregister_queue(struct request_queue *q);
+void elevator_set_default(struct request_queue *q);
+void elevator_set_none(struct request_queue *q);
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
diff --git a/block/elevator.c b/block/elevator.c
index 6cfac8f77d9f..540542cee21c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -154,7 +154,7 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
-void elevator_exit(struct request_queue *q)
+static void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
@@ -458,7 +458,7 @@ static const struct kobj_type elv_ktype = {
.release = elevator_release,
};
-int elv_register_queue(struct request_queue *q, bool uevent)
+static int elv_register_queue(struct request_queue *q, bool uevent)
{
struct elevator_queue *e = q->elevator;
int error;
@@ -488,7 +488,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
return error;
}
-void elv_unregister_queue(struct request_queue *q)
+static void elv_unregister_queue(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
@@ -561,66 +561,6 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
-/*
- * For single queue devices, default to using mq-deadline. If we have multiple
- * queues or mq-deadline is not available, default to "none".
- */
-static struct elevator_type *elevator_get_default(struct request_queue *q)
-{
- if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
- return NULL;
-
- if (q->nr_hw_queues != 1 &&
- !blk_mq_is_shared_tags(q->tag_set->flags))
- return NULL;
-
- return elevator_find_get("mq-deadline");
-}
-
-/*
- * Use the default elevator settings. If the chosen elevator initialization
- * fails, fall back to the "none" elevator (no elevator).
- */
-void elevator_init_mq(struct request_queue *q)
-{
- struct elevator_type *e;
- unsigned int memflags;
- int err;
-
- WARN_ON_ONCE(blk_queue_registered(q));
-
- if (unlikely(q->elevator))
- return;
-
- e = elevator_get_default(q);
- if (!e)
- return;
-
- /*
- * We are called before adding disk, when there isn't any FS I/O,
- * so freezing queue plus canceling dispatch work is enough to
- * drain any dispatch activities originated from passthrough
- * requests, then no need to quiesce queue which may add long boot
- * latency, especially when lots of disks are involved.
- *
- * Disk isn't added yet, so verifying queue lock only manually.
- */
- memflags = blk_mq_freeze_queue(q);
-
- blk_mq_cancel_work_sync(q);
-
- err = blk_mq_init_sched(q, e);
-
- blk_mq_unfreeze_queue(q, memflags);
-
- if (err) {
- pr_warn("\"%s\" elevator initialization failed, "
- "falling back to \"none\"\n", e->elevator_name);
- }
-
- elevator_put(e);
-}
-
/*
* Switch to new_e io scheduler.
*
@@ -688,6 +628,16 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
memflags = blk_mq_freeze_queue(q);
+ /*
+ * May be called before adding disk, when there isn't any FS I/O,
+ * so freezing queue plus canceling dispatch work is enough to
+ * drain any dispatch activities originated from passthrough
+ * requests, then no need to quiesce queue which may add long boot
+ * latency, especially when lots of disks are involved.
+ *
+ * Disk isn't added yet, so verifying queue lock only manually.
+ */
+ blk_mq_cancel_work_sync(q);
mutex_lock(&q->elevator_lock);
if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
ret = elevator_switch(q, ctx);
@@ -716,6 +666,46 @@ void elv_update_nr_hw_queues(struct request_queue *q)
mutex_unlock(&q->elevator_lock);
}
+/*
+ * Use the default elevator settings. If the chosen elevator initialization
+ * fails, fall back to the "none" elevator (no elevator).
+ */
+void elevator_set_default(struct request_queue *q)
+{
+ struct elv_change_ctx ctx = {
+ .name = "mq-deadline",
+ .no_uevent = true,
+ };
+ int err = 0;
+
+ if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+ return;
+
+ /*
+ * For single queue devices, default to using mq-deadline. If we
+ * have multiple queues or mq-deadline is not available, default
+ * to "none".
+ */
+ if (elevator_find_get(ctx.name) && (q->nr_hw_queues == 1 ||
+ blk_mq_is_shared_tags(q->tag_set->flags)))
+ err = elevator_change(q, &ctx);
+ if (err < 0)
+ pr_warn("\"%s\" elevator initialization, failed %d, "
+ "falling back to \"none\"\n", ctx.name, err);
+}
+
+void elevator_set_none(struct request_queue *q)
+{
+ struct elv_change_ctx ctx = {
+ .name = "none",
+ };
+ int err;
+
+ err = elevator_change(q, &ctx);
+ if (err < 0)
+ pr_warn("%s: set none elevator failed %d\n", __func__, err);
+}
+
static void elv_iosched_load_module(const char *elevator_name)
{
struct elevator_type *found;
diff --git a/block/genhd.c b/block/genhd.c
index e0dd8ecc925f..f192fe4808b9 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -432,12 +432,6 @@ static int __add_disk(struct device *parent, struct gendisk *disk,
*/
if (disk->fops->submit_bio || disk->fops->poll_bio)
return -EINVAL;
-
- /*
- * Initialize the I/O scheduler code and pick a default one if
- * needed.
- */
- elevator_init_mq(disk->queue);
} else {
if (!disk->fops->submit_bio)
return -EINVAL;
@@ -454,7 +448,7 @@ static int __add_disk(struct device *parent, struct gendisk *disk,
ret = -EINVAL;
if (disk->major) {
if (WARN_ON(!disk->minors))
- goto out_exit_elevator;
+ goto out;
if (disk->minors > DISK_MAX_PARTS) {
pr_err("block: can't allocate more than %d partitions\n",
@@ -464,14 +458,14 @@ static int __add_disk(struct device *parent, struct gendisk *disk,
if (disk->first_minor > MINORMASK ||
disk->minors > MINORMASK + 1 ||
disk->first_minor + disk->minors > MINORMASK + 1)
- goto out_exit_elevator;
+ goto out;
} else {
if (WARN_ON(disk->minors))
- goto out_exit_elevator;
+ goto out;
ret = blk_alloc_ext_minor();
if (ret < 0)
- goto out_exit_elevator;
+ goto out;
disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret;
}
@@ -561,12 +555,7 @@ static int __add_disk(struct device *parent, struct gendisk *disk,
out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
-out_exit_elevator:
- if (disk->queue->elevator) {
- mutex_lock(&disk->queue->elevator_lock);
- elevator_exit(disk->queue);
- mutex_unlock(&disk->queue->elevator_lock);
- }
+out:
return ret;
}
@@ -760,14 +749,7 @@ static void __del_gendisk(struct gendisk *disk)
if (queue_is_mq(q))
blk_mq_cancel_work_sync(q);
- blk_mq_quiesce_queue(q);
- if (q->elevator) {
- mutex_lock(&q->elevator_lock);
- elevator_exit(q);
- mutex_unlock(&q->elevator_lock);
- }
rq_qos_exit(q);
- blk_mq_unquiesce_queue(q);
/*
* If the disk does not own the queue, allow using passthrough requests
--
2.47.0
next prev parent reply other threads:[~2025-05-05 14:19 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-05 14:17 [PATCH V5 00/25] block: unify elevator changing and fix lockdep warning Ming Lei
2025-05-05 14:17 ` [PATCH V5 01/25] block: move blk_mq_add_queue_tag_set() after blk_mq_map_swqueue() Ming Lei
2025-05-05 14:17 ` [PATCH V5 02/25] block: move ELEVATOR_FLAG_DISABLE_WBT a request queue flag Ming Lei
2025-05-05 14:17 ` [PATCH V5 03/25] block: don't call freeze queue in elevator_switch() and elevator_disable() Ming Lei
2025-05-05 14:17 ` [PATCH V5 04/25] block: use q->elevator with ->elevator_lock held in elv_iosched_show() Ming Lei
2025-05-05 14:17 ` [PATCH V5 05/25] block: add two helpers for registering/un-registering sched debugfs Ming Lei
2025-05-05 14:17 ` [PATCH V5 06/25] block: move sched debugfs register into elvevator_register_queue Ming Lei
2025-05-05 14:17 ` [PATCH V5 07/25] block: add helper add_disk_final() Ming Lei
2025-05-06 4:40 ` Christoph Hellwig
2025-05-06 7:43 ` Nilay Shroff
2025-05-06 11:02 ` Hannes Reinecke
2025-05-05 14:17 ` [PATCH V5 08/25] block: prevent adding/deleting disk during updating nr_hw_queues Ming Lei
2025-05-05 14:17 ` [PATCH V5 09/25] block: don't allow to switch elevator if updating nr_hw_queues is in-progress Ming Lei
2025-05-06 4:41 ` Christoph Hellwig
2025-05-06 6:26 ` Nilay Shroff
2025-05-05 14:17 ` [PATCH V5 10/25] block: look up the elevator type in elevator_switch Ming Lei
2025-05-05 14:17 ` [PATCH V5 11/25] block: fold elevator_disable into elevator_switch Ming Lei
2025-05-05 14:17 ` [PATCH V5 12/25] block: move blk_queue_registered() check into elv_iosched_store() Ming Lei
2025-05-06 4:41 ` Christoph Hellwig
2025-05-06 7:47 ` Nilay Shroff
2025-05-05 14:17 ` [PATCH V5 13/25] block: simplify elevator reattachment for updating nr_hw_queues Ming Lei
2025-05-05 14:17 ` [PATCH V5 14/25] block: move queue freezing & elevator_lock into elevator_change() Ming Lei
2025-05-05 14:17 ` [PATCH V5 15/25] block: add `struct elv_change_ctx` for unifying elevator change Ming Lei
2025-05-06 4:42 ` Christoph Hellwig
2025-05-05 14:17 ` Ming Lei [this message]
2025-05-05 14:17 ` [PATCH V5 17/25] block: pass elevator_queue to elv_register_queue & unregister_queue Ming Lei
2025-05-05 14:17 ` [PATCH V5 18/25] block: remove elevator queue's type check in elv_attr_show/store() Ming Lei
2025-05-05 14:17 ` [PATCH V5 19/25] block: fail to show/store elevator sysfs attribute if elevator is dying Ming Lei
2025-05-06 11:09 ` Hannes Reinecke
2025-05-05 14:17 ` [PATCH V5 20/25] block: add new helper for disabling elevator switch when deleting disk Ming Lei
2025-05-06 6:32 ` Nilay Shroff
2025-05-05 14:17 ` [PATCH V5 21/25] block: move elv_register[unregister]_queue out of elevator_lock Ming Lei
2025-05-06 4:43 ` Christoph Hellwig
2025-05-06 6:36 ` Nilay Shroff
2025-05-05 14:18 ` [PATCH V5 22/25] block: move hctx debugfs/sysfs registering out of freezing queue Ming Lei
2025-05-05 14:18 ` [PATCH V5 23/25] block: don't acquire ->elevator_lock in blk_mq_map_swqueue and blk_mq_realloc_hw_ctxs Ming Lei
2025-05-06 4:44 ` Christoph Hellwig
2025-05-05 14:18 ` [PATCH V5 24/25] block: move hctx cpuhp add/del out of queue freezing Ming Lei
2025-05-06 4:44 ` Christoph Hellwig
2025-05-05 14:18 ` [PATCH V5 25/25] block: move wbt_enable_default() out of queue freezing from sched ->exit() Ming Lei
2025-05-06 4:44 ` Christoph Hellwig
2025-05-06 13:48 ` [PATCH V5 00/25] block: unify elevator changing and fix lockdep warning Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250505141805.2751237-17-ming.lei@redhat.com \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=hare@suse.de \
--cc=hch@lst.de \
--cc=linux-block@vger.kernel.org \
--cc=nilay@linux.ibm.com \
--cc=shinichiro.kawasaki@wdc.com \
--cc=thomas.hellstrom@linux.intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox