From: Ming Lei <ming.lei@redhat.com>
To: Nilay Shroff <nilay@linux.ibm.com>
Cc: linux-block@vger.kernel.org, hch@lst.de, axboe@kernel.dk,
sth@linux.ibm.com, gjoyce@ibm.com
Subject: Re: [PATCHv3 2/2] block: fix lock dependency between percpu alloc lock and elevator lock
Date: Wed, 18 Jun 2025 11:06:31 +0800
Message-ID: <aFItNzzr-4iQbjyY@fedora>
In-Reply-To: <20250616173233.3803824-3-nilay@linux.ibm.com>
On Mon, Jun 16, 2025 at 11:02:26PM +0530, Nilay Shroff wrote:
> Recent lockdep reports [1] have revealed a potential deadlock caused by a
> lock dependency between the percpu allocator lock and the elevator lock.
> This issue can be avoided by ensuring that the allocation and release of
> scheduler tags (sched_tags) are performed outside the elevator lock.
> Furthermore, the queue does not need to remain frozen during these
> operations.
>
> To address this, move all sched_tags allocations and deallocations outside
> of both the ->elevator_lock and the ->freeze_lock. Moreover, since the
> lifetime of the elevator queue and its associated sched_tags is closely
> tied, the allocated sched_tags are now stored in the elevator queue
> structure. Then, during the actual elevator switch (which runs under
> ->freeze_lock and ->elevator_lock), the pre-allocated sched_tags are
> assigned to the appropriate q->hctx. Once the elevator switch is complete
> and the locks are released, the old elevator queue and its associated
> sched_tags are freed.
>
> This restructuring ensures that sched_tags memory management occurs
> entirely outside of the ->elevator_lock and ->freeze_lock context,
> eliminating the lock dependency problem seen during scheduler updates
> or hardware queue updates.
Nice!
>
> [1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
>
> Reported-by: Stefan Haberland <sth@linux.ibm.com>
> Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
> ---
> block/blk-mq-sched.c | 244 +++++++++++++++++++++++++++++++------------
> block/blk-mq-sched.h | 13 ++-
> block/blk-mq.c | 14 ++-
> block/blk.h | 4 +-
> block/elevator.c | 82 +++++++++++----
> block/elevator.h | 28 ++++-
> 6 files changed, 296 insertions(+), 89 deletions(-)
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index d914eb9d61a6..dd00d8c9fb78 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -374,27 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
> }
> EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
>
> -static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
> - struct blk_mq_hw_ctx *hctx,
> - unsigned int hctx_idx)
> +static void blk_mq_init_sched_tags(struct request_queue *q,
> + struct blk_mq_hw_ctx *hctx,
> + unsigned int hctx_idx,
> + struct elevator_queue *eq)
> {
> if (blk_mq_is_shared_tags(q->tag_set->flags)) {
> hctx->sched_tags = q->sched_shared_tags;
> - return 0;
> + return;
> }
>
> - hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
> - q->nr_requests);
> -
> - if (!hctx->sched_tags)
> - return -ENOMEM;
> - return 0;
> -}
> -
> -static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
> -{
> - blk_mq_free_rq_map(queue->sched_shared_tags);
> - queue->sched_shared_tags = NULL;
> + hctx->sched_tags = eq->tags->u.tags[hctx_idx];
> }
>
> /* called in queue's release handler, tagset has gone away */
> @@ -403,35 +393,18 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int fla
> struct blk_mq_hw_ctx *hctx;
> unsigned long i;
>
> - queue_for_each_hw_ctx(q, hctx, i) {
> - if (hctx->sched_tags) {
> - if (!blk_mq_is_shared_tags(flags))
> - blk_mq_free_rq_map(hctx->sched_tags);
> - hctx->sched_tags = NULL;
> - }
> - }
> + queue_for_each_hw_ctx(q, hctx, i)
> + hctx->sched_tags = NULL;
>
> if (blk_mq_is_shared_tags(flags))
> - blk_mq_exit_sched_shared_tags(q);
> + q->sched_shared_tags = NULL;
> }
>
> -static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
> +static void blk_mq_init_sched_shared_tags(struct request_queue *queue,
> + struct elevator_queue *eq)
> {
> - struct blk_mq_tag_set *set = queue->tag_set;
> -
> - /*
> - * Set initial depth at max so that we don't need to reallocate for
> - * updating nr_requests.
> - */
> - queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
> - BLK_MQ_NO_HCTX_IDX,
> - MAX_SCHED_RQ);
> - if (!queue->sched_shared_tags)
> - return -ENOMEM;
> -
> + queue->sched_shared_tags = eq->tags->u.shared_tags;
> blk_mq_tag_update_sched_shared_tags(queue);
> -
> - return 0;
> }
>
> void blk_mq_sched_reg_debugfs(struct request_queue *q)
> @@ -458,8 +431,165 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
> mutex_unlock(&q->debugfs_mutex);
> }
>
> +void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
> + struct elevator_tags *tags)
> +{
> + unsigned long i;
> +
> + if (!tags)
> + return;
> +
> + if (blk_mq_is_shared_tags(set->flags)) {
> + if (tags->u.shared_tags) {
> + blk_mq_free_rqs(set, tags->u.shared_tags,
> + BLK_MQ_NO_HCTX_IDX);
> + blk_mq_free_rq_map(tags->u.shared_tags);
> + }
> + goto out;
> + }
> +
> + if (!tags->u.tags)
> + goto out;
> +
> + for (i = 0; i < tags->nr_hw_queues; i++) {
> + if (tags->u.tags[i]) {
> + blk_mq_free_rqs(set, tags->u.tags[i], i);
> + blk_mq_free_rq_map(tags->u.tags[i]);
> + }
> + }
> +
> + kfree(tags->u.tags);
> +out:
> + kfree(tags);
> +}
> +
> +void blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
> + struct elevator_tags **tags)
> +{
> + unsigned long i, count = 0;
> + struct request_queue *q;
> +
> + lockdep_assert_held_write(&set->update_nr_hwq_lock);
> +
> + if (!tags)
> + return;
> +
> + /*
> + * Accessing q->elevator without holding q->elevator_lock is safe
> + * because we're holding here set->update_nr_hwq_lock in the writer
> + * context. So, scheduler update/switch code (which acquires the same
> + * lock but in the reader context) can't run concurrently.
> + */
> + list_for_each_entry(q, &set->tag_list, tag_set_list) {
> + if (q->elevator)
> + count++;
> + }
> +
> + for (i = 0; i < count; i++)
> + __blk_mq_free_sched_tags(set, tags[i]);
> +
> + kfree(tags);
> +}
> +
> +struct elevator_tags *__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> + unsigned int nr_hw_queues, int id)
> +{
> + unsigned int i;
> + struct elevator_tags *tags;
> + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
> +
> + tags = kcalloc(1, sizeof(struct elevator_tags), gfp);
> + if (!tags)
> + return NULL;
> +
> + tags->id = id;
> + /*
> + * Default to double of smaller one between hw queue_depth and
> + * 128, since we don't split into sync/async like the old code
> + * did. Additionally, this is a per-hw queue depth.
> + */
> + tags->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
> + BLKDEV_DEFAULT_RQ);
> +
> + if (blk_mq_is_shared_tags(set->flags)) {
> +
> + tags->u.shared_tags = blk_mq_alloc_map_and_rqs(set,
> + BLK_MQ_NO_HCTX_IDX,
> + MAX_SCHED_RQ);
> + if (!tags->u.shared_tags)
> + goto out;
> +
> + return tags;
> + }
> +
> + tags->u.tags = kcalloc(nr_hw_queues, sizeof(struct blk_mq_tags *), gfp);
> + if (!tags->u.tags)
> + goto out;
> +
> + tags->nr_hw_queues = nr_hw_queues;
> + for (i = 0; i < nr_hw_queues; i++) {
> + tags->u.tags[i] = blk_mq_alloc_map_and_rqs(set, i,
> + tags->nr_requests);
> + if (!tags->u.tags[i])
> + goto out;
> + }
> +
> + return tags;
> +
> +out:
> + __blk_mq_free_sched_tags(set, tags);
> + return NULL;
> +}
> +
> +int blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> + unsigned int nr_hw_queues,
> + struct elevator_tags ***elv_tags)
We seldom see triple-indirect pointers in the Linux kernel, :-)
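One (untested) alternative is to return the array itself and report
failure via ERR_PTR(), passing the count back through a pointer:

        struct elevator_tags **blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
                        unsigned int nr_hw_queues, int *count);

        tags = blk_mq_alloc_sched_tags(set, nr_hw_queues, &count);
        if (IS_ERR(tags))
                goto out;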
> +{
> + unsigned long idx = 0, count = 0;
> + struct request_queue *q;
> + struct elevator_tags **tags;
> + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
> +
> + lockdep_assert_held_write(&set->update_nr_hwq_lock);
> + /*
> + * Accessing q->elevator without holding q->elevator_lock is safe
> + * because we're holding here set->update_nr_hwq_lock in the writer
> + * context. So, scheduler update/switch code (which acquires the same
> + * lock but in the reader context) can't run concurrently.
> + */
> + list_for_each_entry(q, &set->tag_list, tag_set_list) {
> + if (q->elevator)
> + count++;
> + }
> +
> + if (!count)
> + return 0;
> +
> + tags = kcalloc(count, sizeof(struct elevator_tags *), gfp);
> + if (!tags)
> + return -ENOMEM;
> +
> + list_for_each_entry(q, &set->tag_list, tag_set_list) {
> + if (q->elevator) {
> + tags[idx] = __blk_mq_alloc_sched_tags(set, nr_hw_queues,
> + q->id);
> + if (!tags[idx])
> + goto out;
> +
> + idx++;
> + }
> + }
> +
> + *elv_tags = tags;
> + return count;
> +out:
> + blk_mq_free_sched_tags(set, tags);
> + return -ENOMEM;
> +}
I believe a lot of code can be saved if you use an xarray to store the
allocated 'struct elevator_tags', with q->id as the key.
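Untested sketch of the idea ('et_table' is a made-up name; it could
live on the stack of __blk_mq_update_nr_hw_queues()):

        struct xarray et_table;         /* keyed by q->id */
        struct elevator_tags *et;

        xa_init(&et_table);

        /* allocation side (error unwinding omitted) */
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                if (!q->elevator)
                        continue;
                et = __blk_mq_alloc_sched_tags(set, nr_hw_queues, q->id);
                if (!et || xa_insert(&et_table, q->id, et, GFP_NOIO))
                        goto err;
        }

        /* lookup side, replacing elevator_tags_lookup() */
        et = xa_load(&et_table, q->id);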
Also, the handling for updating nr_hw_queues adds new batch alloc & free
logic and should be done in a standalone patch. In my experience, bugs
are easier to hide in a big patch, which is harder to review.
> +
> /* caller must have a reference to @e, will grab another one if successful */
> -int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
> + struct elevator_tags *tags)
> {
> unsigned int flags = q->tag_set->flags;
> struct blk_mq_hw_ctx *hctx;
> @@ -467,40 +597,26 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> unsigned long i;
> int ret;
>
> - /*
> - * Default to double of smaller one between hw queue_depth and 128,
> - * since we don't split into sync/async like the old code did.
> - * Additionally, this is a per-hw queue depth.
> - */
> - q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
> - BLKDEV_DEFAULT_RQ);
> -
> - eq = elevator_alloc(q, e);
> + eq = elevator_alloc(q, e, tags);
> if (!eq)
> return -ENOMEM;
>
> - if (blk_mq_is_shared_tags(flags)) {
> - ret = blk_mq_init_sched_shared_tags(q);
> - if (ret)
> - return ret;
> - }
> + q->nr_requests = tags->nr_requests;
>
> - queue_for_each_hw_ctx(q, hctx, i) {
> - ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
> - if (ret)
> - goto err_free_map_and_rqs;
> - }
> + if (blk_mq_is_shared_tags(flags))
> + blk_mq_init_sched_shared_tags(q, eq);
> +
> + queue_for_each_hw_ctx(q, hctx, i)
> + blk_mq_init_sched_tags(q, hctx, i, eq);
>
> ret = e->ops.init_sched(q, eq);
> if (ret)
> - goto err_free_map_and_rqs;
> + goto out;
>
> queue_for_each_hw_ctx(q, hctx, i) {
> if (e->ops.init_hctx) {
> ret = e->ops.init_hctx(hctx, i);
> if (ret) {
> - eq = q->elevator;
> - blk_mq_sched_free_rqs(q);
> blk_mq_exit_sched(q, eq);
> kobject_put(&eq->kobj);
> return ret;
> @@ -509,10 +625,8 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> }
> return 0;
>
> -err_free_map_and_rqs:
> - blk_mq_sched_free_rqs(q);
> +out:
> blk_mq_sched_tags_teardown(q, flags);
> -
> kobject_put(&eq->kobj);
> q->elevator = NULL;
> return ret;
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 1326526bb733..01c43192b98a 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -18,7 +18,18 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
>
> void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
>
> -int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
> +void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
> + struct elevator_tags *tags);
> +void blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
> + struct elevator_tags **elv_t);
> +int blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> + unsigned int nr_hw_queues,
> + struct elevator_tags ***tags);
> +struct elevator_tags *__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> + unsigned int nr_hw_queues, int id);
> +
> +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
> + struct elevator_tags *elv_t);
> void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
> void blk_mq_sched_free_rqs(struct request_queue *q);
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4806b867e37d..b8b616c79742 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -4970,6 +4970,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> int nr_hw_queues)
> {
> struct request_queue *q;
> + int count;
> + struct elevator_tags **elv_tags = NULL;
> int prev_nr_hw_queues = set->nr_hw_queues;
> unsigned int memflags;
> int i;
> @@ -4984,6 +4986,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> return;
>
> memflags = memalloc_noio_save();
> +
> + count = blk_mq_alloc_sched_tags(set, nr_hw_queues, &elv_tags);
> + if (count < 0)
> + goto out;
> +
> list_for_each_entry(q, &set->tag_list, tag_set_list) {
> blk_mq_debugfs_unregister_hctxs(q);
> blk_mq_sysfs_unregister_hctxs(q);
> @@ -4995,6 +5002,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
> list_for_each_entry(q, &set->tag_list, tag_set_list)
> blk_mq_unfreeze_queue_nomemrestore(q);
> +
> + blk_mq_free_sched_tags(set, elv_tags);
> + elv_tags = NULL;
> goto reregister;
> }
>
> @@ -5019,7 +5029,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>
> /* elv_update_nr_hw_queues() unfreeze queue for us */
> list_for_each_entry(q, &set->tag_list, tag_set_list)
> - elv_update_nr_hw_queues(q);
> + elv_update_nr_hw_queues(q, elv_tags, count);
>
> reregister:
> list_for_each_entry(q, &set->tag_list, tag_set_list) {
> @@ -5029,6 +5039,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> blk_mq_remove_hw_queues_cpuhp(q);
> blk_mq_add_hw_queues_cpuhp(q);
> }
> + kfree(elv_tags);
> +out:
> memalloc_noio_restore(memflags);
>
> /* Free the excess tags when nr_hw_queues shrink. */
> diff --git a/block/blk.h b/block/blk.h
> index 37ec459fe656..cd50305da84e 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -10,6 +10,7 @@
> #include <linux/timekeeping.h>
> #include <xen/xen.h>
> #include "blk-crypto-internal.h"
> +#include "elevator.h"
>
> struct elevator_type;
>
> @@ -321,7 +322,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
>
> bool blk_insert_flush(struct request *rq);
>
> -void elv_update_nr_hw_queues(struct request_queue *q);
> +void elv_update_nr_hw_queues(struct request_queue *q,
> + struct elevator_tags **elv_tags, int count);
> void elevator_set_default(struct request_queue *q);
> void elevator_set_none(struct request_queue *q);
>
> diff --git a/block/elevator.c b/block/elevator.c
> index ab22542e6cf0..849dbe347cb2 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -45,17 +45,6 @@
> #include "blk-wbt.h"
> #include "blk-cgroup.h"
>
> -/* Holding context data for changing elevator */
> -struct elv_change_ctx {
> - const char *name;
> - bool no_uevent;
> -
> - /* for unregistering old elevator */
> - struct elevator_queue *old;
> - /* for registering new elevator */
> - struct elevator_queue *new;
> -};
> -
> static DEFINE_SPINLOCK(elv_list_lock);
> static LIST_HEAD(elv_list);
>
> @@ -132,7 +121,7 @@ static struct elevator_type *elevator_find_get(const char *name)
> static const struct kobj_type elv_ktype;
>
> struct elevator_queue *elevator_alloc(struct request_queue *q,
> - struct elevator_type *e)
> + struct elevator_type *e, struct elevator_tags *tags)
> {
> struct elevator_queue *eq;
>
> @@ -142,6 +131,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
>
> __elevator_get(e);
> eq->type = e;
> + eq->tags = tags;
> kobject_init(&eq->kobj, &elv_ktype);
> mutex_init(&eq->sysfs_lock);
> hash_init(eq->hash);
> @@ -166,13 +156,25 @@ static void elevator_exit(struct request_queue *q)
> lockdep_assert_held(&q->elevator_lock);
>
> ioc_clear_queue(q);
> - blk_mq_sched_free_rqs(q);
>
> mutex_lock(&e->sysfs_lock);
> blk_mq_exit_sched(q, e);
> mutex_unlock(&e->sysfs_lock);
> }
>
> +static struct elevator_tags *elevator_tags_lookup(struct elevator_tags **tags,
> + struct request_queue *q, int count)
> +{
> + int i;
> +
> + for (i = 0; i < count; i++) {
> + if (tags[i]->id == q->id)
> + return tags[i];
> + }
> +
> + return NULL;
> +}
> +
> static inline void __elv_rqhash_del(struct request *rq)
> {
> hash_del(&rq->hash);
> @@ -570,7 +572,8 @@ EXPORT_SYMBOL_GPL(elv_unregister);
> * If switching fails, we are most likely running out of memory and not able
> * to restore the old io scheduler, so leaving the io scheduler being none.
> */
> -static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
> +static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx,
> + struct elevator_tags *tags)
> {
> struct elevator_type *new_e = NULL;
> int ret = 0;
> @@ -592,7 +595,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
> }
>
> if (new_e) {
> - ret = blk_mq_init_sched(q, new_e);
> + ret = blk_mq_init_sched(q, new_e, tags);
> if (ret)
> goto out_unfreeze;
> ctx->new = q->elevator;
> @@ -627,8 +630,10 @@ static void elv_exit_and_release(struct request_queue *q)
> elevator_exit(q);
> mutex_unlock(&q->elevator_lock);
> blk_mq_unfreeze_queue(q, memflags);
> - if (e)
> + if (e) {
> + __blk_mq_free_sched_tags(q->tag_set, e->tags);
> kobject_put(&e->kobj);
> + }
> }
>
> static int elevator_change_done(struct request_queue *q,
> @@ -641,6 +646,7 @@ static int elevator_change_done(struct request_queue *q,
> &ctx->old->flags);
>
> elv_unregister_queue(q, ctx->old);
> + __blk_mq_free_sched_tags(q->tag_set, ctx->old->tags);
> kobject_put(&ctx->old->kobj);
> if (enable_wbt)
> wbt_enable_default(q->disk);
> @@ -659,9 +665,17 @@ static int elevator_change_done(struct request_queue *q,
> static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
> {
> unsigned int memflags;
> + struct elevator_tags *tags = NULL;
> + struct blk_mq_tag_set *set = q->tag_set;
> int ret = 0;
>
> - lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
> + lockdep_assert_held(&set->update_nr_hwq_lock);
> +
> + if (strncmp(ctx->name, "none", 4)) {
> + tags = __blk_mq_alloc_sched_tags(set, set->nr_hw_queues, q->id);
> + if (!tags)
> + return -ENOMEM;
> + }
>
> memflags = blk_mq_freeze_queue(q);
> /*
> @@ -676,11 +690,16 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
> blk_mq_cancel_work_sync(q);
> mutex_lock(&q->elevator_lock);
> if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
> - ret = elevator_switch(q, ctx);
> + ret = elevator_switch(q, ctx, tags);
'tags' may be passed via 'ctx' instead of adding a new parameter.
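Something like (untested; the 'et' field name is made up):

        struct elv_change_ctx {
                const char *name;
                bool no_uevent;

                /* pre-allocated sched tags for the new elevator */
                struct elevator_tags *et;

                /* for unregistering old elevator */
                struct elevator_queue *old;
                /* for registering new elevator */
                struct elevator_queue *new;
        };

then elevator_switch() can keep its current signature.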
> mutex_unlock(&q->elevator_lock);
> blk_mq_unfreeze_queue(q, memflags);
> if (!ret)
> ret = elevator_change_done(q, ctx);
> + /*
> + * Free sched tags if it's allocated but we couldn't switch elevator.
> + */
> + if (tags && !ctx->new)
> + __blk_mq_free_sched_tags(set, tags);
Maybe you can let elevator_change_done() cover the failure handling, and
move the above two lines into elevator_change_done().
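For example (untested, assuming 'tags' is carried in ctx->et as
suggested above):

        static int elevator_change_done(struct request_queue *q,
                                        struct elv_change_ctx *ctx)
        {
                int ret = 0;

                /* ... existing ctx->old / ctx->new handling sets ret ... */

                /* the new elevator was never installed: drop the tags */
                if (ctx->et && !ctx->new)
                        __blk_mq_free_sched_tags(q->tag_set, ctx->et);

                return ret;
        }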
>
> return ret;
> }
> @@ -689,24 +708,47 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
> * The I/O scheduler depends on the number of hardware queues, this forces a
> * reattachment when nr_hw_queues changes.
> */
> -void elv_update_nr_hw_queues(struct request_queue *q)
> +void elv_update_nr_hw_queues(struct request_queue *q,
> + struct elevator_tags **elv_tags, int count)
> {
> + struct elevator_tags *tags = NULL;
> + struct blk_mq_tag_set *set = q->tag_set;
> struct elv_change_ctx ctx = {};
> int ret = -ENODEV;
>
> WARN_ON_ONCE(q->mq_freeze_depth == 0);
>
> + /*
> + * Accessing q->elevator without holding q->elevator_lock is safe here
> + * because nr_hw_queue update is protected by set->update_nr_hwq_lock
> + * in the writer context. So, scheduler update/switch code (which
> + * acquires same lock in the reader context) can't run concurrently.
> + */
> + if (q->elevator) {
> + tags = elevator_tags_lookup(elv_tags, q, count);
> + if (!tags) {
> + WARN_ON(1);
> + goto out_unfreeze;
> + }
> + }
> +
> mutex_lock(&q->elevator_lock);
> if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
> ctx.name = q->elevator->type->elevator_name;
>
> /* force to reattach elevator after nr_hw_queue is updated */
> - ret = elevator_switch(q, &ctx);
> + ret = elevator_switch(q, &ctx, tags);
> }
> mutex_unlock(&q->elevator_lock);
> +out_unfreeze:
> blk_mq_unfreeze_queue_nomemrestore(q);
> if (!ret)
> WARN_ON_ONCE(elevator_change_done(q, &ctx));
> + /*
> + * Free sched tags if it's allocated but we couldn't switch elevator.
> + */
> + if (tags && !ctx.new)
> + __blk_mq_free_sched_tags(set, tags);
> }
>
> /*
> diff --git a/block/elevator.h b/block/elevator.h
> index a4de5f9ad790..0b92121005cf 100644
> --- a/block/elevator.h
> +++ b/block/elevator.h
> @@ -23,6 +23,17 @@ enum elv_merge {
> struct blk_mq_alloc_data;
> struct blk_mq_hw_ctx;
>
> +/* Holding context data for changing elevator */
> +struct elv_change_ctx {
> + const char *name;
> + bool no_uevent;
> +
> + /* for unregistering old elevator */
> + struct elevator_queue *old;
> + /* for registering new elevator */
> + struct elevator_queue *new;
> +};
> +
'elv_change_ctx' is only used in block/elevator.c; I'm not sure why you
moved it to the header.
> struct elevator_mq_ops {
> int (*init_sched)(struct request_queue *, struct elevator_queue *);
> void (*exit_sched)(struct elevator_queue *);
> @@ -107,12 +118,27 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq);
> void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
> struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
>
> +struct elevator_tags {
> + /* num. of hardware queues for which tags are allocated */
> + unsigned int nr_hw_queues;
> + /* depth used while allocating tags */
> + unsigned int nr_requests;
> + /* request queue id (q->id) used during elevator_tags_lookup() */
> + int id;
> + union {
> + /* tags shared across all queues */
> + struct blk_mq_tags *shared_tags;
> + /* array of blk_mq_tags pointers, one per hardware queue. */
> + struct blk_mq_tags **tags;
> + } u;
> +};
I feel it is simpler to just save the shared tags in the 1st item of the
array; then you can store the 'struct blk_mq_tags' pointers in a flexible
array member of 'struct elevator_tags', which saves one extra allocation
and a lot of failure-handling code.
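That is, something like (untested):

        struct elevator_tags {
                /* num. of hardware queues for which tags are allocated */
                unsigned int nr_hw_queues;
                /* depth used while allocating tags */
                unsigned int nr_requests;
                /* request queue id (q->id) used during lookup */
                int id;
                /*
                 * sched tags, one per hw queue; in the shared-tags case
                 * only tags[0] is used
                 */
                struct blk_mq_tags *tags[];
        };

        struct elevator_tags *et;

        et = kzalloc(struct_size(et, tags, nr_hw_queues), gfp);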
Thanks,
Ming