From: Ming Lei <ming.lei@redhat.com>
To: Nilay Shroff <nilay@linux.ibm.com>
Cc: linux-block@vger.kernel.org, hch@lst.de, axboe@kernel.dk,
yi.zhang@redhat.com, czhong@redhat.com, gjoyce@ibm.com
Subject: Re: [PATCHv3 3/4] block: introduce alloc_sched_data and free_sched_data elevator methods
Date: Mon, 3 Nov 2025 12:08:35 +0800 [thread overview]
Message-ID: <aQgqw-DX1A3R3wuN@fedora> (raw)
In-Reply-To: <20251029103622.205607-4-nilay@linux.ibm.com>
On Wed, Oct 29, 2025 at 04:06:16PM +0530, Nilay Shroff wrote:
> The recent lockdep splat [1] highlights a potential deadlock risk
> involving ->elevator_lock and ->freeze_lock dependencies on
> pcpu_alloc_mutex. The trace shows that the issue occurs when the Kyber scheduler
> allocates dynamic memory for its elevator data during initialization.
>
> To address this, introduce two new elevator operation callbacks:
> ->alloc_sched_data and ->free_sched_data.
>
> When an elevator implements these methods, they are invoked during
> scheduler switch before acquiring ->freeze_lock and ->elevator_lock.
> This allows safe allocation and deallocation of per-elevator data
> without holding locks that could depend on pcpu_alloc_mutex, effectively
> breaking the lock dependency chain and avoiding the reported deadlock
> scenario.
>
> [1] https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/
>
> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
> ---
> block/blk-mq-sched.c | 30 +++++++++++++++++++++++-------
> block/blk-mq-sched.h | 25 +++++++++++++++++++++++--
> block/elevator.c | 35 +++++++++++++++++++++++++----------
> block/elevator.h | 4 ++++
> 4 files changed, 75 insertions(+), 19 deletions(-)
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 6db45b0819e6..4376d0ddbd1e 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -428,12 +428,17 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
> }
>
> void blk_mq_free_sched_res(struct elevator_resources *res,
> + struct elevator_type *type,
> struct blk_mq_tag_set *set)
> {
> if (res->et) {
> blk_mq_free_sched_tags(res->et, set);
> res->et = NULL;
> }
> + if (res->data) {
> + blk_mq_free_sched_data(type, res->data);
> + res->data = NULL;
> + }
> }
>
> void blk_mq_free_sched_res_batch(struct xarray *elv_tbl,
> @@ -458,7 +463,7 @@ void blk_mq_free_sched_res_batch(struct xarray *elv_tbl,
> WARN_ON_ONCE(1);
> continue;
> }
> - blk_mq_free_sched_res(&ctx->res, set);
> + blk_mq_free_sched_res(&ctx->res, ctx->type, set);
> }
> }
> }
> @@ -540,15 +545,24 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> return NULL;
> }
>
> -int blk_mq_alloc_sched_res(struct elevator_resources *res,
> - struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
> +int blk_mq_alloc_sched_res(struct request_queue *q,
> + struct elevator_type *type,
> + struct elevator_resources *res,
> + struct blk_mq_tag_set *set,
> + unsigned int nr_hw_queues)
> {
> + int ret;
> +
> res->et = blk_mq_alloc_sched_tags(set, nr_hw_queues,
> blk_mq_default_nr_requests(set));
> if (!res->et)
> return -ENOMEM;
>
> - return 0;
> + ret = blk_mq_alloc_sched_data(q, type, &res->data);
> + if (ret)
> + kfree(res->et);
Use blk_mq_free_sched_res() instead of kfree() here to avoid a memory leak.
> +
> + return ret;
> }
>
> int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
> @@ -575,19 +589,21 @@ int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
> goto out_unwind;
> }
>
> - ret = blk_mq_alloc_sched_res(&ctx->res, set,
> - nr_hw_queues);
> + ret = blk_mq_alloc_sched_res(q, q->elevator->type,
> + &ctx->res, set, nr_hw_queues);
> if (ret)
> goto out_unwind;
> }
> }
> return 0;
> +
> out_unwind:
> list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
> if (q->elevator) {
> ctx = xa_load(elv_tbl, q->id);
> if (ctx)
> - blk_mq_free_sched_res(&ctx->res, set);
> + blk_mq_free_sched_res(&ctx->res,
> + ctx->type, set);
> }
> }
> return ret;
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 97204df76def..acd4f1355be6 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -25,8 +25,11 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
>
> struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
> unsigned int nr_hw_queues, unsigned int nr_requests);
> -int blk_mq_alloc_sched_res(struct elevator_resources *res,
> - struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
> +int blk_mq_alloc_sched_res(struct request_queue *q,
> + struct elevator_type *type,
> + struct elevator_resources *res,
> + struct blk_mq_tag_set *set,
> + unsigned int nr_hw_queues);
> int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
> struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
> int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl,
> @@ -35,10 +38,28 @@ void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl);
> void blk_mq_free_sched_tags(struct elevator_tags *et,
> struct blk_mq_tag_set *set);
> void blk_mq_free_sched_res(struct elevator_resources *res,
> + struct elevator_type *type,
> struct blk_mq_tag_set *set);
> void blk_mq_free_sched_res_batch(struct xarray *et_table,
> struct blk_mq_tag_set *set);
>
> +static inline int blk_mq_alloc_sched_data(struct request_queue *q,
> + struct elevator_type *e, void **data)
> +{
> + if (e && e->ops.alloc_sched_data) {
> + *data = e->ops.alloc_sched_data(q);
> + if (!*data)
> + return -ENOMEM;
> + }
> + return 0;
> +}
> +
> +static inline void blk_mq_free_sched_data(struct elevator_type *e, void *data)
> +{
> + if (e && e->ops.free_sched_data)
> + e->ops.free_sched_data(data);
> +}
> +
> static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
> {
> if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
> diff --git a/block/elevator.c b/block/elevator.c
> index d5d89b202fda..8696b2a741b7 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -135,6 +135,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
> mutex_init(&eq->sysfs_lock);
> hash_init(eq->hash);
> eq->et = res->et;
> + eq->elevator_data = res->data;
>
> return eq;
> }
> @@ -617,7 +618,7 @@ static void elv_exit_and_release(struct elv_change_ctx *ctx,
> mutex_unlock(&q->elevator_lock);
> blk_mq_unfreeze_queue(q, memflags);
> if (e) {
> - blk_mq_free_sched_res(&ctx->res, q->tag_set);
> + blk_mq_free_sched_res(&ctx->res, ctx->type, q->tag_set);
> kobject_put(&e->kobj);
> }
> }
> @@ -628,12 +629,15 @@ static int elevator_change_done(struct request_queue *q,
> int ret = 0;
>
> if (ctx->old) {
> - struct elevator_resources res = {.et = ctx->old->et};
> + struct elevator_resources res = {
> + .et = ctx->old->et,
> + .data = ctx->old->elevator_data
> + };
> bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT,
> &ctx->old->flags);
>
> elv_unregister_queue(q, ctx->old);
> - blk_mq_free_sched_res(&res, q->tag_set);
> + blk_mq_free_sched_res(&res, ctx->old->type, q->tag_set);
> kobject_put(&ctx->old->kobj);
> if (enable_wbt)
> wbt_enable_default(q->disk);
> @@ -658,7 +662,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
> lockdep_assert_held(&set->update_nr_hwq_lock);
>
> if (strncmp(ctx->name, "none", 4)) {
> - ret = blk_mq_alloc_sched_res(&ctx->res, set, set->nr_hw_queues);
> + ret = blk_mq_alloc_sched_res(q, ctx->type, &ctx->res, set,
> + set->nr_hw_queues);
> if (ret)
> return ret;
> }
> @@ -681,11 +686,15 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
> blk_mq_unfreeze_queue(q, memflags);
> if (!ret)
> ret = elevator_change_done(q, ctx);
> +
> + if (ctx->new) /* switching to new elevator is successful */
> + return ret;
> +
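Not necessary: the existing `if (!ctx->new)` check below already skips the
free when the switch to the new elevator succeeded.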
> /*
> * Free sched resource if it's allocated but we couldn't switch elevator.
> */
> if (!ctx->new)
> - blk_mq_free_sched_res(&ctx->res, set);
> + blk_mq_free_sched_res(&ctx->res, ctx->type, set);
>
> return ret;
> }
> @@ -711,11 +720,14 @@ void elv_update_nr_hw_queues(struct request_queue *q,
> blk_mq_unfreeze_queue_nomemrestore(q);
> if (!ret)
> WARN_ON_ONCE(elevator_change_done(q, ctx));
> +
> + if (ctx->new) /* switching to new elevator is successful */
> + return;
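Not necessary: the existing `if (!ctx->new)` check below already skips the
free when the switch to the new elevator succeeded.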
> /*
> * Free sched resource if it's allocated but we couldn't switch elevator.
> */
> if (!ctx->new)
> - blk_mq_free_sched_res(&ctx->res, set);
> + blk_mq_free_sched_res(&ctx->res, ctx->type, set);
> }
>
> /*
> @@ -729,7 +741,6 @@ void elevator_set_default(struct request_queue *q)
> .no_uevent = true,
> };
> int err;
> - struct elevator_type *e;
>
> /* now we allow to switch elevator */
> blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
> @@ -742,8 +753,8 @@ void elevator_set_default(struct request_queue *q)
> * have multiple queues or mq-deadline is not available, default
> * to "none".
> */
> - e = elevator_find_get(ctx.name);
> - if (!e)
> + ctx.type = elevator_find_get(ctx.name);
> + if (!ctx.type)
> return;
>
> if ((q->nr_hw_queues == 1 ||
> @@ -753,7 +764,7 @@ void elevator_set_default(struct request_queue *q)
> pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n",
> ctx.name, err);
> }
> - elevator_put(e);
> + elevator_put(ctx.type);
> }
>
> void elevator_set_none(struct request_queue *q)
> @@ -802,6 +813,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
> ctx.name = strstrip(elevator_name);
>
> elv_iosched_load_module(ctx.name);
> + ctx.type = elevator_find_get(ctx.name);
>
> down_read(&set->update_nr_hwq_lock);
> if (!blk_queue_no_elv_switch(q)) {
> @@ -812,6 +824,9 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
> ret = -ENOENT;
> }
> up_read(&set->update_nr_hwq_lock);
> +
> + if (ctx.type)
> + elevator_put(ctx.type);
> return ret;
The above change can be moved into elevator_change() in a standalone
patch, so that elv_iosched_store() is covered too.
Thanks,
Ming
next prev parent reply other threads:[~2025-11-03 4:08 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-29 10:36 [PATCHv3 0/4] block: restructure elevator switch path and fix a lockdep splat Nilay Shroff
2025-10-29 10:36 ` [PATCHv3 1/4] block: unify elevator tags and type xarrays into struct elv_change_ctx Nilay Shroff
2025-11-03 3:22 ` Ming Lei
2025-10-29 10:36 ` [PATCHv3 2/4] block: move elevator tags into struct elevator_resources Nilay Shroff
2025-11-03 3:50 ` Ming Lei
2025-11-03 7:03 ` Nilay Shroff
2025-10-29 10:36 ` [PATCHv3 3/4] block: introduce alloc_sched_data and free_sched_data elevator methods Nilay Shroff
2025-11-03 4:08 ` Ming Lei [this message]
2025-11-03 6:55 ` Nilay Shroff
2025-10-29 10:36 ` [PATCHv3 4/4] block: define alloc_sched_data and free_sched_data methods for kyber Nilay Shroff
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aQgqw-DX1A3R3wuN@fedora \
--to=ming.lei@redhat.com \
--cc=axboe@kernel.dk \
--cc=czhong@redhat.com \
--cc=gjoyce@ibm.com \
--cc=hch@lst.de \
--cc=linux-block@vger.kernel.org \
--cc=nilay@linux.ibm.com \
--cc=yi.zhang@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox