From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-block@vger.kernel.org
Cc: ming.lei@redhat.com, hch@lst.de, axboe@kernel.dk,
yi.zhang@redhat.com, czhong@redhat.com, gjoyce@ibm.com
Subject: [PATCHv3 3/4] block: introduce alloc_sched_data and free_sched_data elevator methods
Date: Wed, 29 Oct 2025 16:06:16 +0530 [thread overview]
Message-ID: <20251029103622.205607-4-nilay@linux.ibm.com> (raw)
In-Reply-To: <20251029103622.205607-1-nilay@linux.ibm.com>
The recent lockdep splat [1] highlights a potential deadlock risk
involving ->elevator_lock and ->freeze_lock dependencies on
pcpu_alloc_mutex. The trace shows that the issue occurs when the Kyber scheduler
allocates dynamic memory for its elevator data during initialization.
To address this, introduce two new elevator operation callbacks:
->alloc_sched_data and ->free_sched_data.
When an elevator implements these methods, they are invoked during
scheduler switch before acquiring ->freeze_lock and ->elevator_lock.
This allows safe allocation and deallocation of per-elevator data
without holding locks that could depend on pcpu_alloc_mutex, effectively
breaking the lock dependency chain and avoiding the reported deadlock
scenario.
[1] https://lore.kernel.org/all/CAGVVp+VNW4M-5DZMNoADp6o2VKFhi7KxWpTDkcnVyjO0=-D5+A@mail.gmail.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
block/blk-mq-sched.c | 30 +++++++++++++++++++++++-------
block/blk-mq-sched.h | 25 +++++++++++++++++++++++--
block/elevator.c | 35 +++++++++++++++++++++++++----------
block/elevator.h | 4 ++++
4 files changed, 75 insertions(+), 19 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 6db45b0819e6..4376d0ddbd1e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -428,12 +428,17 @@ void blk_mq_free_sched_tags(struct elevator_tags *et,
}
void blk_mq_free_sched_res(struct elevator_resources *res,
+ struct elevator_type *type,
struct blk_mq_tag_set *set)
{
if (res->et) {
blk_mq_free_sched_tags(res->et, set);
res->et = NULL;
}
+ if (res->data) {
+ blk_mq_free_sched_data(type, res->data);
+ res->data = NULL;
+ }
}
void blk_mq_free_sched_res_batch(struct xarray *elv_tbl,
@@ -458,7 +463,7 @@ void blk_mq_free_sched_res_batch(struct xarray *elv_tbl,
WARN_ON_ONCE(1);
continue;
}
- blk_mq_free_sched_res(&ctx->res, set);
+ blk_mq_free_sched_res(&ctx->res, ctx->type, set);
}
}
}
@@ -540,15 +545,24 @@ struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
return NULL;
}
-int blk_mq_alloc_sched_res(struct elevator_resources *res,
- struct blk_mq_tag_set *set, unsigned int nr_hw_queues)
+int blk_mq_alloc_sched_res(struct request_queue *q,
+ struct elevator_type *type,
+ struct elevator_resources *res,
+ struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues)
{
+ int ret;
+
res->et = blk_mq_alloc_sched_tags(set, nr_hw_queues,
blk_mq_default_nr_requests(set));
if (!res->et)
return -ENOMEM;
- return 0;
+ ret = blk_mq_alloc_sched_data(q, type, &res->data);
+ if (ret)
+ kfree(res->et);
+
+ return ret;
}
int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
@@ -575,19 +589,21 @@ int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
goto out_unwind;
}
- ret = blk_mq_alloc_sched_res(&ctx->res, set,
- nr_hw_queues);
+ ret = blk_mq_alloc_sched_res(q, q->elevator->type,
+ &ctx->res, set, nr_hw_queues);
if (ret)
goto out_unwind;
}
}
return 0;
+
out_unwind:
list_for_each_entry_continue_reverse(q, &set->tag_list, tag_set_list) {
if (q->elevator) {
ctx = xa_load(elv_tbl, q->id);
if (ctx)
- blk_mq_free_sched_res(&ctx->res, set);
+ blk_mq_free_sched_res(&ctx->res,
+ ctx->type, set);
}
}
return ret;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 97204df76def..acd4f1355be6 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -25,8 +25,11 @@ void blk_mq_sched_free_rqs(struct request_queue *q);
struct elevator_tags *blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
unsigned int nr_hw_queues, unsigned int nr_requests);
-int blk_mq_alloc_sched_res(struct elevator_resources *res,
- struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
+int blk_mq_alloc_sched_res(struct request_queue *q,
+ struct elevator_type *type,
+ struct elevator_resources *res,
+ struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues);
int blk_mq_alloc_sched_res_batch(struct xarray *elv_tbl,
struct blk_mq_tag_set *set, unsigned int nr_hw_queues);
int blk_mq_alloc_sched_ctx_batch(struct xarray *elv_tbl,
@@ -35,10 +38,28 @@ void blk_mq_free_sched_ctx_batch(struct xarray *elv_tbl);
void blk_mq_free_sched_tags(struct elevator_tags *et,
struct blk_mq_tag_set *set);
void blk_mq_free_sched_res(struct elevator_resources *res,
+ struct elevator_type *type,
struct blk_mq_tag_set *set);
void blk_mq_free_sched_res_batch(struct xarray *et_table,
struct blk_mq_tag_set *set);
+static inline int blk_mq_alloc_sched_data(struct request_queue *q,
+ struct elevator_type *e, void **data)
+{
+ if (e && e->ops.alloc_sched_data) {
+ *data = e->ops.alloc_sched_data(q);
+ if (!*data)
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static inline void blk_mq_free_sched_data(struct elevator_type *e, void *data)
+{
+ if (e && e->ops.free_sched_data)
+ e->ops.free_sched_data(data);
+}
+
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
diff --git a/block/elevator.c b/block/elevator.c
index d5d89b202fda..8696b2a741b7 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -135,6 +135,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
mutex_init(&eq->sysfs_lock);
hash_init(eq->hash);
eq->et = res->et;
+ eq->elevator_data = res->data;
return eq;
}
@@ -617,7 +618,7 @@ static void elv_exit_and_release(struct elv_change_ctx *ctx,
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
if (e) {
- blk_mq_free_sched_res(&ctx->res, q->tag_set);
+ blk_mq_free_sched_res(&ctx->res, ctx->type, q->tag_set);
kobject_put(&e->kobj);
}
}
@@ -628,12 +629,15 @@ static int elevator_change_done(struct request_queue *q,
int ret = 0;
if (ctx->old) {
- struct elevator_resources res = {.et = ctx->old->et};
+ struct elevator_resources res = {
+ .et = ctx->old->et,
+ .data = ctx->old->elevator_data
+ };
bool enable_wbt = test_bit(ELEVATOR_FLAG_ENABLE_WBT_ON_EXIT,
&ctx->old->flags);
elv_unregister_queue(q, ctx->old);
- blk_mq_free_sched_res(&res, q->tag_set);
+ blk_mq_free_sched_res(&res, ctx->old->type, q->tag_set);
kobject_put(&ctx->old->kobj);
if (enable_wbt)
wbt_enable_default(q->disk);
@@ -658,7 +662,8 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
lockdep_assert_held(&set->update_nr_hwq_lock);
if (strncmp(ctx->name, "none", 4)) {
- ret = blk_mq_alloc_sched_res(&ctx->res, set, set->nr_hw_queues);
+ ret = blk_mq_alloc_sched_res(q, ctx->type, &ctx->res, set,
+ set->nr_hw_queues);
if (ret)
return ret;
}
@@ -681,11 +686,15 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
blk_mq_unfreeze_queue(q, memflags);
if (!ret)
ret = elevator_change_done(q, ctx);
+
+ if (ctx->new) /* switching to new elevator is successful */
+ return ret;
+
/*
* Free sched resource if it's allocated but we couldn't switch elevator.
*/
if (!ctx->new)
- blk_mq_free_sched_res(&ctx->res, set);
+ blk_mq_free_sched_res(&ctx->res, ctx->type, set);
return ret;
}
@@ -711,11 +720,14 @@ void elv_update_nr_hw_queues(struct request_queue *q,
blk_mq_unfreeze_queue_nomemrestore(q);
if (!ret)
WARN_ON_ONCE(elevator_change_done(q, ctx));
+
+ if (ctx->new) /* switching to new elevator is successful */
+ return;
/*
* Free sched resource if it's allocated but we couldn't switch elevator.
*/
if (!ctx->new)
- blk_mq_free_sched_res(&ctx->res, set);
+ blk_mq_free_sched_res(&ctx->res, ctx->type, set);
}
/*
@@ -729,7 +741,6 @@ void elevator_set_default(struct request_queue *q)
.no_uevent = true,
};
int err;
- struct elevator_type *e;
/* now we allow to switch elevator */
blk_queue_flag_clear(QUEUE_FLAG_NO_ELV_SWITCH, q);
@@ -742,8 +753,8 @@ void elevator_set_default(struct request_queue *q)
* have multiple queues or mq-deadline is not available, default
* to "none".
*/
- e = elevator_find_get(ctx.name);
- if (!e)
+ ctx.type = elevator_find_get(ctx.name);
+ if (!ctx.type)
return;
if ((q->nr_hw_queues == 1 ||
@@ -753,7 +764,7 @@ void elevator_set_default(struct request_queue *q)
pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n",
ctx.name, err);
}
- elevator_put(e);
+ elevator_put(ctx.type);
}
void elevator_set_none(struct request_queue *q)
@@ -802,6 +813,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
ctx.name = strstrip(elevator_name);
elv_iosched_load_module(ctx.name);
+ ctx.type = elevator_find_get(ctx.name);
down_read(&set->update_nr_hwq_lock);
if (!blk_queue_no_elv_switch(q)) {
@@ -812,6 +824,9 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
ret = -ENOENT;
}
up_read(&set->update_nr_hwq_lock);
+
+ if (ctx.type)
+ elevator_put(ctx.type);
return ret;
}
diff --git a/block/elevator.h b/block/elevator.h
index 6533f74ad5ef..3ee1d494f48a 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -33,6 +33,8 @@ struct elevator_tags {
};
struct elevator_resources {
+ /* holds elevator data */
+ void *data;
/* holds elevator tags */
struct elevator_tags *et;
};
@@ -58,6 +60,8 @@ struct elevator_mq_ops {
int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
void (*depth_updated)(struct request_queue *);
+ void *(*alloc_sched_data)(struct request_queue *);
+ void (*free_sched_data)(void *);
bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int);
--
2.51.0
next prev parent reply other threads:[~2025-10-29 10:37 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-29 10:36 [PATCHv3 0/4] block: restructure elevator switch path and fix a lockdep splat Nilay Shroff
2025-10-29 10:36 ` [PATCHv3 1/4] block: unify elevator tags and type xarrays into struct elv_change_ctx Nilay Shroff
2025-11-03 3:22 ` Ming Lei
2025-10-29 10:36 ` [PATCHv3 2/4] block: move elevator tags into struct elevator_resources Nilay Shroff
2025-11-03 3:50 ` Ming Lei
2025-11-03 7:03 ` Nilay Shroff
2025-10-29 10:36 ` Nilay Shroff [this message]
2025-11-03 4:08 ` [PATCHv3 3/4] block: introduce alloc_sched_data and free_sched_data elevator methods Ming Lei
2025-11-03 6:55 ` Nilay Shroff
2025-10-29 10:36 ` [PATCHv3 4/4] block: define alloc_sched_data and free_sched_data methods for kyber Nilay Shroff
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20251029103622.205607-4-nilay@linux.ibm.com \
--to=nilay@linux.ibm.com \
--cc=axboe@kernel.dk \
--cc=czhong@redhat.com \
--cc=gjoyce@ibm.com \
--cc=hch@lst.de \
--cc=linux-block@vger.kernel.org \
--cc=ming.lei@redhat.com \
--cc=yi.zhang@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox