From: Nilay Shroff <nilay@linux.ibm.com>
To: linux-block@vger.kernel.org
Cc: ming.lei@redhat.com, hch@lst.de, axboe@kernel.dk,
sth@linux.ibm.com, gjoyce@ibm.com
Subject: [PATCHv3 2/2] block: fix lock dependency between percpu alloc lock and elevator lock
Date: Mon, 16 Jun 2025 23:02:26 +0530
Message-ID: <20250616173233.3803824-3-nilay@linux.ibm.com>
In-Reply-To: <20250616173233.3803824-1-nilay@linux.ibm.com>
Recent lockdep reports [1] have revealed a potential deadlock caused by a
lock dependency between the percpu allocator lock and the elevator lock.
This issue can be avoided by ensuring that the allocation and release of
scheduler tags (sched_tags) are performed outside the elevator lock.
Furthermore, the queue does not need to remain frozen during these
operations.
To address this, move all sched_tags allocations and deallocations outside
of both the ->elevator_lock and the ->freeze_lock. Moreover, since the
lifetimes of the elevator queue and its associated sched_tags are closely
tied, the allocated sched_tags are now stored in the elevator queue
structure. Then, during the actual elevator switch (which runs under
->freeze_lock and ->elevator_lock), the pre-allocated sched_tags are
assigned to the appropriate q->hctx. Once the elevator switch is complete
and the locks are released, the old elevator queue and its associated
sched_tags are freed.
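For illustration, the resulting flow in elevator_change() looks roughly
like this (a simplified sketch that elides error paths and the
switch-to-none case; see the diff below for the real code):

	/* step 1: allocate sched_tags with no locks held */
	tags = __blk_mq_alloc_sched_tags(set, set->nr_hw_queues, q->id);
	if (!tags)
		return -ENOMEM;

	/*
	 * step 2: the switch runs under ->freeze_lock and ->elevator_lock,
	 * but it only assigns the pre-allocated tags, it never allocates
	 */
	memflags = blk_mq_freeze_queue(q);
	mutex_lock(&q->elevator_lock);
	ret = elevator_switch(q, ctx, tags);
	mutex_unlock(&q->elevator_lock);
	blk_mq_unfreeze_queue(q, memflags);

	/* step 3: free the unused (or replaced) sched_tags, again unlocked */
	if (tags && !ctx->new)
		__blk_mq_free_sched_tags(set, tags);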
This restructuring ensures that sched_tags memory management occurs
entirely outside of the ->elevator_lock and ->freeze_lock context,
eliminating the lock dependency problem seen during scheduler updates
or hardware queue updates.
[1] https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Reported-by: Stefan Haberland <sth@linux.ibm.com>
Closes: https://lore.kernel.org/all/0659ea8d-a463-47c8-9180-43c719e106eb@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
block/blk-mq-sched.c | 244 +++++++++++++++++++++++++++++++------------
block/blk-mq-sched.h | 13 ++-
block/blk-mq.c | 14 ++-
block/blk.h | 4 +-
block/elevator.c | 82 +++++++++++----
block/elevator.h | 28 ++++-
6 files changed, 296 insertions(+), 89 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index d914eb9d61a6..dd00d8c9fb78 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -374,27 +374,17 @@ bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
-static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
+static void blk_mq_init_sched_tags(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
+ unsigned int hctx_idx,
+ struct elevator_queue *eq)
{
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
hctx->sched_tags = q->sched_shared_tags;
- return 0;
+ return;
}
- hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
- q->nr_requests);
-
- if (!hctx->sched_tags)
- return -ENOMEM;
- return 0;
-}
-
-static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
-{
- blk_mq_free_rq_map(queue->sched_shared_tags);
- queue->sched_shared_tags = NULL;
+ hctx->sched_tags = eq->tags->u.tags[hctx_idx];
}
/* called in queue's release handler, tagset has gone away */
@@ -403,35 +393,18 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int fla
struct blk_mq_hw_ctx *hctx;
unsigned long i;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->sched_tags) {
- if (!blk_mq_is_shared_tags(flags))
- blk_mq_free_rq_map(hctx->sched_tags);
- hctx->sched_tags = NULL;
- }
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ hctx->sched_tags = NULL;
if (blk_mq_is_shared_tags(flags))
- blk_mq_exit_sched_shared_tags(q);
+ q->sched_shared_tags = NULL;
}
-static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
+static void blk_mq_init_sched_shared_tags(struct request_queue *queue,
+ struct elevator_queue *eq)
{
- struct blk_mq_tag_set *set = queue->tag_set;
-
- /*
- * Set initial depth at max so that we don't need to reallocate for
- * updating nr_requests.
- */
- queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
- BLK_MQ_NO_HCTX_IDX,
- MAX_SCHED_RQ);
- if (!queue->sched_shared_tags)
- return -ENOMEM;
-
+ queue->sched_shared_tags = eq->tags->u.shared_tags;
blk_mq_tag_update_sched_shared_tags(queue);
-
- return 0;
}
void blk_mq_sched_reg_debugfs(struct request_queue *q)
@@ -458,8 +431,165 @@ void blk_mq_sched_unreg_debugfs(struct request_queue *q)
mutex_unlock(&q->debugfs_mutex);
}
+void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+ struct elevator_tags *tags)
+{
+ unsigned long i;
+
+ if (!tags)
+ return;
+
+ if (blk_mq_is_shared_tags(set->flags)) {
+ if (tags->u.shared_tags) {
+ blk_mq_free_rqs(set, tags->u.shared_tags,
+ BLK_MQ_NO_HCTX_IDX);
+ blk_mq_free_rq_map(tags->u.shared_tags);
+ }
+ goto out;
+ }
+
+ if (!tags->u.tags)
+ goto out;
+
+ for (i = 0; i < tags->nr_hw_queues; i++) {
+ if (tags->u.tags[i]) {
+ blk_mq_free_rqs(set, tags->u.tags[i], i);
+ blk_mq_free_rq_map(tags->u.tags[i]);
+ }
+ }
+
+ kfree(tags->u.tags);
+out:
+ kfree(tags);
+}
+
+void blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+ struct elevator_tags **tags)
+{
+ unsigned long i, count = 0;
+ struct request_queue *q;
+
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+
+ if (!tags)
+ return;
+
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is safe
+ * because we're holding here set->update_nr_hwq_lock in the writer
+ * context. So, scheduler update/switch code (which acquires the same
+ * lock but in the reader context) can't run concurrently.
+ */
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (q->elevator)
+ count++;
+ }
+
+ for (i = 0; i < count; i++)
+ __blk_mq_free_sched_tags(set, tags[i]);
+
+ kfree(tags);
+}
+
+struct elevator_tags *__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues, int id)
+{
+ unsigned int i;
+ struct elevator_tags *tags;
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+ tags = kcalloc(1, sizeof(struct elevator_tags), gfp);
+ if (!tags)
+ return NULL;
+
+ tags->id = id;
+ /*
+ * Default to double of smaller one between hw queue_depth and
+ * 128, since we don't split into sync/async like the old code
+ * did. Additionally, this is a per-hw queue depth.
+ */
+ tags->nr_requests = 2 * min_t(unsigned int, set->queue_depth,
+ BLKDEV_DEFAULT_RQ);
+
+ if (blk_mq_is_shared_tags(set->flags)) {
+
+ tags->u.shared_tags = blk_mq_alloc_map_and_rqs(set,
+ BLK_MQ_NO_HCTX_IDX,
+ MAX_SCHED_RQ);
+ if (!tags->u.shared_tags)
+ goto out;
+
+ return tags;
+ }
+
+ tags->u.tags = kcalloc(nr_hw_queues, sizeof(struct blk_mq_tags *), gfp);
+ if (!tags->u.tags)
+ goto out;
+
+ tags->nr_hw_queues = nr_hw_queues;
+ for (i = 0; i < nr_hw_queues; i++) {
+ tags->u.tags[i] = blk_mq_alloc_map_and_rqs(set, i,
+ tags->nr_requests);
+ if (!tags->u.tags[i])
+ goto out;
+ }
+
+ return tags;
+
+out:
+ __blk_mq_free_sched_tags(set, tags);
+ return NULL;
+}
+
+int blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues,
+ struct elevator_tags ***elv_tags)
+{
+ unsigned long idx = 0, count = 0;
+ struct request_queue *q;
+ struct elevator_tags **tags;
+ gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+ lockdep_assert_held_write(&set->update_nr_hwq_lock);
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is safe
+ * because we're holding here set->update_nr_hwq_lock in the writer
+ * context. So, scheduler update/switch code (which acquires the same
+ * lock but in the reader context) can't run concurrently.
+ */
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (q->elevator)
+ count++;
+ }
+
+ if (!count)
+ return 0;
+
+ tags = kcalloc(count, sizeof(struct elevator_tags *), gfp);
+ if (!tags)
+ return -ENOMEM;
+
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (q->elevator) {
+ tags[idx] = __blk_mq_alloc_sched_tags(set, nr_hw_queues,
+ q->id);
+ if (!tags[idx])
+ goto out;
+
+ idx++;
+ }
+ }
+
+ *elv_tags = tags;
+ return count;
+out:
+ blk_mq_free_sched_tags(set, tags);
+ return -ENOMEM;
+}
+
/* caller must have a reference to @e, will grab another one if successful */
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *tags)
{
unsigned int flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
@@ -467,40 +597,26 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
unsigned long i;
int ret;
- /*
- * Default to double of smaller one between hw queue_depth and 128,
- * since we don't split into sync/async like the old code did.
- * Additionally, this is a per-hw queue depth.
- */
- q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
- BLKDEV_DEFAULT_RQ);
-
- eq = elevator_alloc(q, e);
+ eq = elevator_alloc(q, e, tags);
if (!eq)
return -ENOMEM;
- if (blk_mq_is_shared_tags(flags)) {
- ret = blk_mq_init_sched_shared_tags(q);
- if (ret)
- return ret;
- }
+ q->nr_requests = tags->nr_requests;
- queue_for_each_hw_ctx(q, hctx, i) {
- ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
- if (ret)
- goto err_free_map_and_rqs;
- }
+ if (blk_mq_is_shared_tags(flags))
+ blk_mq_init_sched_shared_tags(q, eq);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_init_sched_tags(q, hctx, i, eq);
ret = e->ops.init_sched(q, eq);
if (ret)
- goto err_free_map_and_rqs;
+ goto out;
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i);
if (ret) {
- eq = q->elevator;
- blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
@@ -509,10 +625,8 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
}
return 0;
-err_free_map_and_rqs:
- blk_mq_sched_free_rqs(q);
+out:
blk_mq_sched_tags_teardown(q, flags);
-
kobject_put(&eq->kobj);
q->elevator = NULL;
return ret;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 1326526bb733..01c43192b98a 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -18,7 +18,18 @@ void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
-int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
+void __blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+ struct elevator_tags *tags);
+void blk_mq_free_sched_tags(struct blk_mq_tag_set *set,
+ struct elevator_tags **elv_t);
+int blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues,
+ struct elevator_tags ***tags);
+struct elevator_tags *__blk_mq_alloc_sched_tags(struct blk_mq_tag_set *set,
+ unsigned int nr_hw_queues, int id);
+
+int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e,
+ struct elevator_tags *elv_t);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_rqs(struct request_queue *q);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4806b867e37d..b8b616c79742 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4970,6 +4970,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
int nr_hw_queues)
{
struct request_queue *q;
+ int count;
+ struct elevator_tags **elv_tags = NULL;
int prev_nr_hw_queues = set->nr_hw_queues;
unsigned int memflags;
int i;
@@ -4984,6 +4986,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
+
+ count = blk_mq_alloc_sched_tags(set, nr_hw_queues, &elv_tags);
+ if (count < 0)
+ goto out;
+
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
blk_mq_sysfs_unregister_hctxs(q);
@@ -4995,6 +5002,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) {
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue_nomemrestore(q);
+
+ blk_mq_free_sched_tags(set, elv_tags);
+ elv_tags = NULL;
goto reregister;
}
@@ -5019,7 +5029,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
/* elv_update_nr_hw_queues() unfreeze queue for us */
list_for_each_entry(q, &set->tag_list, tag_set_list)
- elv_update_nr_hw_queues(q);
+ elv_update_nr_hw_queues(q, elv_tags, count);
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -5029,6 +5039,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
blk_mq_remove_hw_queues_cpuhp(q);
blk_mq_add_hw_queues_cpuhp(q);
}
+ kfree(elv_tags);
+out:
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe656..cd50305da84e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -10,6 +10,7 @@
#include <linux/timekeeping.h>
#include <xen/xen.h>
#include "blk-crypto-internal.h"
+#include "elevator.h"
struct elevator_type;
@@ -321,7 +322,8 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
bool blk_insert_flush(struct request *rq);
-void elv_update_nr_hw_queues(struct request_queue *q);
+void elv_update_nr_hw_queues(struct request_queue *q,
+ struct elevator_tags **elv_tags, int count);
void elevator_set_default(struct request_queue *q);
void elevator_set_none(struct request_queue *q);
diff --git a/block/elevator.c b/block/elevator.c
index ab22542e6cf0..849dbe347cb2 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -45,17 +45,6 @@
#include "blk-wbt.h"
#include "blk-cgroup.h"
-/* Holding context data for changing elevator */
-struct elv_change_ctx {
- const char *name;
- bool no_uevent;
-
- /* for unregistering old elevator */
- struct elevator_queue *old;
- /* for registering new elevator */
- struct elevator_queue *new;
-};
-
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -132,7 +121,7 @@ static struct elevator_type *elevator_find_get(const char *name)
static const struct kobj_type elv_ktype;
struct elevator_queue *elevator_alloc(struct request_queue *q,
- struct elevator_type *e)
+ struct elevator_type *e, struct elevator_tags *tags)
{
struct elevator_queue *eq;
@@ -142,6 +131,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
__elevator_get(e);
eq->type = e;
+ eq->tags = tags;
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
hash_init(eq->hash);
@@ -166,13 +156,25 @@ static void elevator_exit(struct request_queue *q)
lockdep_assert_held(&q->elevator_lock);
ioc_clear_queue(q);
- blk_mq_sched_free_rqs(q);
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
}
+static struct elevator_tags *elevator_tags_lookup(struct elevator_tags **tags,
+ struct request_queue *q, int count)
+{
+ int i;
+
+ for (i = 0; i < count; i++) {
+ if (tags[i]->id == q->id)
+ return tags[i];
+ }
+
+ return NULL;
+}
+
static inline void __elv_rqhash_del(struct request *rq)
{
hash_del(&rq->hash);
@@ -570,7 +572,8 @@ EXPORT_SYMBOL_GPL(elv_unregister);
* If switching fails, we are most likely running out of memory and not able
* to restore the old io scheduler, so leaving the io scheduler being none.
*/
-static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
+static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx,
+ struct elevator_tags *tags)
{
struct elevator_type *new_e = NULL;
int ret = 0;
@@ -592,7 +595,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
}
if (new_e) {
- ret = blk_mq_init_sched(q, new_e);
+ ret = blk_mq_init_sched(q, new_e, tags);
if (ret)
goto out_unfreeze;
ctx->new = q->elevator;
@@ -627,8 +630,10 @@ static void elv_exit_and_release(struct request_queue *q)
elevator_exit(q);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
- if (e)
+ if (e) {
+ __blk_mq_free_sched_tags(q->tag_set, e->tags);
kobject_put(&e->kobj);
+ }
}
static int elevator_change_done(struct request_queue *q,
@@ -641,6 +646,7 @@ static int elevator_change_done(struct request_queue *q,
&ctx->old->flags);
elv_unregister_queue(q, ctx->old);
+ __blk_mq_free_sched_tags(q->tag_set, ctx->old->tags);
kobject_put(&ctx->old->kobj);
if (enable_wbt)
wbt_enable_default(q->disk);
@@ -659,9 +665,17 @@ static int elevator_change_done(struct request_queue *q,
static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
{
unsigned int memflags;
+ struct elevator_tags *tags = NULL;
+ struct blk_mq_tag_set *set = q->tag_set;
int ret = 0;
- lockdep_assert_held(&q->tag_set->update_nr_hwq_lock);
+ lockdep_assert_held(&set->update_nr_hwq_lock);
+
+ if (strncmp(ctx->name, "none", 4)) {
+ tags = __blk_mq_alloc_sched_tags(set, set->nr_hw_queues, q->id);
+ if (!tags)
+ return -ENOMEM;
+ }
memflags = blk_mq_freeze_queue(q);
/*
@@ -676,11 +690,16 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
blk_mq_cancel_work_sync(q);
mutex_lock(&q->elevator_lock);
if (!(q->elevator && elevator_match(q->elevator->type, ctx->name)))
- ret = elevator_switch(q, ctx);
+ ret = elevator_switch(q, ctx, tags);
mutex_unlock(&q->elevator_lock);
blk_mq_unfreeze_queue(q, memflags);
if (!ret)
ret = elevator_change_done(q, ctx);
+ /*
+ * Free sched tags if it's allocated but we couldn't switch elevator.
+ */
+ if (tags && !ctx->new)
+ __blk_mq_free_sched_tags(set, tags);
return ret;
}
@@ -689,24 +708,47 @@ static int elevator_change(struct request_queue *q, struct elv_change_ctx *ctx)
* The I/O scheduler depends on the number of hardware queues, this forces a
* reattachment when nr_hw_queues changes.
*/
-void elv_update_nr_hw_queues(struct request_queue *q)
+void elv_update_nr_hw_queues(struct request_queue *q,
+ struct elevator_tags **elv_tags, int count)
{
+ struct elevator_tags *tags = NULL;
+ struct blk_mq_tag_set *set = q->tag_set;
struct elv_change_ctx ctx = {};
int ret = -ENODEV;
WARN_ON_ONCE(q->mq_freeze_depth == 0);
+ /*
+ * Accessing q->elevator without holding q->elevator_lock is safe here
+ * because nr_hw_queue update is protected by set->update_nr_hwq_lock
+ * in the writer context. So, scheduler update/switch code (which
+ * acquires same lock in the reader context) can't run concurrently.
+ */
+ if (q->elevator) {
+ tags = elevator_tags_lookup(elv_tags, q, count);
+ if (!tags) {
+ WARN_ON(1);
+ goto out_unfreeze;
+ }
+ }
+
mutex_lock(&q->elevator_lock);
if (q->elevator && !blk_queue_dying(q) && blk_queue_registered(q)) {
ctx.name = q->elevator->type->elevator_name;
/* force to reattach elevator after nr_hw_queue is updated */
- ret = elevator_switch(q, &ctx);
+ ret = elevator_switch(q, &ctx, tags);
}
mutex_unlock(&q->elevator_lock);
+out_unfreeze:
blk_mq_unfreeze_queue_nomemrestore(q);
if (!ret)
WARN_ON_ONCE(elevator_change_done(q, &ctx));
+ /*
+ * Free sched tags if it's allocated but we couldn't switch elevator.
+ */
+ if (tags && !ctx.new)
+ __blk_mq_free_sched_tags(set, tags);
}
/*
diff --git a/block/elevator.h b/block/elevator.h
index a4de5f9ad790..0b92121005cf 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -23,6 +23,17 @@ enum elv_merge {
struct blk_mq_alloc_data;
struct blk_mq_hw_ctx;
+/* Holding context data for changing elevator */
+struct elv_change_ctx {
+ const char *name;
+ bool no_uevent;
+
+ /* for unregistering old elevator */
+ struct elevator_queue *old;
+ /* for registering new elevator */
+ struct elevator_queue *new;
+};
+
struct elevator_mq_ops {
int (*init_sched)(struct request_queue *, struct elevator_queue *);
void (*exit_sched)(struct elevator_queue *);
@@ -107,12 +118,27 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq);
void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
+struct elevator_tags {
+ /* num. of hardware queues for which tags are allocated */
+ unsigned int nr_hw_queues;
+ /* depth used while allocating tags */
+ unsigned int nr_requests;
+ /* request queue id (q->id) used during elevator_tags_lookup() */
+ int id;
+ union {
+ /* tags shared across all queues */
+ struct blk_mq_tags *shared_tags;
+ /* array of blk_mq_tags pointers, one per hardware queue. */
+ struct blk_mq_tags **tags;
+ } u;
+};
/*
* each queue has an elevator_queue associated with it
*/
struct elevator_queue
{
struct elevator_type *type;
+ struct elevator_tags *tags;
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
@@ -153,7 +179,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
extern bool elv_bio_merge_ok(struct request *, struct bio *);
extern struct elevator_queue *elevator_alloc(struct request_queue *,
- struct elevator_type *);
+ struct elevator_type *, struct elevator_tags *);
/*
* Helper functions.
--
2.49.0