From: Sumit Saxena <sumit.saxena@broadcom.com>
To: martin.petersen@oracle.com, axboe@kernel.dk
Cc: linux-scsi@vger.kernel.org, linux-block@vger.kernel.org,
mpi3mr-linuxdrv.pdl@broadcom.com,
Bart Van Assche <bvanassche@acm.org>,
Sumit Saxena <sumit.saxena@broadcom.com>
Subject: [PATCH v2 2/3] block: drop shared-tag fairness throttling
Date: Mon, 20 Apr 2026 17:08:38 +0530 [thread overview]
Message-ID: <20260420113846.1401374-3-sumit.saxena@broadcom.com> (raw)
In-Reply-To: <20260420113846.1401374-1-sumit.saxena@broadcom.com>
From: Bart Van Assche <bvanassche@acm.org>
Original patch [1] by Bart Van Assche; this version is rebased onto the
current tree. In testing it improves IOPS by roughly 16-18% by removing
the fair-sharing throttle on shared tag queues.
This patch removes the following code and structure members:
- The function hctx_may_queue().
- blk_mq_hw_ctx.nr_active and request_queue.nr_active_requests_shared_tags
and also all the code that modifies these two member variables.
[1]: https://lore.kernel.org/linux-block/20240529213921.3166462-1-bvanassche@acm.org/
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Sumit Saxena <sumit.saxena@broadcom.com>
---
block/blk-core.c | 2 -
block/blk-mq-debugfs.c | 22 ++++++++-
block/blk-mq-tag.c | 4 --
block/blk-mq.c | 17 +------
block/blk-mq.h | 100 -----------------------------------------
include/linux/blk-mq.h | 6 ---
include/linux/blkdev.h | 2 -
7 files changed, 22 insertions(+), 131 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 474700ffaa1c..430907b26fc4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -421,8 +421,6 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
q->node = node_id;
- atomic_set(&q->nr_active_requests_shared_tags, 0);
-
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 28167c9baa55..6ef922d7abc1 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -467,11 +467,31 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
return 0;
}
+struct count_active_params {
+ struct blk_mq_hw_ctx *hctx;
+ int *active;
+};
+
+static bool hctx_count_active(struct request *rq, void *data)
+{
+ const struct count_active_params *params = data;
+
+ if (rq->mq_hctx == params->hctx)
+ (*params->active)++;
+
+ return true;
+}
+
static int hctx_active_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
+ int active = 0;
+ struct count_active_params params = { .hctx = hctx, .active = &active };
+
+ blk_mq_all_tag_iter(hctx->sched_tags ?: hctx->tags, hctx_count_active,
+ &params);
- seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
+ seq_printf(m, "%d\n", active);
return 0;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 33946cdb5716..bfd27cc6249b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -109,10 +109,6 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
struct sbitmap_queue *bt)
{
- if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
- !hctx_may_queue(data->hctx, bt))
- return BLK_MQ_NO_TAG;
-
if (data->shallow_depth)
return sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9af8c3dec3f6..3c54000bc554 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -489,8 +489,6 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
}
} while (data->nr_tags > nr);
- if (!(data->rq_flags & RQF_SCHED_TAGS))
- blk_mq_add_active_requests(data->hctx, nr);
/* caller already holds a reference, add for remainder */
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
data->nr_tags -= nr;
@@ -587,8 +585,6 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
goto retry;
}
- if (!(data->rq_flags & RQF_SCHED_TAGS))
- blk_mq_inc_active_requests(data->hctx);
rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
return rq;
@@ -763,8 +759,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
- if (!(data.rq_flags & RQF_SCHED_TAGS))
- blk_mq_inc_active_requests(data.hctx);
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
rq->__data_len = 0;
@@ -807,10 +801,8 @@ static void __blk_mq_free_request(struct request *rq)
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
- if (rq->tag != BLK_MQ_NO_TAG) {
- blk_mq_dec_active_requests(hctx);
+ if (rq->tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
- }
if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
@@ -1188,8 +1180,6 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
{
struct request_queue *q = hctx->queue;
- blk_mq_sub_active_requests(hctx, nr_tags);
-
blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
percpu_ref_put_many(&q->q_usage_counter, nr_tags);
}
@@ -1875,9 +1865,6 @@ bool __blk_mq_alloc_driver_tag(struct request *rq)
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
bt = &rq->mq_hctx->tags->breserved_tags;
tag_offset = 0;
- } else {
- if (!hctx_may_queue(rq->mq_hctx, bt))
- return false;
}
tag = __sbitmap_queue_get(bt);
@@ -1885,7 +1872,6 @@ bool __blk_mq_alloc_driver_tag(struct request *rq)
return false;
rq->tag = tag + tag_offset;
- blk_mq_inc_active_requests(rq->mq_hctx);
return true;
}
@@ -4037,7 +4023,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
goto free_hctx;
- atomic_set(&hctx->nr_active, 0);
if (node == NUMA_NO_NODE)
node = set->numa_node;
hctx->numa_node = node;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index aa15d31aaae9..8dfb67c55f5d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -291,70 +291,9 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
return -1;
}
-static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
- int val)
-{
- if (blk_mq_is_shared_tags(hctx->flags))
- atomic_add(val, &hctx->queue->nr_active_requests_shared_tags);
- else
- atomic_add(val, &hctx->nr_active);
-}
-
-static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
-{
- __blk_mq_add_active_requests(hctx, 1);
-}
-
-static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
- int val)
-{
- if (blk_mq_is_shared_tags(hctx->flags))
- atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
- else
- atomic_sub(val, &hctx->nr_active);
-}
-
-static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
-{
- __blk_mq_sub_active_requests(hctx, 1);
-}
-
-static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
- int val)
-{
- if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
- __blk_mq_add_active_requests(hctx, val);
-}
-
-static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
-{
- if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
- __blk_mq_inc_active_requests(hctx);
-}
-
-static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
- int val)
-{
- if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
- __blk_mq_sub_active_requests(hctx, val);
-}
-
-static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
-{
- if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
- __blk_mq_dec_active_requests(hctx);
-}
-
-static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
-{
- if (blk_mq_is_shared_tags(hctx->flags))
- return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
- return atomic_read(&hctx->nr_active);
-}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- blk_mq_dec_active_requests(hctx);
blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
rq->tag = BLK_MQ_NO_TAG;
}
@@ -396,45 +335,6 @@ static inline void blk_mq_free_requests(struct list_head *list)
}
}
-/*
- * For shared tag users, we track the number of currently active users
- * and attempt to provide a fair share of the tag depth for each of them.
- */
-static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
- struct sbitmap_queue *bt)
-{
- unsigned int depth, users;
-
- if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
- return true;
-
- /*
- * Don't try dividing an ant
- */
- if (bt->sb.depth == 1)
- return true;
-
- if (blk_mq_is_shared_tags(hctx->flags)) {
- struct request_queue *q = hctx->queue;
-
- if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
- return true;
- } else {
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- return true;
- }
-
- users = READ_ONCE(hctx->tags->active_queues);
- if (!users)
- return true;
-
- /*
- * Allow at least some tags
- */
- depth = max((bt->sb.depth + users - 1) / users, 4U);
- return __blk_mq_active_requests(hctx) < depth;
-}
-
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 18a2388ba581..ccbb07559402 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -432,12 +432,6 @@ struct blk_mq_hw_ctx {
/** @queue_num: Index of this hardware queue. */
unsigned int queue_num;
- /**
- * @nr_active: Number of active requests. Only used when a tag set is
- * shared across request queues.
- */
- atomic_t nr_active;
-
/** @cpuhp_online: List to store request if CPU is going to die */
struct hlist_node cpuhp_online;
/** @cpuhp_dead: List to store request if some CPU die. */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d463b9b5a0a5..0dd2a32068ec 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -561,8 +561,6 @@ struct request_queue {
struct timer_list timeout;
struct work_struct timeout_work;
- atomic_t nr_active_requests_shared_tags;
-
struct blk_mq_tags *sched_shared_tags;
struct list_head icq_list;
--
2.43.7
next prev parent reply other threads:[~2026-04-20 11:09 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-20 11:38 [PATCH v2 0/3] scsi/block: NUMA-local scan allocations, shared-tag path cleanup, and SCSI I/O counters Sumit Saxena
2026-04-20 11:38 ` [PATCH v2 1/3] scsi: scan: allocate sdev and starget on the NUMA node of the host adapter Sumit Saxena
2026-04-20 12:10 ` John Garry
2026-04-20 11:38 ` Sumit Saxena [this message]
2026-04-20 11:38 ` [PATCH v2 3/3] scsi: use percpu counters for iorequest_cnt and iodone_cnt Sumit Saxena
2026-04-20 16:41 ` Bart Van Assche
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260420113846.1401374-3-sumit.saxena@broadcom.com \
--to=sumit.saxena@broadcom.com \
--cc=axboe@kernel.dk \
--cc=bvanassche@acm.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-scsi@vger.kernel.org \
--cc=martin.petersen@oracle.com \
--cc=mpi3mr-linuxdrv.pdl@broadcom.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox