From: Yu Kuai <yukuai@kernel.org>
To: Jens Axboe <axboe@kernel.dk>, Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>, Keith Busch <kbusch@kernel.org>,
Sagi Grimberg <sagi@grimberg.me>,
Alasdair Kergon <agk@redhat.com>,
Benjamin Marzinski <bmarzins@redhat.com>,
Mike Snitzer <snitzer@kernel.org>,
Mikulas Patocka <mpatocka@redhat.com>,
Dongsheng Yang <dongsheng.yang@linux.dev>,
Zheng Gu <cengku@gmail.com>, Coly Li <colyli@fygo.io>,
Kent Overstreet <kent.overstreet@linux.dev>,
Josef Bacik <josef@toxicpanda.com>, Yu Kuai <yukuai@fygo.io>,
Nilay Shroff <nilay@linux.ibm.com>,
linux-block@vger.kernel.org, cgroups@vger.kernel.org,
linux-nvme@lists.infradead.org, dm-devel@lists.linux.dev,
linux-bcache@vger.kernel.org
Subject: [RFC PATCH v1 04/17] blk-throttle: protect throttle state with td lock
Date: Sun, 5 Jul 2026 03:51:11 +0800 [thread overview]
Message-ID: <20260704195124.1375075-5-yukuai@kernel.org> (raw)
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>
From: Yu Kuai <yukuai@fygo.io>
Throttle currently uses queue_lock for both blkcg topology and its own
runtime state. This blocks moving blkg topology protection to blkcg_mutex
cleanly.
Add a throttle-private spinlock and use it for throttle service queues,
pending timers, runtime counters and config updates. Keep queue_lock only
where the current intermediate code still walks blkcg topology.
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
block/blk-throttle.c | 87 ++++++++++++++++++++++++++++++++++----------
1 file changed, 67 insertions(+), 20 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ffc3b70065d4..7bca2805404f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -30,6 +30,9 @@ static struct workqueue_struct *kthrotld_workqueue;
struct throtl_data
{
+ /* protects throttle service queues and group runtime state */
+ spinlock_t lock;
+
/* service tree for active throtl groups */
struct throtl_service_queue service_queue;
@@ -346,11 +349,16 @@ static void tg_update_has_rules(struct throtl_grp *tg)
static void throtl_pd_online(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
+ struct throtl_data *td = tg->td;
+ unsigned long flags;
+
+ spin_lock_irqsave(&td->lock, flags);
/*
* We don't want new groups to escape the limits of its ancestors.
* Update has_rules[] after a new group is brought online.
*/
tg_update_has_rules(tg);
+ spin_unlock_irqrestore(&td->lock, flags);
}
static void tg_release(struct rcu_head *rcu)
@@ -368,7 +376,7 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
- timer_delete_sync(&tg->service_queue.pending_timer);
+ timer_shutdown_sync(&tg->service_queue.pending_timer);
call_rcu(&pd->rcu_head, tg_release);
}
@@ -1142,9 +1150,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
else
q = td->queue;
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->lock);
- if (!q->root_blkg)
+ if (!READ_ONCE(q->root_blkg))
goto out_unlock;
again:
@@ -1168,9 +1176,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
break;
/* this dispatch windows is still open, relax and repeat */
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->lock);
cpu_relax();
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->lock);
}
if (!dispatched)
@@ -1193,7 +1201,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
out_unlock:
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->lock);
}
/**
@@ -1209,7 +1217,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
struct throtl_data *td = container_of(work, struct throtl_data,
dispatch_work);
struct throtl_service_queue *td_sq = &td->service_queue;
- struct request_queue *q = td->queue;
struct bio_list bio_list_on_stack;
struct bio *bio;
struct blk_plug plug;
@@ -1217,11 +1224,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
bio_list_init(&bio_list_on_stack);
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->lock);
for (rw = READ; rw <= WRITE; rw++)
while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
bio_list_add(&bio_list_on_stack, bio);
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->lock);
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
@@ -1299,7 +1306,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
rcu_read_unlock();
/*
- * We're already holding queue_lock and know @tg is valid. Let's
+ * We're already holding td->lock and know @tg is valid. Let's
* apply the new config directly.
*
* Restart the slices for both READ and WRITES. It might happen
@@ -1327,6 +1334,7 @@ static int blk_throtl_init(struct gendisk *disk)
return -ENOMEM;
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+ spin_lock_init(&td->lock);
throtl_service_queue_init(&td->service_queue);
memflags = blk_mq_freeze_queue(disk->queue);
@@ -1381,6 +1389,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
v = U64_MAX;
tg = blkg_to_tg(ctx.blkg);
+ spin_lock_irq(&tg->td->lock);
tg_update_carryover(tg);
if (is_u64)
@@ -1389,6 +1398,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
tg_conf_updated(tg, false);
+ spin_unlock_irq(&tg->td->lock);
ret = 0;
unprep:
@@ -1563,6 +1573,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
goto close_bdev;
tg = blkg_to_tg(ctx.blkg);
+ spin_lock_irq(&tg->td->lock);
tg_update_carryover(tg);
v[0] = tg->bps[READ];
@@ -1586,11 +1597,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
p = tok;
strsep(&p, "=");
if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
- goto unprep;
+ goto unlock;
ret = -ERANGE;
if (!val)
- goto unprep;
+ goto unlock;
ret = -EINVAL;
if (!strcmp(tok, "rbps"))
@@ -1602,7 +1613,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
else
- goto unprep;
+ goto unlock;
}
tg->bps[READ] = v[0];
@@ -1611,7 +1622,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg->iops[WRITE] = v[3];
tg_conf_updated(tg, false);
+ spin_unlock_irq(&tg->td->lock);
ret = 0;
+ goto unprep;
+unlock:
+ spin_unlock_irq(&tg->td->lock);
unprep:
blkg_conf_unprep(&ctx);
close_bdev:
@@ -1636,6 +1651,28 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work);
}
+static void throtl_shutdown_timers(struct request_queue *q)
+{
+ struct throtl_data *td = q->td;
+ struct blkcg_gq *blkg;
+
+ /*
+ * blkg_destroy_all() has already offlined the policy, but blkg policy
+ * data is freed asynchronously. Shut down per-group timers before
+ * freeing td, as their callbacks still dereference tg->td.
+ */
+ mutex_lock(&q->blkcg_mutex);
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg)
+ timer_shutdown_sync(&tg->service_queue.pending_timer);
+ }
+ mutex_unlock(&q->blkcg_mutex);
+
+ timer_shutdown_sync(&td->service_queue.pending_timer);
+}
+
static void tg_flush_bios(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
@@ -1669,7 +1706,13 @@ static void tg_flush_bios(struct throtl_grp *tg)
static void throtl_pd_offline(struct blkg_policy_data *pd)
{
- tg_flush_bios(pd_to_tg(pd));
+ struct throtl_grp *tg = pd_to_tg(pd);
+ struct throtl_data *td = tg->td;
+ unsigned long flags;
+
+ spin_lock_irqsave(&td->lock, flags);
+ tg_flush_bios(tg);
+ spin_unlock_irqrestore(&td->lock, flags);
}
struct blkcg_policy blkcg_policy_throtl = {
@@ -1725,6 +1768,7 @@ static void tg_cancel_writeback_bios(struct throtl_grp *tg,
void blk_throtl_cancel_bios(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
+ struct throtl_data *td = q->td;
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
struct bio_list cancel_bios[2] = { };
@@ -1734,6 +1778,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
return;
spin_lock_irq(&q->queue_lock);
+ spin_lock(&td->lock);
/*
* queue_lock is held, rcu lock is not needed here technically.
* However, rcu lock is still held to emphasize that following
@@ -1752,6 +1797,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
tg_cancel_writeback_bios(blkg_to_tg(blkg), cancel_bios);
}
rcu_read_unlock();
+ spin_unlock(&td->lock);
spin_unlock_irq(&q->queue_lock);
for (rw = READ; rw <= WRITE; rw++) {
@@ -1791,7 +1837,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
bool __blk_throtl_bio(struct bio *bio)
{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1801,7 +1846,7 @@ bool __blk_throtl_bio(struct bio *bio)
struct throtl_data *td = tg->td;
rcu_read_lock();
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->lock);
sq = &tg->service_queue;
while (true) {
@@ -1877,7 +1922,7 @@ bool __blk_throtl_bio(struct bio *bio)
}
out_unlock:
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->lock);
rcu_read_unlock();
return throttled;
@@ -1886,17 +1931,19 @@ bool __blk_throtl_bio(struct bio *bio)
void blk_throtl_exit(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
+ struct throtl_data *td = q->td;
/*
* blkg_destroy_all() already deactivate throtl policy, just check and
* free throtl data.
*/
- if (!q->td)
+ if (!td)
return;
- timer_delete_sync(&q->td->service_queue.pending_timer);
+ throtl_shutdown_timers(q);
throtl_shutdown_wq(q);
- kfree(q->td);
+ q->td = NULL;
+ kfree(td);
}
static int __init throtl_init(void)
--
2.51.0
next prev parent reply other threads:[~2026-07-04 19:52 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-07-04 19:51 [RFC PATCH v1 00/17] blk-cgroup: protect blkgs with blkcg_mutex Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 01/17] nvme-multipath: retarget failedover bios from requeue work Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 02/17] dm thin: avoid bio_set_dev under pool lock Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 03/17] dm snapshot: avoid bio_set_dev in locked map paths Yu Kuai
2026-07-04 19:51 ` Yu Kuai [this message]
2026-07-04 19:51 ` [RFC PATCH v1 05/17] block: add bio_alloc_atomic() for atomic bio users Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 06/17] blk-cgroup: support non-blocking bio association Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 07/17] block: support non-blocking bio allocation with a bdev Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 08/17] bcache: avoid sleeping blkg association from locked paths Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 09/17] dm bufio: avoid blkg association from GFP_NOWAIT bio init Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 10/17] dm pcache: handle non-blocking bio clone init failure Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 11/17] block: avoid scheduling from non-blocking helper allocations Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 12/17] dm: avoid sleeping blkg association from NOWAIT remaps Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 13/17] bfq: avoid blkg lookup from locked cgroup update Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 14/17] blk-cgroup: protect blkgs with blkcg_mutex Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 15/17] blk-cgroup: remove blkg radix tree preloading Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 16/17] blk-cgroup: allocate blkgs in blkg_create Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 17/17] blk-cgroup: share blkg creation between lookup and config prep Yu Kuai
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260704195124.1375075-5-yukuai@kernel.org \
--to=yukuai@kernel.org \
--cc=agk@redhat.com \
--cc=axboe@kernel.dk \
--cc=bmarzins@redhat.com \
--cc=cengku@gmail.com \
--cc=cgroups@vger.kernel.org \
--cc=colyli@fygo.io \
--cc=dm-devel@lists.linux.dev \
--cc=dongsheng.yang@linux.dev \
--cc=hch@lst.de \
--cc=josef@toxicpanda.com \
--cc=kbusch@kernel.org \
--cc=kent.overstreet@linux.dev \
--cc=linux-bcache@vger.kernel.org \
--cc=linux-block@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=mpatocka@redhat.com \
--cc=nilay@linux.ibm.com \
--cc=sagi@grimberg.me \
--cc=snitzer@kernel.org \
--cc=tj@kernel.org \
--cc=yukuai@fygo.io \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox