Linux block layer
 help / color / mirror / Atom feed
From: Yu Kuai <yukuai@kernel.org>
To: Jens Axboe <axboe@kernel.dk>, Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>, Keith Busch <kbusch@kernel.org>,
	Sagi Grimberg <sagi@grimberg.me>,
	Alasdair Kergon <agk@redhat.com>,
	Benjamin Marzinski <bmarzins@redhat.com>,
	Mike Snitzer <snitzer@kernel.org>,
	Mikulas Patocka <mpatocka@redhat.com>,
	Dongsheng Yang <dongsheng.yang@linux.dev>,
	Zheng Gu <cengku@gmail.com>, Coly Li <colyli@fygo.io>,
	Kent Overstreet <kent.overstreet@linux.dev>,
	Josef Bacik <josef@toxicpanda.com>, Yu Kuai <yukuai@fygo.io>,
	Nilay Shroff <nilay@linux.ibm.com>,
	linux-block@vger.kernel.org, cgroups@vger.kernel.org,
	linux-nvme@lists.infradead.org, dm-devel@lists.linux.dev,
	linux-bcache@vger.kernel.org
Subject: [RFC PATCH v1 04/17] blk-throttle: protect throttle state with td lock
Date: Sun,  5 Jul 2026 03:51:11 +0800	[thread overview]
Message-ID: <20260704195124.1375075-5-yukuai@kernel.org> (raw)
In-Reply-To: <20260704195124.1375075-1-yukuai@kernel.org>

From: Yu Kuai <yukuai@fygo.io>

Throttle currently uses queue_lock for both blkcg topology and its own
runtime state. This blocks moving blkg topology protection to blkcg_mutex
cleanly.

Add a throttle-private spinlock and use it for throttle service queues,
pending timers, runtime counters and config updates. Keep queue_lock only
where the current intermediate code still walks blkcg topology.

Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-throttle.c | 87 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 67 insertions(+), 20 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index ffc3b70065d4..7bca2805404f 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -30,6 +30,9 @@ static struct workqueue_struct *kthrotld_workqueue;
 
 struct throtl_data
 {
+	/* protects throttle service queues and group runtime state */
+	spinlock_t lock;
+
 	/* service tree for active throtl groups */
 	struct throtl_service_queue service_queue;
 
@@ -346,11 +349,16 @@ static void tg_update_has_rules(struct throtl_grp *tg)
 static void throtl_pd_online(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
+	struct throtl_data *td = tg->td;
+	unsigned long flags;
+
+	spin_lock_irqsave(&td->lock, flags);
 	/*
 	 * We don't want new groups to escape the limits of its ancestors.
 	 * Update has_rules[] after a new group is brought online.
 	 */
 	tg_update_has_rules(tg);
+	spin_unlock_irqrestore(&td->lock, flags);
 }
 
 static void tg_release(struct rcu_head *rcu)
@@ -368,7 +376,7 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
 
-	timer_delete_sync(&tg->service_queue.pending_timer);
+	timer_shutdown_sync(&tg->service_queue.pending_timer);
 	call_rcu(&pd->rcu_head, tg_release);
 }
 
@@ -1142,9 +1150,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 	else
 		q = td->queue;
 
-	spin_lock_irq(&q->queue_lock);
+	spin_lock_irq(&td->lock);
 
-	if (!q->root_blkg)
+	if (!READ_ONCE(q->root_blkg))
 		goto out_unlock;
 
 again:
@@ -1168,9 +1176,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 			break;
 
 		/* this dispatch windows is still open, relax and repeat */
-		spin_unlock_irq(&q->queue_lock);
+		spin_unlock_irq(&td->lock);
 		cpu_relax();
-		spin_lock_irq(&q->queue_lock);
+		spin_lock_irq(&td->lock);
 	}
 
 	if (!dispatched)
@@ -1193,7 +1201,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 		queue_work(kthrotld_workqueue, &td->dispatch_work);
 	}
 out_unlock:
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock_irq(&td->lock);
 }
 
 /**
@@ -1209,7 +1217,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	struct throtl_data *td = container_of(work, struct throtl_data,
 					      dispatch_work);
 	struct throtl_service_queue *td_sq = &td->service_queue;
-	struct request_queue *q = td->queue;
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
 	struct blk_plug plug;
@@ -1217,11 +1224,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 
 	bio_list_init(&bio_list_on_stack);
 
-	spin_lock_irq(&q->queue_lock);
+	spin_lock_irq(&td->lock);
 	for (rw = READ; rw <= WRITE; rw++)
 		while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
 			bio_list_add(&bio_list_on_stack, bio);
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock_irq(&td->lock);
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
 		blk_start_plug(&plug);
@@ -1299,7 +1306,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
 	rcu_read_unlock();
 
 	/*
-	 * We're already holding queue_lock and know @tg is valid.  Let's
+	 * We're already holding td->lock and know @tg is valid.  Let's
 	 * apply the new config directly.
 	 *
 	 * Restart the slices for both READ and WRITES. It might happen
@@ -1327,6 +1334,7 @@ static int blk_throtl_init(struct gendisk *disk)
 		return -ENOMEM;
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
+	spin_lock_init(&td->lock);
 	throtl_service_queue_init(&td->service_queue);
 
 	memflags = blk_mq_freeze_queue(disk->queue);
@@ -1381,6 +1389,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		v = U64_MAX;
 
 	tg = blkg_to_tg(ctx.blkg);
+	spin_lock_irq(&tg->td->lock);
 	tg_update_carryover(tg);
 
 	if (is_u64)
@@ -1389,6 +1398,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
 
 	tg_conf_updated(tg, false);
+	spin_unlock_irq(&tg->td->lock);
 	ret = 0;
 
 unprep:
@@ -1563,6 +1573,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		goto close_bdev;
 
 	tg = blkg_to_tg(ctx.blkg);
+	spin_lock_irq(&tg->td->lock);
 	tg_update_carryover(tg);
 
 	v[0] = tg->bps[READ];
@@ -1586,11 +1597,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		p = tok;
 		strsep(&p, "=");
 		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
-			goto unprep;
+			goto unlock;
 
 		ret = -ERANGE;
 		if (!val)
-			goto unprep;
+			goto unlock;
 
 		ret = -EINVAL;
 		if (!strcmp(tok, "rbps"))
@@ -1602,7 +1613,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		else if (!strcmp(tok, "wiops"))
 			v[3] = min_t(u64, val, UINT_MAX);
 		else
-			goto unprep;
+			goto unlock;
 	}
 
 	tg->bps[READ] = v[0];
@@ -1611,7 +1622,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	tg->iops[WRITE] = v[3];
 
 	tg_conf_updated(tg, false);
+	spin_unlock_irq(&tg->td->lock);
 	ret = 0;
+	goto unprep;
+unlock:
+	spin_unlock_irq(&tg->td->lock);
 unprep:
 	blkg_conf_unprep(&ctx);
 close_bdev:
@@ -1636,6 +1651,28 @@ static void throtl_shutdown_wq(struct request_queue *q)
 	cancel_work_sync(&td->dispatch_work);
 }
 
+static void throtl_shutdown_timers(struct request_queue *q)
+{
+	struct throtl_data *td = q->td;
+	struct blkcg_gq *blkg;
+
+	/*
+	 * blkg_destroy_all() has already offlined the policy, but blkg policy
+	 * data is freed asynchronously.  Shut down per-group timers before
+	 * freeing td, as their callbacks still dereference tg->td.
+	 */
+	mutex_lock(&q->blkcg_mutex);
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
+		if (tg)
+			timer_shutdown_sync(&tg->service_queue.pending_timer);
+	}
+	mutex_unlock(&q->blkcg_mutex);
+
+	timer_shutdown_sync(&td->service_queue.pending_timer);
+}
+
 static void tg_flush_bios(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1669,7 +1706,13 @@ static void tg_flush_bios(struct throtl_grp *tg)
 
 static void throtl_pd_offline(struct blkg_policy_data *pd)
 {
-	tg_flush_bios(pd_to_tg(pd));
+	struct throtl_grp *tg = pd_to_tg(pd);
+	struct throtl_data *td = tg->td;
+	unsigned long flags;
+
+	spin_lock_irqsave(&td->lock, flags);
+	tg_flush_bios(tg);
+	spin_unlock_irqrestore(&td->lock, flags);
 }
 
 struct blkcg_policy blkcg_policy_throtl = {
@@ -1725,6 +1768,7 @@ static void tg_cancel_writeback_bios(struct throtl_grp *tg,
 void blk_throtl_cancel_bios(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
+	struct throtl_data *td = q->td;
 	struct cgroup_subsys_state *pos_css;
 	struct blkcg_gq *blkg;
 	struct bio_list cancel_bios[2] = { };
@@ -1734,6 +1778,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
 		return;
 
 	spin_lock_irq(&q->queue_lock);
+	spin_lock(&td->lock);
 	/*
 	 * queue_lock is held, rcu lock is not needed here technically.
 	 * However, rcu lock is still held to emphasize that following
@@ -1752,6 +1797,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
 		tg_cancel_writeback_bios(blkg_to_tg(blkg), cancel_bios);
 	}
 	rcu_read_unlock();
+	spin_unlock(&td->lock);
 	spin_unlock_irq(&q->queue_lock);
 
 	for (rw = READ; rw <= WRITE; rw++) {
@@ -1791,7 +1837,6 @@ static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
 
 bool __blk_throtl_bio(struct bio *bio)
 {
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	struct blkcg_gq *blkg = bio->bi_blkg;
 	struct throtl_qnode *qn = NULL;
 	struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1801,7 +1846,7 @@ bool __blk_throtl_bio(struct bio *bio)
 	struct throtl_data *td = tg->td;
 
 	rcu_read_lock();
-	spin_lock_irq(&q->queue_lock);
+	spin_lock_irq(&td->lock);
 	sq = &tg->service_queue;
 
 	while (true) {
@@ -1877,7 +1922,7 @@ bool __blk_throtl_bio(struct bio *bio)
 	}
 
 out_unlock:
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock_irq(&td->lock);
 
 	rcu_read_unlock();
 	return throttled;
@@ -1886,17 +1931,19 @@ bool __blk_throtl_bio(struct bio *bio)
 void blk_throtl_exit(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
+	struct throtl_data *td = q->td;
 
 	/*
 	 * blkg_destroy_all() already deactivate throtl policy, just check and
 	 * free throtl data.
 	 */
-	if (!q->td)
+	if (!td)
 		return;
 
-	timer_delete_sync(&q->td->service_queue.pending_timer);
+	throtl_shutdown_timers(q);
 	throtl_shutdown_wq(q);
-	kfree(q->td);
+	q->td = NULL;
+	kfree(td);
 }
 
 static int __init throtl_init(void)
-- 
2.51.0


  parent reply	other threads:[~2026-07-04 19:52 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-07-04 19:51 [RFC PATCH v1 00/17] blk-cgroup: protect blkgs with blkcg_mutex Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 01/17] nvme-multipath: retarget failedover bios from requeue work Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 02/17] dm thin: avoid bio_set_dev under pool lock Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 03/17] dm snapshot: avoid bio_set_dev in locked map paths Yu Kuai
2026-07-04 19:51 ` Yu Kuai [this message]
2026-07-04 19:51 ` [RFC PATCH v1 05/17] block: add bio_alloc_atomic() for atomic bio users Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 06/17] blk-cgroup: support non-blocking bio association Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 07/17] block: support non-blocking bio allocation with a bdev Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 08/17] bcache: avoid sleeping blkg association from locked paths Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 09/17] dm bufio: avoid blkg association from GFP_NOWAIT bio init Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 10/17] dm pcache: handle non-blocking bio clone init failure Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 11/17] block: avoid scheduling from non-blocking helper allocations Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 12/17] dm: avoid sleeping blkg association from NOWAIT remaps Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 13/17] bfq: avoid blkg lookup from locked cgroup update Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 14/17] blk-cgroup: protect blkgs with blkcg_mutex Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 15/17] blk-cgroup: remove blkg radix tree preloading Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 16/17] blk-cgroup: allocate blkgs in blkg_create Yu Kuai
2026-07-04 19:51 ` [RFC PATCH v1 17/17] blk-cgroup: share blkg creation between lookup and config prep Yu Kuai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260704195124.1375075-5-yukuai@kernel.org \
    --to=yukuai@kernel.org \
    --cc=agk@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=bmarzins@redhat.com \
    --cc=cengku@gmail.com \
    --cc=cgroups@vger.kernel.org \
    --cc=colyli@fygo.io \
    --cc=dm-devel@lists.linux.dev \
    --cc=dongsheng.yang@linux.dev \
    --cc=hch@lst.de \
    --cc=josef@toxicpanda.com \
    --cc=kbusch@kernel.org \
    --cc=kent.overstreet@linux.dev \
    --cc=linux-bcache@vger.kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=mpatocka@redhat.com \
    --cc=nilay@linux.ibm.com \
    --cc=sagi@grimberg.me \
    --cc=snitzer@kernel.org \
    --cc=tj@kernel.org \
    --cc=yukuai@fygo.io \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox