From: Yu Kuai <yukuai@fnnas.com>
To: Tejun Heo <tj@kernel.org>, Josef Bacik <josef@toxicpanda.com>,
Jens Axboe <axboe@kernel.dk>
Cc: cgroups@vger.kernel.org, linux-block@vger.kernel.org,
linux-kernel@vger.kernel.org,
Zheng Qixing <zhengqixing@huawei.com>,
Ming Lei <ming.lei@redhat.com>,
Nilay Shroff <nilay@linux.ibm.com>
Subject: [PATCH v3 6/7] blk-cgroup: allocate pds before freezing queue in blkcg_activate_policy()
Date: Wed, 4 Mar 2026 15:38:07 +0800 [thread overview]
Message-ID: <20260304073809.3438679-7-yukuai@fnnas.com> (raw)
In-Reply-To: <20260304073809.3438679-1-yukuai@fnnas.com>
Some policies, such as iocost and iolatency, perform percpu allocation
in pd_alloc_fn(). Allocating percpu memory while the queue is frozen can
deadlock, because percpu memory reclaim may need to issue IO.
Now that q->blkg_list is protected by blkcg_mutex, restructure
blkcg_activate_policy() to allocate all pds before freezing the queue:
1. Allocate all pds with GFP_KERNEL before freezing the queue
2. Freeze the queue
3. Initialize and online all pds
Note: as future work, the queue freezing around blkcg_activate_policy()
should be removed entirely to fix these deadlocks thoroughly.
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
block/blk-cgroup.c | 95 +++++++++++++++++-----------------------------
1 file changed, 35 insertions(+), 60 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0206050f81ea..1620be75f124 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1606,8 +1606,7 @@ static void blkcg_policy_teardown_pds(struct request_queue *q,
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
struct request_queue *q = disk->queue;
- struct blkg_policy_data *pd_prealloc = NULL;
- struct blkcg_gq *blkg, *pinned_blkg = NULL;
+ struct blkcg_gq *blkg;
unsigned int memflags;
int ret;
@@ -1622,90 +1621,65 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
return -EINVAL;
- if (queue_is_mq(q))
- memflags = blk_mq_freeze_queue(q);
-
+ /*
+ * Allocate all pds before freezing queue. Some policies like iocost
+ * and iolatency do percpu allocation in pd_alloc_fn(), which can
+ * deadlock with queue frozen because percpu memory reclaim may issue
+ * IO. blkcg_mutex protects q->blkg_list iteration.
+ */
mutex_lock(&q->blkcg_mutex);
-retry:
- spin_lock_irq(&q->queue_lock);
-
- /* blkg_list is pushed at the head, reverse walk to initialize parents first */
list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
struct blkg_policy_data *pd;
- if (blkg->pd[pol->plid])
- continue;
+ /* Skip dying blkg */
if (hlist_unhashed(&blkg->blkcg_node))
continue;
- /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
- if (blkg == pinned_blkg) {
- pd = pd_prealloc;
- pd_prealloc = NULL;
- } else {
- pd = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_NOWAIT);
- }
-
+ pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL);
if (!pd) {
- /*
- * GFP_NOWAIT failed. Free the existing one and
- * prealloc for @blkg w/ GFP_KERNEL.
- */
- if (pinned_blkg)
- blkg_put(pinned_blkg);
- blkg_get(blkg);
- pinned_blkg = blkg;
-
- spin_unlock_irq(&q->queue_lock);
-
- if (pd_prealloc)
- pol->pd_free_fn(pd_prealloc);
- pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
- GFP_KERNEL);
- if (pd_prealloc)
- goto retry;
- else
- goto enomem;
+ ret = -ENOMEM;
+ goto err_teardown;
}
- spin_lock(&blkg->blkcg->lock);
-
pd->blkg = blkg;
pd->plid = pol->plid;
+ pd->online = false;
blkg->pd[pol->plid] = pd;
+ }
+
+ /* Now freeze queue and initialize/online all pds */
+ if (queue_is_mq(q))
+ memflags = blk_mq_freeze_queue(q);
+ spin_lock_irq(&q->queue_lock);
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
+ struct blkg_policy_data *pd = blkg->pd[pol->plid];
+
+ /* Skip dying blkg */
+ if (hlist_unhashed(&blkg->blkcg_node))
+ continue;
+
+ spin_lock(&blkg->blkcg->lock);
if (pol->pd_init_fn)
pol->pd_init_fn(pd);
-
if (pol->pd_online_fn)
pol->pd_online_fn(pd);
pd->online = true;
-
spin_unlock(&blkg->blkcg->lock);
}
__set_bit(pol->plid, q->blkcg_pols);
- ret = 0;
-
spin_unlock_irq(&q->queue_lock);
-out:
- mutex_unlock(&q->blkcg_mutex);
+
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q, memflags);
- if (pinned_blkg)
- blkg_put(pinned_blkg);
- if (pd_prealloc)
- pol->pd_free_fn(pd_prealloc);
- return ret;
+ mutex_unlock(&q->blkcg_mutex);
+ return 0;
-enomem:
- /* alloc failed, take down everything */
- spin_lock_irq(&q->queue_lock);
+err_teardown:
blkcg_policy_teardown_pds(q, pol);
- spin_unlock_irq(&q->queue_lock);
- ret = -ENOMEM;
- goto out;
+ mutex_unlock(&q->blkcg_mutex);
+ return ret;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1726,18 +1700,19 @@ void blkcg_deactivate_policy(struct gendisk *disk,
if (!blkcg_policy_enabled(q, pol))
return;
+ /* Same locking order as blkcg_activate_policy(): mutex -> freeze */
+ mutex_lock(&q->blkcg_mutex);
if (queue_is_mq(q))
memflags = blk_mq_freeze_queue(q);
- mutex_lock(&q->blkcg_mutex);
spin_lock_irq(&q->queue_lock);
__clear_bit(pol->plid, q->blkcg_pols);
blkcg_policy_teardown_pds(q, pol);
spin_unlock_irq(&q->queue_lock);
- mutex_unlock(&q->blkcg_mutex);
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q, memflags);
+ mutex_unlock(&q->blkcg_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
--
2.51.0
next prev parent reply other threads:[~2026-03-04 7:38 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-04 7:38 [PATCH v3 0/7] blk-cgroup: fix races related to blkg_list iteration Yu Kuai
2026-03-04 7:38 ` [PATCH v3 1/7] blk-cgroup: protect q->blkg_list iteration in blkg_destroy_all() with blkcg_mutex Yu Kuai
2026-03-04 7:38 ` [PATCH v3 2/7] bfq: protect q->blkg_list iteration in bfq_end_wr_async() " Yu Kuai
2026-03-04 7:38 ` [PATCH v3 3/7] blk-cgroup: fix race between policy activation and blkg destruction Yu Kuai
2026-03-04 7:38 ` [PATCH v3 4/7] blk-cgroup: skip dying blkg in blkcg_activate_policy() Yu Kuai
2026-03-04 7:38 ` [PATCH v3 5/7] blk-cgroup: factor policy pd teardown loop into helper Yu Kuai
2026-03-04 7:38 ` Yu Kuai [this message]
2026-03-04 7:38 ` [PATCH v3 7/7] blk-rq-qos: move rq_qos_mutex acquisition inside rq_qos_add/del Yu Kuai
2026-03-23 8:11 ` [PATCH v3 0/7] blk-cgroup: fix races related to blkg_list iteration Yu Kuai
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260304073809.3438679-7-yukuai@fnnas.com \
--to=yukuai@fnnas.com \
--cc=axboe@kernel.dk \
--cc=cgroups@vger.kernel.org \
--cc=josef@toxicpanda.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=ming.lei@redhat.com \
--cc=nilay@linux.ibm.com \
--cc=tj@kernel.org \
--cc=zhengqixing@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox