* [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data
2023-02-13 10:41 fix circular disk reference in blk-cgroup Christoph Hellwig
@ 2023-02-13 10:41 ` Christoph Hellwig
[not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
2023-02-13 10:41 ` [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg Christoph Hellwig
2 siblings, 0 replies; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo, Josef Bacik; +Cc: Ming Lei, cgroups, linux-block
We generally need a gendisk for core cgroup helpers, so store that
and derive the queue from it where needed.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-throttle.c | 52 ++++++++++++++++++++------------------------
1 file changed, 23 insertions(+), 29 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e7bd7050d68402..6a8b82939a38ad 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -61,7 +61,7 @@ struct throtl_data
/* service tree for active throtl groups */
struct throtl_service_queue service_queue;
- struct request_queue *queue;
+ struct gendisk *disk;
/* Total Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2];
@@ -223,13 +223,13 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
struct throtl_data *__td = sq_to_td((sq)); \
\
(void)__td; \
- if (likely(!blk_trace_note_message_enabled(__td->queue))) \
+ if (likely(!blk_trace_note_message_enabled(__td->disk->queue))) \
break; \
if ((__tg)) { \
- blk_add_cgroup_trace_msg(__td->queue, \
+ blk_add_cgroup_trace_msg(__td->disk->queue, \
&tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\
} else { \
- blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
+ blk_add_trace_msg(__td->disk->queue, "throtl " fmt, ##args); \
} \
} while (0)
@@ -451,8 +451,7 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
bool low_valid = false;
rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css,
- td->queue->disk->root_blkg) {
+ blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
@@ -1169,19 +1168,19 @@ static void throtl_pending_timer_fn(struct timer_list *t)
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
struct throtl_service_queue *parent_sq;
- struct request_queue *q;
+ struct gendisk *disk;
bool dispatched;
int ret;
/* throtl_data may be gone, so figure out request queue by blkg */
if (tg)
- q = tg->pd.blkg->disk->queue;
+ disk = tg->pd.blkg->disk;
else
- q = td->queue;
+ disk = td->disk;
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&disk->queue->queue_lock);
- if (!q->disk->root_blkg)
+ if (!disk->root_blkg)
goto out_unlock;
if (throtl_can_upgrade(td, NULL))
@@ -1206,9 +1205,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
break;
/* this dispatch windows is still open, relax and repeat */
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&disk->queue->queue_lock);
cpu_relax();
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&disk->queue->queue_lock);
}
if (!dispatched)
@@ -1230,7 +1229,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
out_unlock:
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&disk->queue->queue_lock);
}
/**
@@ -1246,7 +1245,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
struct throtl_data *td = container_of(work, struct throtl_data,
dispatch_work);
struct throtl_service_queue *td_sq = &td->service_queue;
- struct request_queue *q = td->queue;
struct bio_list bio_list_on_stack;
struct bio *bio;
struct blk_plug plug;
@@ -1254,11 +1252,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
bio_list_init(&bio_list_on_stack);
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->disk->queue->queue_lock);
for (rw = READ; rw <= WRITE; rw++)
while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
bio_list_add(&bio_list_on_stack, bio);
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->disk->queue->queue_lock);
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
@@ -1323,8 +1321,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
* blk-throttle.
*/
blkg_for_each_descendant_pre(blkg, pos_css,
- global ? tg->td->queue->disk->root_blkg :
- tg_to_blkg(tg)) {
+ global ? tg->td->disk->root_blkg : tg_to_blkg(tg)) {
struct throtl_grp *this_tg = blkg_to_tg(blkg);
struct throtl_grp *parent_tg;
@@ -1873,8 +1870,7 @@ static bool throtl_can_upgrade(struct throtl_data *td,
return false;
rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css,
- td->queue->disk->root_blkg) {
+ blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
if (tg == this_tg)
@@ -1920,8 +1916,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
td->low_upgrade_time = jiffies;
td->scale = 0;
rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css,
- td->queue->disk->root_blkg) {
+ blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq = &tg->service_queue;
@@ -2068,7 +2063,7 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
unsigned long last_latency[2] = { 0 };
unsigned long latency[2];
- if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
+ if (!blk_queue_nonrot(td->disk->queue) || !td->limit_valid[LIMIT_LOW])
return;
if (time_before(jiffies, td->last_calculate_time + HZ))
return;
@@ -2288,7 +2283,7 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
if (!td || td->limit_index != LIMIT_LOW ||
!(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
- !blk_queue_nonrot(td->queue))
+ !blk_queue_nonrot(td->disk->queue))
return;
index = request_bucket_index(size);
@@ -2365,11 +2360,10 @@ void blk_throtl_bio_endio(struct bio *bio)
int blk_throtl_init(struct gendisk *disk)
{
- struct request_queue *q = disk->queue;
struct throtl_data *td;
int ret;
- td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, disk->queue->node);
if (!td)
return -ENOMEM;
td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
@@ -2389,8 +2383,8 @@ int blk_throtl_init(struct gendisk *disk)
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
- q->td = td;
- td->queue = q;
+ disk->queue->td = td;
+ td->disk = disk;
td->limit_valid[LIMIT_MAX] = true;
td->limit_index = LIMIT_MAX;
--
2.39.1
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg
2023-02-13 10:41 fix circular disk reference in blk-cgroup Christoph Hellwig
2023-02-13 10:41 ` [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data Christoph Hellwig
[not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
@ 2023-02-13 10:41 ` Christoph Hellwig
2023-02-13 12:11 ` Ming Lei
2 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo, Josef Bacik; +Cc: Ming Lei, cgroups, linux-block
To avoid a circular reference, do not grab a device model reference
to the gendisk for each blkg, but just the lower-level inode reference
that prevents the memory from being freed.
This means blkg freeing and pd_free need to be careful to not rely
on anything torn down in disk_release.
Fixes: c43332fe028c ("blk-cgroup: delay calling blkcg_exit_disk until disk_release")
Reported-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-cgroup.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 935028912e7abf..9e7e48c8fa47ae 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -136,7 +136,7 @@ static void blkg_free_workfn(struct work_struct *work)
list_del_init(&blkg->entry);
mutex_unlock(&blkg->disk->blkcg_mutex);
- put_disk(blkg->disk);
+ iput(blkg->disk->part0->bd_inode);
free_percpu(blkg->iostat_cpu);
percpu_ref_exit(&blkg->refcnt);
kfree(blkg);
@@ -264,9 +264,15 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
if (!blkg->iostat_cpu)
goto out_exit_refcnt;
+ /*
+ * Grab a reference the part0 inode, which keeps the memory backing the
+ * gendisk from beeing released and safe for use in ->pd_free instead of
+ * the full fledged device model reference because the blkgs are only
+ * released in disk_release and would thus create circular references.
+ */
if (test_bit(GD_DEAD, &disk->state))
goto out_free_iostat;
- get_device(disk_to_dev(disk));
+ igrab(disk->part0->bd_inode);
blkg->disk = disk;
INIT_LIST_HEAD(&blkg->entry);
@@ -304,7 +310,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
while (--i >= 0)
if (blkg->pd[i])
blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
- put_disk(blkg->disk);
+ iput(blkg->disk->part0->bd_inode);
out_free_iostat:
free_percpu(blkg->iostat_cpu);
out_exit_refcnt:
--
2.39.1
^ permalink raw reply related [flat|nested] 5+ messages in thread