* fix circular disk reference in blk-cgroup
@ 2023-02-13 10:41 Christoph Hellwig
2023-02-13 10:41 ` [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data Christoph Hellwig
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo, Josef Bacik
Cc: Ming Lei, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-block-u79uwXL29TY76Z2rM5mHXA
Hi all,
the third patch fixes a problem in having a circular disk reference
for blkgs. The first two patches clean up blk-throttle to avoid
queue->disk references that get torn down in disk_release but could
in theory be used in the pd_free handler.
Diffstat:
block/blk-cgroup.c | 12 +++++++++---
block/blk-throttle.c | 39 ++++++++++++++++-----------------------
include/linux/blkdev.h | 8 +++-----
3 files changed, 28 insertions(+), 31 deletions(-)
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data
2023-02-13 10:41 fix circular disk reference in blk-cgroup Christoph Hellwig
@ 2023-02-13 10:41 ` Christoph Hellwig
[not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
2023-02-13 10:41 ` [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg Christoph Hellwig
2 siblings, 0 replies; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo, Josef Bacik; +Cc: Ming Lei, cgroups, linux-block
We generally need a gendisk for core cgroup helpers, so store that
and derive the queue from it where needed.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-throttle.c | 52 ++++++++++++++++++++------------------------
1 file changed, 23 insertions(+), 29 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e7bd7050d68402..6a8b82939a38ad 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -61,7 +61,7 @@ struct throtl_data
/* service tree for active throtl groups */
struct throtl_service_queue service_queue;
- struct request_queue *queue;
+ struct gendisk *disk;
/* Total Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2];
@@ -223,13 +223,13 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
struct throtl_data *__td = sq_to_td((sq)); \
\
(void)__td; \
- if (likely(!blk_trace_note_message_enabled(__td->queue))) \
+ if (likely(!blk_trace_note_message_enabled(__td->disk->queue))) \
break; \
if ((__tg)) { \
- blk_add_cgroup_trace_msg(__td->queue, \
+ blk_add_cgroup_trace_msg(__td->disk->queue, \
&tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\
} else { \
- blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
+ blk_add_trace_msg(__td->disk->queue, "throtl " fmt, ##args); \
} \
} while (0)
@@ -451,8 +451,7 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
bool low_valid = false;
rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css,
- td->queue->disk->root_blkg) {
+ blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
@@ -1169,19 +1168,19 @@ static void throtl_pending_timer_fn(struct timer_list *t)
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
struct throtl_service_queue *parent_sq;
- struct request_queue *q;
+ struct gendisk *disk;
bool dispatched;
int ret;
/* throtl_data may be gone, so figure out request queue by blkg */
if (tg)
- q = tg->pd.blkg->disk->queue;
+ disk = tg->pd.blkg->disk;
else
- q = td->queue;
+ disk = td->disk;
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&disk->queue->queue_lock);
- if (!q->disk->root_blkg)
+ if (!disk->root_blkg)
goto out_unlock;
if (throtl_can_upgrade(td, NULL))
@@ -1206,9 +1205,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
break;
/* this dispatch windows is still open, relax and repeat */
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&disk->queue->queue_lock);
cpu_relax();
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&disk->queue->queue_lock);
}
if (!dispatched)
@@ -1230,7 +1229,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
out_unlock:
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&disk->queue->queue_lock);
}
/**
@@ -1246,7 +1245,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
struct throtl_data *td = container_of(work, struct throtl_data,
dispatch_work);
struct throtl_service_queue *td_sq = &td->service_queue;
- struct request_queue *q = td->queue;
struct bio_list bio_list_on_stack;
struct bio *bio;
struct blk_plug plug;
@@ -1254,11 +1252,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
bio_list_init(&bio_list_on_stack);
- spin_lock_irq(&q->queue_lock);
+ spin_lock_irq(&td->disk->queue->queue_lock);
for (rw = READ; rw <= WRITE; rw++)
while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
bio_list_add(&bio_list_on_stack, bio);
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&td->disk->queue->queue_lock);
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
@@ -1323,8 +1321,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
* blk-throttle.
*/
blkg_for_each_descendant_pre(blkg, pos_css,
- global ? tg->td->queue->disk->root_blkg :
- tg_to_blkg(tg)) {
+ global ? tg->td->disk->root_blkg : tg_to_blkg(tg)) {
struct throtl_grp *this_tg = blkg_to_tg(blkg);
struct throtl_grp *parent_tg;
@@ -1873,8 +1870,7 @@ static bool throtl_can_upgrade(struct throtl_data *td,
return false;
rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css,
- td->queue->disk->root_blkg) {
+ blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
if (tg == this_tg)
@@ -1920,8 +1916,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
td->low_upgrade_time = jiffies;
td->scale = 0;
rcu_read_lock();
- blkg_for_each_descendant_post(blkg, pos_css,
- td->queue->disk->root_blkg) {
+ blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq = &tg->service_queue;
@@ -2068,7 +2063,7 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
unsigned long last_latency[2] = { 0 };
unsigned long latency[2];
- if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
+ if (!blk_queue_nonrot(td->disk->queue) || !td->limit_valid[LIMIT_LOW])
return;
if (time_before(jiffies, td->last_calculate_time + HZ))
return;
@@ -2288,7 +2283,7 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
if (!td || td->limit_index != LIMIT_LOW ||
!(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
- !blk_queue_nonrot(td->queue))
+ !blk_queue_nonrot(td->disk->queue))
return;
index = request_bucket_index(size);
@@ -2365,11 +2360,10 @@ void blk_throtl_bio_endio(struct bio *bio)
int blk_throtl_init(struct gendisk *disk)
{
- struct request_queue *q = disk->queue;
struct throtl_data *td;
int ret;
- td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, disk->queue->node);
if (!td)
return -ENOMEM;
td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
@@ -2389,8 +2383,8 @@ int blk_throtl_init(struct gendisk *disk)
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
- q->td = td;
- td->queue = q;
+ disk->queue->td = td;
+ td->disk = disk;
td->limit_valid[LIMIT_MAX] = true;
td->limit_index = LIMIT_MAX;
--
2.39.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/3] blk-throttle: move the throtl_data pointer from to struct gendisk
[not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
@ 2023-02-13 10:41 ` Christoph Hellwig
0 siblings, 0 replies; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo, Josef Bacik
Cc: Ming Lei, cgroups-u79uwXL29TY76Z2rM5mHXA,
linux-block-u79uwXL29TY76Z2rM5mHXA
Block throttling is only used for file system I/O, so move the
throtl_data pointer to the gendisk.
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
block/blk-throttle.c | 39 ++++++++++++++++-----------------------
include/linux/blkdev.h | 8 +++-----
2 files changed, 19 insertions(+), 28 deletions(-)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6a8b82939a38ad..8cece10c56515d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -387,7 +387,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
{
struct throtl_grp *tg = pd_to_tg(pd);
struct blkcg_gq *blkg = tg_to_blkg(tg);
- struct throtl_data *td = blkg->disk->queue->td;
+ struct throtl_data *td = blkg->disk->td;
struct throtl_service_queue *sq = &tg->service_queue;
/*
@@ -1685,13 +1685,6 @@ static struct cftype throtl_files[] = {
{ } /* terminate */
};
-static void throtl_shutdown_wq(struct request_queue *q)
-{
- struct throtl_data *td = q->td;
-
- cancel_work_sync(&td->dispatch_work);
-}
-
struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,
@@ -2297,7 +2290,7 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
{
struct request_queue *q = rq->q;
- struct throtl_data *td = q->td;
+ struct throtl_data *td = q->disk->td;
throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
time_ns >> 10);
@@ -2383,7 +2376,7 @@ int blk_throtl_init(struct gendisk *disk)
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
- disk->queue->td = td;
+ disk->td = td;
td->disk = disk;
td->limit_valid[LIMIT_MAX] = true;
@@ -2403,25 +2396,24 @@ int blk_throtl_init(struct gendisk *disk)
void blk_throtl_exit(struct gendisk *disk)
{
- struct request_queue *q = disk->queue;
+ struct throtl_data *td = disk->td;
- if (!q->td)
+ if (!td)
return;
- del_timer_sync(&q->td->service_queue.pending_timer);
- throtl_shutdown_wq(q);
+ del_timer_sync(&td->service_queue.pending_timer);
+ cancel_work_sync(&td->dispatch_work);
blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
- free_percpu(q->td->latency_buckets[READ]);
- free_percpu(q->td->latency_buckets[WRITE]);
- kfree(q->td);
+ free_percpu(td->latency_buckets[READ]);
+ free_percpu(td->latency_buckets[WRITE]);
+ kfree(td);
}
void blk_throtl_register(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- struct throtl_data *td;
+ struct throtl_data *td = disk->td;
int i;
- td = q->td;
BUG_ON(!td);
if (blk_queue_nonrot(q)) {
@@ -2448,9 +2440,10 @@ void blk_throtl_register(struct gendisk *disk)
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
{
- if (!q->td)
+ if (!q->disk->td)
return -EINVAL;
- return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
+ return sprintf(page, "%u\n",
+ jiffies_to_msecs(q->disk->td->throtl_slice));
}
ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@@ -2459,14 +2452,14 @@ ssize_t blk_throtl_sample_time_store(struct request_queue *q,
unsigned long v;
unsigned long t;
- if (!q->td)
+ if (!q->disk->td)
return -EINVAL;
if (kstrtoul(page, 10, &v))
return -EINVAL;
t = msecs_to_jiffies(v);
if (t == 0 || t > MAX_THROTL_SLICE)
return -EINVAL;
- q->td->throtl_slice = t;
+ q->disk->td->throtl_slice = t;
return count;
}
#endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 79aec4ebadb9e0..f07bc82c87f8b3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -169,6 +169,9 @@ struct gendisk {
struct list_head blkg_list;
struct mutex blkcg_mutex;
#endif /* CONFIG_BLK_CGROUP */
+#ifdef CONFIG_BLK_DEV_THROTTLING
+ struct throtl_data *td;
+#endif
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct kobject integrity_kobj;
#endif /* CONFIG_BLK_DEV_INTEGRITY */
@@ -516,11 +519,6 @@ struct request_queue {
spinlock_t unused_hctx_lock;
int mq_freeze_depth;
-
-#ifdef CONFIG_BLK_DEV_THROTTLING
- /* Throttle data */
- struct throtl_data *td;
-#endif
struct rcu_head rcu_head;
wait_queue_head_t mq_freeze_wq;
/*
--
2.39.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg
2023-02-13 10:41 fix circular disk reference in blk-cgroup Christoph Hellwig
2023-02-13 10:41 ` [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data Christoph Hellwig
[not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
@ 2023-02-13 10:41 ` Christoph Hellwig
2023-02-13 12:11 ` Ming Lei
2 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
To: Jens Axboe, Tejun Heo, Josef Bacik; +Cc: Ming Lei, cgroups, linux-block
To avoid a circular reference, do not grab a device model reference
to the gendisk for each blkg, but just the lower level inode reference
preventing the memory from being freed.
This means blkg freeing and pd_free need to be careful to not rely
on anything torn down in disk_release.
Fixes: c43332fe028c ("blk-cgroup: delay calling blkcg_exit_disk until disk_release")
Reported-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
block/blk-cgroup.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 935028912e7abf..9e7e48c8fa47ae 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -136,7 +136,7 @@ static void blkg_free_workfn(struct work_struct *work)
list_del_init(&blkg->entry);
mutex_unlock(&blkg->disk->blkcg_mutex);
- put_disk(blkg->disk);
+ iput(blkg->disk->part0->bd_inode);
free_percpu(blkg->iostat_cpu);
percpu_ref_exit(&blkg->refcnt);
kfree(blkg);
@@ -264,9 +264,15 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
if (!blkg->iostat_cpu)
goto out_exit_refcnt;
+ /*
+ * Grab a reference to the part0 inode, which keeps the memory backing the
+ * gendisk from being released and safe for use in ->pd_free instead of
+ * the full fledged device model reference because the blkgs are only
+ * released in disk_release and would thus create circular references.
+ */
if (test_bit(GD_DEAD, &disk->state))
goto out_free_iostat;
- get_device(disk_to_dev(disk));
+ igrab(disk->part0->bd_inode);
blkg->disk = disk;
INIT_LIST_HEAD(&blkg->entry);
@@ -304,7 +310,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
while (--i >= 0)
if (blkg->pd[i])
blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
- put_disk(blkg->disk);
+ iput(blkg->disk->part0->bd_inode);
out_free_iostat:
free_percpu(blkg->iostat_cpu);
out_exit_refcnt:
--
2.39.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg
2023-02-13 10:41 ` [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg Christoph Hellwig
@ 2023-02-13 12:11 ` Ming Lei
0 siblings, 0 replies; 5+ messages in thread
From: Ming Lei @ 2023-02-13 12:11 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Jens Axboe, Tejun Heo, Josef Bacik, cgroups, linux-block,
ming.lei
On Mon, Feb 13, 2023 at 11:41:34AM +0100, Christoph Hellwig wrote:
> To avoid a circular reference, do not grab a device model reference
> to the gendisk for each blkg, but just the lower level inode reference
> preventing the memory from beeing freed.
It might not be enough to just prevent the gendisk memory from being freed:
any queue reference via disk->queue could become unsafe, given that
disk->queue can be released after disk_release() is called.
thanks,
Ming
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2023-02-13 12:11 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-02-13 10:41 fix circular disk reference in blk-cgroup Christoph Hellwig
2023-02-13 10:41 ` [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data Christoph Hellwig
[not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
2023-02-13 10:41 ` [PATCH 2/3] blk-throttle: move the throtl_data pointer from to struct gendisk Christoph Hellwig
2023-02-13 10:41 ` [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg Christoph Hellwig
2023-02-13 12:11 ` Ming Lei
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox