public inbox for cgroups@vger.kernel.org
 help / color / mirror / Atom feed
* fix circular disk reference in blk-cgroup
@ 2023-02-13 10:41 Christoph Hellwig
  2023-02-13 10:41 ` [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data Christoph Hellwig
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo, Josef Bacik
  Cc: Ming Lei, cgroups-u79uwXL29TY76Z2rM5mHXA,
	linux-block-u79uwXL29TY76Z2rM5mHXA

Hi all,

the third patch fixes a problem in having a circular disk reference
for blkgs.  The first two patches clean up blk-throttle to avoid
queue->disk references that get torn down in disk_release but could
in theory be used in the pd_free handler.

Diffstat:
 block/blk-cgroup.c     |   12 +++++++++---
 block/blk-throttle.c   |   39 ++++++++++++++++-----------------------
 include/linux/blkdev.h |    8 +++-----
 3 files changed, 28 insertions(+), 31 deletions(-)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data
  2023-02-13 10:41 fix circular disk reference in blk-cgroup Christoph Hellwig
@ 2023-02-13 10:41 ` Christoph Hellwig
       [not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
  2023-02-13 10:41 ` [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg Christoph Hellwig
  2 siblings, 0 replies; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo, Josef Bacik; +Cc: Ming Lei, cgroups, linux-block

We generally need a gendisk for core cgroup helpers, so store that
and derive the queue from it where needed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-throttle.c | 52 ++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index e7bd7050d68402..6a8b82939a38ad 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -61,7 +61,7 @@ struct throtl_data
 	/* service tree for active throtl groups */
 	struct throtl_service_queue service_queue;
 
-	struct request_queue *queue;
+	struct gendisk *disk;
 
 	/* Total Number of queued bios on READ and WRITE lists */
 	unsigned int nr_queued[2];
@@ -223,13 +223,13 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 	struct throtl_data *__td = sq_to_td((sq));			\
 									\
 	(void)__td;							\
-	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
+	if (likely(!blk_trace_note_message_enabled(__td->disk->queue)))	\
 		break;							\
 	if ((__tg)) {							\
-		blk_add_cgroup_trace_msg(__td->queue,			\
+		blk_add_cgroup_trace_msg(__td->disk->queue,			\
 			&tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\
 	} else {							\
-		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
+		blk_add_trace_msg(__td->disk->queue, "throtl " fmt, ##args);	\
 	}								\
 } while (0)
 
@@ -451,8 +451,7 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
 	bool low_valid = false;
 
 	rcu_read_lock();
-	blkg_for_each_descendant_post(blkg, pos_css,
-			td->queue->disk->root_blkg) {
+	blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
 
 		if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
@@ -1169,19 +1168,19 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 	struct throtl_grp *tg = sq_to_tg(sq);
 	struct throtl_data *td = sq_to_td(sq);
 	struct throtl_service_queue *parent_sq;
-	struct request_queue *q;
+	struct gendisk *disk;
 	bool dispatched;
 	int ret;
 
 	/* throtl_data may be gone, so figure out request queue by blkg */
 	if (tg)
-		q = tg->pd.blkg->disk->queue;
+		disk = tg->pd.blkg->disk;
 	else
-		q = td->queue;
+		disk = td->disk;
 
-	spin_lock_irq(&q->queue_lock);
+	spin_lock_irq(&disk->queue->queue_lock);
 
-	if (!q->disk->root_blkg)
+	if (!disk->root_blkg)
 		goto out_unlock;
 
 	if (throtl_can_upgrade(td, NULL))
@@ -1206,9 +1205,9 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 			break;
 
 		/* this dispatch windows is still open, relax and repeat */
-		spin_unlock_irq(&q->queue_lock);
+		spin_unlock_irq(&disk->queue->queue_lock);
 		cpu_relax();
-		spin_lock_irq(&q->queue_lock);
+		spin_lock_irq(&disk->queue->queue_lock);
 	}
 
 	if (!dispatched)
@@ -1230,7 +1229,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 		queue_work(kthrotld_workqueue, &td->dispatch_work);
 	}
 out_unlock:
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock_irq(&disk->queue->queue_lock);
 }
 
 /**
@@ -1246,7 +1245,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 	struct throtl_data *td = container_of(work, struct throtl_data,
 					      dispatch_work);
 	struct throtl_service_queue *td_sq = &td->service_queue;
-	struct request_queue *q = td->queue;
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
 	struct blk_plug plug;
@@ -1254,11 +1252,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 
 	bio_list_init(&bio_list_on_stack);
 
-	spin_lock_irq(&q->queue_lock);
+	spin_lock_irq(&td->disk->queue->queue_lock);
 	for (rw = READ; rw <= WRITE; rw++)
 		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
 			bio_list_add(&bio_list_on_stack, bio);
-	spin_unlock_irq(&q->queue_lock);
+	spin_unlock_irq(&td->disk->queue->queue_lock);
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
 		blk_start_plug(&plug);
@@ -1323,8 +1321,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
 	 * blk-throttle.
 	 */
 	blkg_for_each_descendant_pre(blkg, pos_css,
-			global ? tg->td->queue->disk->root_blkg :
-			tg_to_blkg(tg)) {
+			global ? tg->td->disk->root_blkg : tg_to_blkg(tg)) {
 		struct throtl_grp *this_tg = blkg_to_tg(blkg);
 		struct throtl_grp *parent_tg;
 
@@ -1873,8 +1870,7 @@ static bool throtl_can_upgrade(struct throtl_data *td,
 		return false;
 
 	rcu_read_lock();
-	blkg_for_each_descendant_post(blkg, pos_css,
-			td->queue->disk->root_blkg) {
+	blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
 
 		if (tg == this_tg)
@@ -1920,8 +1916,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	td->low_upgrade_time = jiffies;
 	td->scale = 0;
 	rcu_read_lock();
-	blkg_for_each_descendant_post(blkg, pos_css,
-			td->queue->disk->root_blkg) {
+	blkg_for_each_descendant_post(blkg, pos_css, td->disk->root_blkg) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
 		struct throtl_service_queue *sq = &tg->service_queue;
 
@@ -2068,7 +2063,7 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
 	unsigned long last_latency[2] = { 0 };
 	unsigned long latency[2];
 
-	if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
+	if (!blk_queue_nonrot(td->disk->queue) || !td->limit_valid[LIMIT_LOW])
 		return;
 	if (time_before(jiffies, td->last_calculate_time + HZ))
 		return;
@@ -2288,7 +2283,7 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
 
 	if (!td || td->limit_index != LIMIT_LOW ||
 	    !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
-	    !blk_queue_nonrot(td->queue))
+	    !blk_queue_nonrot(td->disk->queue))
 		return;
 
 	index = request_bucket_index(size);
@@ -2365,11 +2360,10 @@ void blk_throtl_bio_endio(struct bio *bio)
 
 int blk_throtl_init(struct gendisk *disk)
 {
-	struct request_queue *q = disk->queue;
 	struct throtl_data *td;
 	int ret;
 
-	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
+	td = kzalloc_node(sizeof(*td), GFP_KERNEL, disk->queue->node);
 	if (!td)
 		return -ENOMEM;
 	td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
@@ -2389,8 +2383,8 @@ int blk_throtl_init(struct gendisk *disk)
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
 
-	q->td = td;
-	td->queue = q;
+	disk->queue->td = td;
+	td->disk = disk;
 
 	td->limit_valid[LIMIT_MAX] = true;
 	td->limit_index = LIMIT_MAX;
-- 
2.39.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/3] blk-throttle: move the throtl_data pointer from to struct gendisk
       [not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
@ 2023-02-13 10:41   ` Christoph Hellwig
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo, Josef Bacik
  Cc: Ming Lei, cgroups-u79uwXL29TY76Z2rM5mHXA,
	linux-block-u79uwXL29TY76Z2rM5mHXA

Block throttling is only used for file system I/O, so move the
throtl_data pointer to the gendisk.

Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
---
 block/blk-throttle.c   | 39 ++++++++++++++++-----------------------
 include/linux/blkdev.h |  8 +++-----
 2 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6a8b82939a38ad..8cece10c56515d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -387,7 +387,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 {
 	struct throtl_grp *tg = pd_to_tg(pd);
 	struct blkcg_gq *blkg = tg_to_blkg(tg);
-	struct throtl_data *td = blkg->disk->queue->td;
+	struct throtl_data *td = blkg->disk->td;
 	struct throtl_service_queue *sq = &tg->service_queue;
 
 	/*
@@ -1685,13 +1685,6 @@ static struct cftype throtl_files[] = {
 	{ }	/* terminate */
 };
 
-static void throtl_shutdown_wq(struct request_queue *q)
-{
-	struct throtl_data *td = q->td;
-
-	cancel_work_sync(&td->dispatch_work);
-}
-
 struct blkcg_policy blkcg_policy_throtl = {
 	.dfl_cftypes		= throtl_files,
 	.legacy_cftypes		= throtl_legacy_files,
@@ -2297,7 +2290,7 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
 void blk_throtl_stat_add(struct request *rq, u64 time_ns)
 {
 	struct request_queue *q = rq->q;
-	struct throtl_data *td = q->td;
+	struct throtl_data *td = q->disk->td;
 
 	throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
 			     time_ns >> 10);
@@ -2383,7 +2376,7 @@ int blk_throtl_init(struct gendisk *disk)
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
 
-	disk->queue->td = td;
+	disk->td = td;
 	td->disk = disk;
 
 	td->limit_valid[LIMIT_MAX] = true;
@@ -2403,25 +2396,24 @@ int blk_throtl_init(struct gendisk *disk)
 
 void blk_throtl_exit(struct gendisk *disk)
 {
-	struct request_queue *q = disk->queue;
+	struct throtl_data *td = disk->td;
 
-	if (!q->td)
+	if (!td)
 		return;
-	del_timer_sync(&q->td->service_queue.pending_timer);
-	throtl_shutdown_wq(q);
+	del_timer_sync(&td->service_queue.pending_timer);
+	cancel_work_sync(&td->dispatch_work);
 	blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
-	free_percpu(q->td->latency_buckets[READ]);
-	free_percpu(q->td->latency_buckets[WRITE]);
-	kfree(q->td);
+	free_percpu(td->latency_buckets[READ]);
+	free_percpu(td->latency_buckets[WRITE]);
+	kfree(td);
 }
 
 void blk_throtl_register(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
-	struct throtl_data *td;
+	struct throtl_data *td = disk->td;
 	int i;
 
-	td = q->td;
 	BUG_ON(!td);
 
 	if (blk_queue_nonrot(q)) {
@@ -2448,9 +2440,10 @@ void blk_throtl_register(struct gendisk *disk)
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
 {
-	if (!q->td)
+	if (!q->disk->td)
 		return -EINVAL;
-	return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
+	return sprintf(page, "%u\n",
+		       jiffies_to_msecs(q->disk->td->throtl_slice));
 }
 
 ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@@ -2459,14 +2452,14 @@ ssize_t blk_throtl_sample_time_store(struct request_queue *q,
 	unsigned long v;
 	unsigned long t;
 
-	if (!q->td)
+	if (!q->disk->td)
 		return -EINVAL;
 	if (kstrtoul(page, 10, &v))
 		return -EINVAL;
 	t = msecs_to_jiffies(v);
 	if (t == 0 || t > MAX_THROTL_SLICE)
 		return -EINVAL;
-	q->td->throtl_slice = t;
+	q->disk->td->throtl_slice = t;
 	return count;
 }
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 79aec4ebadb9e0..f07bc82c87f8b3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -169,6 +169,9 @@ struct gendisk {
 	struct list_head	blkg_list;
 	struct mutex		blkcg_mutex;
 #endif /* CONFIG_BLK_CGROUP */
+#ifdef CONFIG_BLK_DEV_THROTTLING
+	struct throtl_data	*td;
+#endif
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct kobject integrity_kobj;
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
@@ -516,11 +519,6 @@ struct request_queue {
 	spinlock_t		unused_hctx_lock;
 
 	int			mq_freeze_depth;
-
-#ifdef CONFIG_BLK_DEV_THROTTLING
-	/* Throttle data */
-	struct throtl_data *td;
-#endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
 	/*
-- 
2.39.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg
  2023-02-13 10:41 fix circular disk reference in blk-cgroup Christoph Hellwig
  2023-02-13 10:41 ` [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data Christoph Hellwig
       [not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
@ 2023-02-13 10:41 ` Christoph Hellwig
  2023-02-13 12:11   ` Ming Lei
  2 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2023-02-13 10:41 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo, Josef Bacik; +Cc: Ming Lei, cgroups, linux-block

To avoid a circular reference, do not grab a device model reference
to the gendisk for each blkg, but just the lower level inode reference
preventing the memory from being freed.

This means blkg freeing and pd_free need to be careful to not rely
on anything torn down in disk_release.

Fixes: c43332fe028c ("blk-cgroup: delay calling blkcg_exit_disk until disk_release")
Reported-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-cgroup.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 935028912e7abf..9e7e48c8fa47ae 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -136,7 +136,7 @@ static void blkg_free_workfn(struct work_struct *work)
 	list_del_init(&blkg->entry);
 	mutex_unlock(&blkg->disk->blkcg_mutex);
 
-	put_disk(blkg->disk);
+	iput(blkg->disk->part0->bd_inode);
 	free_percpu(blkg->iostat_cpu);
 	percpu_ref_exit(&blkg->refcnt);
 	kfree(blkg);
@@ -264,9 +264,15 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	if (!blkg->iostat_cpu)
 		goto out_exit_refcnt;
 
+	/*
+	 * Grab a reference to the part0 inode, which keeps the memory backing
+	 * the gendisk from being released and safe for use in ->pd_free instead
+	 * of the full fledged device model reference because the blkgs are only
+	 * released in disk_release and would thus create circular references.
+	 */
 	if (test_bit(GD_DEAD, &disk->state))
 		goto out_free_iostat;
-	get_device(disk_to_dev(disk));
+	igrab(disk->part0->bd_inode);
 	blkg->disk = disk;
 
 	INIT_LIST_HEAD(&blkg->entry);
@@ -304,7 +310,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
 	while (--i >= 0)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
-	put_disk(blkg->disk);
+	iput(blkg->disk->part0->bd_inode);
 out_free_iostat:
 	free_percpu(blkg->iostat_cpu);
 out_exit_refcnt:
-- 
2.39.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg
  2023-02-13 10:41 ` [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg Christoph Hellwig
@ 2023-02-13 12:11   ` Ming Lei
  0 siblings, 0 replies; 5+ messages in thread
From: Ming Lei @ 2023-02-13 12:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Tejun Heo, Josef Bacik, cgroups, linux-block,
	ming.lei

On Mon, Feb 13, 2023 at 11:41:34AM +0100, Christoph Hellwig wrote:
> To avoid a circular reference, do not grab a device model reference
> to the gendisk for each blkg, but just the lower level inode reference
> preventing the memory from beeing freed.

It might not be enough to just prevent the gendisk memory from being freed:
any queue reference via disk->queue could become unsafe, given that
disk->queue can be released after disk_release() is called.

thanks,
Ming


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-02-13 12:11 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-02-13 10:41 fix circular disk reference in blk-cgroup Christoph Hellwig
2023-02-13 10:41 ` [PATCH 1/3] blk-throttle: store a gendisk in struct throtl_data Christoph Hellwig
     [not found] ` <20230213104134.475204-1-hch-jcswGhMUV9g@public.gmane.org>
2023-02-13 10:41   ` [PATCH 2/3] blk-throttle: move the throtl_data pointer from to struct gendisk Christoph Hellwig
2023-02-13 10:41 ` [PATCH 3/3] blk-cgroup: only grab an inode reference to the disk for each blkg Christoph Hellwig
2023-02-13 12:11   ` Ming Lei

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox