* [RFC PATCH 0/2] introduce budgt control in readahead
@ 2024-05-09 2:39 zhaoyang.huang
2024-05-09 2:39 ` [RFC PATCH 1/2] block: introduce helper function to calculate bps budgt zhaoyang.huang
2024-05-09 2:39 ` [RFC PATCH 2/2] mm: introduce budgt control in readahead zhaoyang.huang
0 siblings, 2 replies; 17+ messages in thread
From: zhaoyang.huang @ 2024-05-09 2:39 UTC (permalink / raw)
To: Andrew Morton, Matthew Wilcox, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
Zhaoyang Huang, steve.kang
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Over-limit BW values are observed during fio tests in the throttling
group, caused by over-sized bios, as there is no control on ra->size
during readahead. This patch series introduces a helper function that
provides the bytes limit and applies it in readahead.
Please see below for fio test results on v6.6, which show a 2%-10%
improvement in BW and latency. Besides, we also observed a stable
instantaneous BW value during the test.
blkio.throttle.read_bps_device = 1MB/s
before: read: IOPS=223, BW=894KiB/s (915kB/s)(175MiB/200919msec)
after : read: IOPS=239, BW=960KiB/s (983kB/s)(153MiB/163105msec)
before: clat (usec): min=4, max=16795k, avg=4468.74, stdev=265746.14
lat (usec): min=6, max=16795k, avg=4470.57, stdev=265746.14
after : clat (usec): min=11, max=209193, avg=4105.22, stdev=27188.04
lat (usec): min=16, max=209197, avg=4120.03, stdev=27188.04
blkio.throttle.read_bps_device = 10MB/s
before: read: IOPS=2380, BW=9524KiB/s (9752kB/s)(1007MiB/108311msec)
after : read: IOPS=2438, BW=9754KiB/s (9989kB/s)(1680MiB/176405msec)
before: clat (usec): min=4, max=201817, avg=399.58, stdev=8268.85
lat (usec): min=6, max=201819, avg=402.10, stdev=8268.85
after : clat (usec): min=4, max=2494.6k, avg=412.72, stdev=25783.51
lat (usec): min=6, max=2494.6k, avg=414.48, stdev=25783.51
Zhaoyang Huang (2):
block: introduce helper function to calculate bps budgt
mm: introduce budgt control in readahead
block/blk-throttle.c | 44 ++++++++++++++++++++++++++++++++++++++
include/linux/blk-cgroup.h | 10 +++++++++
mm/readahead.c | 33 ++++++++++++++++++++--------
3 files changed, 78 insertions(+), 9 deletions(-)
--
2.25.1
* [RFC PATCH 1/2] block: introduce helper function to calculate bps budgt
2024-05-09 2:39 [RFC PATCH 0/2] introduce budgt control in readahead zhaoyang.huang
@ 2024-05-09 2:39 ` zhaoyang.huang
2024-05-09 2:39 ` [RFC PATCH 2/2] mm: introduce budgt control in readahead zhaoyang.huang
1 sibling, 0 replies; 17+ messages in thread
From: zhaoyang.huang @ 2024-05-09 2:39 UTC (permalink / raw)
To: Andrew Morton, Matthew Wilcox, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
Zhaoyang Huang, steve.kang
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
An 'over-sized' bio under blk-throttle control will be delayed before
being launched, which breaks the original IO timing and causes the
instantaneous BW to exceed the bps limit. Introduce a helper function to
calculate the block device's budget, which provides the number of bytes
allowed for the current bio.
Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
block/blk-throttle.c | 44 ++++++++++++++++++++++++++++++++++++++
include/linux/blk-cgroup.h | 10 +++++++++
2 files changed, 54 insertions(+)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f4850a6f860b..41c75258183d 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,6 +10,7 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
+#include <linux/cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
@@ -2365,6 +2366,49 @@ void blk_throtl_bio_endio(struct bio *bio)
}
#endif
+unsigned long blk_throttle_budgt(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct blkcg *blkcg;
+ struct blkcg_gq *blkg;
+ struct throtl_grp *tg;
+ long long bytes_allowed = 0;
+ unsigned long jiffy_elapsed, jiffy_elapsed_rnd;
+ u64 bps_limit;
+
+ if (!q)
+ return U64_MAX;
+
+ rcu_read_lock();
+ spin_lock_irq(&q->queue_lock);
+ blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
+ if (!blkcg)
+ goto out;
+
+ blkg = blkg_lookup(blkcg, q);
+ if (!blkg || !blkg_tryget(blkg))
+ goto out;
+
+ tg = blkg_to_tg(blkg);
+ bps_limit = tg_bps_limit(tg, READ);
+ if (bps_limit == U64_MAX)
+ goto out;
+
+ jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[READ];
+ if (!jiffy_elapsed)
+ jiffy_elapsed_rnd = tg->td->throtl_slice;
+
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
+ tg->carryover_bytes[READ];
+ blkg_put(blkg);
+out:
+ spin_unlock_irq(&q->queue_lock);
+ rcu_read_unlock();
+ return bytes_allowed;
+}
+
+
int blk_throtl_init(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index dd5841a42c33..ba79fa464e0a 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -15,10 +15,12 @@
*/
#include <linux/types.h>
+#include <linux/limits.h>
struct bio;
struct cgroup_subsys_state;
struct gendisk;
+struct block_device;
#define FC_APPID_LEN 129
@@ -45,6 +47,14 @@ static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
}
#endif /* CONFIG_BLK_CGROUP */
+#ifdef CONFIG_BLK_DEV_THROTTLING
+unsigned long blk_throttle_budgt(struct block_device *bdev);
+#else
+static inline unsigned long blk_throttle_budgt(struct block_device *bdev)
+{
+ return U64_MAX;
+}
+#endif
int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len);
char *blkcg_get_fc_appid(struct bio *bio);
--
2.25.1
* [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-09 2:39 [RFC PATCH 0/2] introduce budgt control in readahead zhaoyang.huang
2024-05-09 2:39 ` [RFC PATCH 1/2] block: introduce helper function to calculate bps budgt zhaoyang.huang
@ 2024-05-09 2:39 ` zhaoyang.huang
2024-05-09 3:15 ` Matthew Wilcox
2024-05-09 12:39 ` Christoph Hellwig
1 sibling, 2 replies; 17+ messages in thread
From: zhaoyang.huang @ 2024-05-09 2:39 UTC (permalink / raw)
To: Andrew Morton, Matthew Wilcox, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
Zhaoyang Huang, steve.kang
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Currently, the readahead size is decided mainly by the page cache's
status, such as hit/miss or hole size, which can lead to suspension of a
subsequent bio that exceeds the blk-throttle allowed size when
BLK_DEV_THROTTLING is on. Introduce the budget value here to keep the
bio's size within the allowed limit.
Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
mm/readahead.c | 33 ++++++++++++++++++++++++---------
1 file changed, 24 insertions(+), 9 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index 130c0e7df99f..2b6120ced6f9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,6 +128,7 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
+#include <linux/minmax.h>
#include "internal.h"
@@ -358,16 +359,23 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
* Get the previous window size, ramp it up, and
* return it as the new window size.
*/
-static unsigned long get_next_ra_size(struct file_ra_state *ra,
+static unsigned long get_next_ra_size(struct readahead_control *ractl,
unsigned long max)
{
- unsigned long cur = ra->size;
+ unsigned long cur = ractl->ra->size;
+ struct inode *inode = ractl->mapping->host;
+ unsigned long budgt = inode->i_sb->s_bdev ?
+ blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
+ unsigned long val = max;
if (cur < max / 16)
- return 4 * cur;
+ val = 4 * cur;
if (cur <= max / 2)
- return 2 * cur;
- return max;
+ val = 2 * cur;
+
+ val = budgt ? min(budgt / PAGE_SIZE, val) : val;
+
+ return val;
}
/*
@@ -437,6 +445,8 @@ static int try_context_readahead(struct address_space *mapping,
unsigned long max)
{
pgoff_t size;
+ unsigned long budgt = mapping->host->i_sb->s_bdev ?
+ blk_throttle_budgt(mapping->host->i_sb->s_bdev) : 0;
size = count_history_pages(mapping, index, max);
@@ -455,7 +465,7 @@ static int try_context_readahead(struct address_space *mapping,
size *= 2;
ra->start = index;
- ra->size = min(size + req_size, max);
+ ra->size = min3(budgt / PAGE_SIZE, size + req_size, max);
ra->async_size = 1;
return 1;
@@ -552,6 +562,8 @@ static void ondemand_readahead(struct readahead_control *ractl,
pgoff_t index = readahead_index(ractl);
pgoff_t expected, prev_index;
unsigned int order = folio ? folio_order(folio) : 0;
+ unsigned long budgt = ractl->mapping->host->i_sb->s_bdev ?
+ blk_throttle_budgt(ractl->mapping->host->i_sb->s_bdev) : 0;
/*
* If the request exceeds the readahead window, allow the read to
@@ -574,7 +586,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
1UL << order);
if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max_pages);
+ ra->size = get_next_ra_size(ractl, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -599,7 +611,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
ra->start = start;
ra->size = start - index; /* old async_size */
ra->size += req_size;
- ra->size = get_next_ra_size(ra, max_pages);
+ ra->size = get_next_ra_size(ractl, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -631,6 +643,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
+ if (budgt)
+ req_size = min(budgt / PAGE_SIZE, req_size);
+
do_page_cache_ra(ractl, req_size, 0);
return;
@@ -647,7 +662,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
* Take care of maximum IO pages as above.
*/
if (index == ra->start && ra->size == ra->async_size) {
- add_pages = get_next_ra_size(ra, max_pages);
+ add_pages = get_next_ra_size(ractl, max_pages);
if (ra->size + add_pages <= max_pages) {
ra->async_size = add_pages;
ra->size += add_pages;
--
2.25.1
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-09 2:39 ` [RFC PATCH 2/2] mm: introduce budgt control in readahead zhaoyang.huang
@ 2024-05-09 3:15 ` Matthew Wilcox
2024-05-10 2:43 ` Zhaoyang Huang
2024-05-09 12:39 ` Christoph Hellwig
1 sibling, 1 reply; 17+ messages in thread
From: Matthew Wilcox @ 2024-05-09 3:15 UTC (permalink / raw)
To: zhaoyang.huang
Cc: Andrew Morton, Jens Axboe, Tejun Heo, Josef Bacik, Baolin Wang,
linux-mm, linux-block, linux-kernel, cgroups, Zhaoyang Huang,
steve.kang
On Thu, May 09, 2024 at 10:39:37AM +0800, zhaoyang.huang wrote:
> -static unsigned long get_next_ra_size(struct file_ra_state *ra,
> +static unsigned long get_next_ra_size(struct readahead_control *ractl,
> unsigned long max)
> {
> - unsigned long cur = ra->size;
> + unsigned long cur = ractl->ra->size;
> + struct inode *inode = ractl->mapping->host;
> + unsigned long budgt = inode->i_sb->s_bdev ?
> + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
You can't do this. There's no guarantee that the IO is going to
mapping->host->i_sb->s_bdev. You'd have to figure out how to ask the
filesystem to get the bdev for the particular range (eg the fs might
implement RAID internally).
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-09 2:39 ` [RFC PATCH 2/2] mm: introduce budgt control in readahead zhaoyang.huang
2024-05-09 3:15 ` Matthew Wilcox
@ 2024-05-09 12:39 ` Christoph Hellwig
2024-05-10 3:06 ` Zhaoyang Huang
1 sibling, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2024-05-09 12:39 UTC (permalink / raw)
To: zhaoyang.huang
Cc: Andrew Morton, Matthew Wilcox, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
Zhaoyang Huang, steve.kang
> + unsigned long budgt = inode->i_sb->s_bdev ?
> + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
The readahead code is used for all file systems, you can't just call
into block layer code here.
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-09 3:15 ` Matthew Wilcox
@ 2024-05-10 2:43 ` Zhaoyang Huang
2024-05-10 3:18 ` Matthew Wilcox
0 siblings, 1 reply; 17+ messages in thread
From: Zhaoyang Huang @ 2024-05-10 2:43 UTC (permalink / raw)
To: Matthew Wilcox
Cc: zhaoyang.huang, Andrew Morton, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
steve.kang
On Thu, May 9, 2024 at 11:15 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, May 09, 2024 at 10:39:37AM +0800, zhaoyang.huang wrote:
> > -static unsigned long get_next_ra_size(struct file_ra_state *ra,
> > +static unsigned long get_next_ra_size(struct readahead_control *ractl,
> > unsigned long max)
> > {
> > - unsigned long cur = ra->size;
> > + unsigned long cur = ractl->ra->size;
> > + struct inode *inode = ractl->mapping->host;
> > + unsigned long budgt = inode->i_sb->s_bdev ?
> > + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
>
> You can't do this. There's no guarantee that the IO is going to
> mapping->host->i_sb->s_bdev. You'd have to figure out how to ask the
> filesystem to get the bdev for the particular range (eg the fs might
> implement RAID internally).
>
Thanks for the prompt. I did some basic research on soft RAID and
wonder if applying the bps limit on /dev/md0 as below could make
this work.

mdadm -C -v /dev/md0 -l raid0 -n 2 /dev/sd[b-c]1
mount /dev/md0 /mnt/raid0/
echo "/dev/md0 100000" > blkio.throttle.read_bps_device

I didn't find information about 'RAID internally'. Could we set the
limit on the root device (the one used for mount) to manage the whole
partition without caring about where the bio finally goes? Or ask the
user to decide whether to use this by making sure the device they apply
it to will not do RAID?
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-09 12:39 ` Christoph Hellwig
@ 2024-05-10 3:06 ` Zhaoyang Huang
2024-05-10 4:14 ` Matthew Wilcox
0 siblings, 1 reply; 17+ messages in thread
From: Zhaoyang Huang @ 2024-05-10 3:06 UTC (permalink / raw)
To: Christoph Hellwig
Cc: zhaoyang.huang, Andrew Morton, Matthew Wilcox, Jens Axboe,
Tejun Heo, Josef Bacik, Baolin Wang, linux-mm, linux-block,
linux-kernel, cgroups, steve.kang
On Thu, May 9, 2024 at 8:40 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> > + unsigned long budgt = inode->i_sb->s_bdev ?
> > + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
>
> The readahead code is used for all file systems, you can't just call
> into block layer code here.
>
OK. I would like to hear any suggestions on introducing throttle
budget control into readahead, which actually works as a negative
feedback path. IMO, negative feedback is a good methodology that has
been used in the scheduler (EAS), thermal control (IPA), and memory
management (MGLRU). I would like to try making it work across the
boundary between the memory and block layers.
vfs_read / page fault
        |
    readahead <--------+
        |              |
aops->readpages        |
        |              |
  block_layer ---------+
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-10 2:43 ` Zhaoyang Huang
@ 2024-05-10 3:18 ` Matthew Wilcox
2024-05-11 7:35 ` Zhaoyang Huang
0 siblings, 1 reply; 17+ messages in thread
From: Matthew Wilcox @ 2024-05-10 3:18 UTC (permalink / raw)
To: Zhaoyang Huang
Cc: zhaoyang.huang, Andrew Morton, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
steve.kang
On Fri, May 10, 2024 at 10:43:20AM +0800, Zhaoyang Huang wrote:
> Thanks for the prompt. I did some basic research on soft RAID and
> wonder if applying the bps limit on /dev/md0 like below could make
> this work.
No. Look at btrfs' raid support, for example. It doesn't use md0.
> I didn't find information about 'RAID internally'. Could we set the
> limit on the root device(the one used for mount) to manage the whole
> partition without caring about where the bio finally goes? Or ask the
> user to decide if to use by making sure the device they apply will not
> do RAID?
No.
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-10 3:06 ` Zhaoyang Huang
@ 2024-05-10 4:14 ` Matthew Wilcox
2024-05-10 7:08 ` Zhaoyang Huang
0 siblings, 1 reply; 17+ messages in thread
From: Matthew Wilcox @ 2024-05-10 4:14 UTC (permalink / raw)
To: Zhaoyang Huang
Cc: Christoph Hellwig, zhaoyang.huang, Andrew Morton, Jens Axboe,
Tejun Heo, Josef Bacik, Baolin Wang, linux-mm, linux-block,
linux-kernel, cgroups, steve.kang
On Fri, May 10, 2024 at 11:06:14AM +0800, Zhaoyang Huang wrote:
> On Thu, May 9, 2024 at 8:40 PM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > > + unsigned long budgt = inode->i_sb->s_bdev ?
> > > + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
> >
> > The readahead code is used for all file systems, you can't just call
> > into block layer code here.
> >
> ok. I would like to know any suggestions on introducing throttle
> budget control into readahead which actually works as a negative
> feedback path. IMO, negative feedback is a good methodology which has
> been used in scheduler(EAS) and thermal control(IPA) and
> memory(MGLRU). I would like to suggest to have a try on have it work
> cross the boundary of memory and block layer.
>
> vfs_read / page fault
>         |
>     readahead <--------+
>         |              |
> aops->readpages        |
>         |              |
>   block_layer ---------+
What you could do is have blk-throttle fail bios that are tagged as
readahead if we've hit the threshold?
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-10 4:14 ` Matthew Wilcox
@ 2024-05-10 7:08 ` Zhaoyang Huang
0 siblings, 0 replies; 17+ messages in thread
From: Zhaoyang Huang @ 2024-05-10 7:08 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Christoph Hellwig, zhaoyang.huang, Andrew Morton, Jens Axboe,
Tejun Heo, Josef Bacik, Baolin Wang, linux-mm, linux-block,
linux-kernel, cgroups, steve.kang
On Fri, May 10, 2024 at 12:14 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, May 10, 2024 at 11:06:14AM +0800, Zhaoyang Huang wrote:
> > On Thu, May 9, 2024 at 8:40 PM Christoph Hellwig <hch@infradead.org> wrote:
> > >
> > > > + unsigned long budgt = inode->i_sb->s_bdev ?
> > > > + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
> > >
> > > The readahead code is used for all file systems, you can't just call
> > > into block layer code here.
> > >
> > ok. I would like to know any suggestions on introducing throttle
> > budget control into readahead which actually works as a negative
> > feedback path. IMO, negative feedback is a good methodology which has
> > been used in scheduler(EAS) and thermal control(IPA) and
> > memory(MGLRU). I would like to suggest to have a try on have it work
> > cross the boundary of memory and block layer.
> >
> > vfs_read / page fault
> >         |
> >     readahead <--------+
> >         |              |
> > aops->readpages        |
> >         |              |
> >   block_layer ---------+
>
> What you could do is have blk-throttle fail bios that are tagged as
> readahead if we've hit the threshold?
Actually, blk-throttle postpones an over-sized bio's launch by adding
it to the throttle group's private queue, which is exactly what this
idea aims at. The delay here could be avoided by some means of keeping
the bio within the maximum ability of the throttled blkcg. Furthermore,
we may get a completely non-over-sized readahead mechanism if we do
this well.
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-10 3:18 ` Matthew Wilcox
@ 2024-05-11 7:35 ` Zhaoyang Huang
2024-05-14 2:37 ` Zhaoyang Huang
0 siblings, 1 reply; 17+ messages in thread
From: Zhaoyang Huang @ 2024-05-11 7:35 UTC (permalink / raw)
To: Matthew Wilcox
Cc: zhaoyang.huang, Andrew Morton, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
steve.kang
On Fri, May 10, 2024 at 11:18 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Fri, May 10, 2024 at 10:43:20AM +0800, Zhaoyang Huang wrote:
> > Thanks for the prompt. I did some basic research on soft RAID and
> > wonder if applying the bps limit on /dev/md0 like below could make
> > this work.
>
> No. Look at btrfs' raid support, for example. It doesn't use md0.
If I understand the command below correctly, btrfs uses one of the
volumes within the RAID as the mount block device, not /dev/md0.
However, I think this is a problem of blkio.throttle rather than of
this commit, which means this readahead budget control will work
correctly as long as blkio.throttle's parameters are configured
accordingly (e.g. 50/50 on sdb and sdc).

mkfs.btrfs -m raid0 -d raid0 /dev/sdb /dev/sdc
mount -t btrfs /dev/sdb /mnt/btr
>
> > I didn't find information about 'RAID internally'. Could we set the
> > limit on the root device(the one used for mount) to manage the whole
> > partition without caring about where the bio finally goes? Or ask the
> > user to decide if to use by making sure the device they apply will not
> > do RAID?
>
> No.
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-11 7:35 ` Zhaoyang Huang
@ 2024-05-14 2:37 ` Zhaoyang Huang
0 siblings, 0 replies; 17+ messages in thread
From: Zhaoyang Huang @ 2024-05-14 2:37 UTC (permalink / raw)
To: Matthew Wilcox
Cc: zhaoyang.huang, Andrew Morton, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
steve.kang
On Sat, May 11, 2024 at 3:35 PM Zhaoyang Huang <huangzhaoyang@gmail.com> wrote:
>
> On Fri, May 10, 2024 at 11:18 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Fri, May 10, 2024 at 10:43:20AM +0800, Zhaoyang Huang wrote:
> > > Thanks for the prompt. I did some basic research on soft RAID and
> > > wonder if applying the bps limit on /dev/md0 like below could make
> > > this work.
> >
> > No. Look at btrfs' raid support, for example. It doesn't use md0.
> If I understand the command below correctly, btrfs uses one of the
> volumes within the RAID as the mount block device, not /dev/md0.
> However, I think this is a problem of blkio.throttle rather than of
> this commit, which means this readahead budget control will work
> correctly as long as blkio.throttle's parameters are configured
> accordingly (e.g. 50/50 on sdb and sdc).
>
> mkfs.btrfs -m raid0 -d raid0 /dev/sdb /dev/sdc
> mount -t btrfs /dev/sdb /mnt/btr
>
>
>
> >
> > > I didn't find information about 'RAID internally'. Could we set the
> > > limit on the root device(the one used for mount) to manage the whole
> > > partition without caring about where the bio finally goes? Or ask the
> > > user to decide if to use by making sure the device they apply will not
> > > do RAID?
> >
> > No.
@all, please find below more test results, which show that this commit
makes the result meet the desired value more closely, improving it by
3% over mainline.
echo "254:48 20000000" > blkio.throttle.read_bps_device
fio -filename=/data/ylog/ap/000-0101_000015_poweron.ylog -rw=read
-direct=0 -bs=4k -size=2000M -numjobs=8 -group_reporting -name=mytest
before : IOPS=37.9k, BW=148MiB/s (155MB/s)(11.6GiB/80333msec)
after : IOPS=39.0k, BW=153MiB/s (160MB/s)(15.6GiB/104914msec)
before : clat (usec): min=4, max=1056.6k, avg=197.23, stdev=10080.69
after : clat (usec): min=4, max=193481, avg=188.83, stdev=4651.29
before : lat (usec): min=5, max=1056.6k, avg=200.48, stdev=10080.76
after : lat (usec): min=5, max=193483, avg=192.68, stdev=4651.87
echo "254:48 30000000" > blkio.throttle.read_bps_device
fio -filename=/data/ylog/ap/000-0101_000015_poweron.ylog -rw=read
-direct=0 -bs=4k -size=2000M -numjobs=8 -group_reporting -name=mytest
before : IOPS=57.2k, BW=224MiB/s (234MB/s)(15.6GiB/71561msec)
after : IOPS=58.5k, BW=229MiB/s (240MB/s)(15.6GiB/69996msec)
before : clat (usec): min=4, max=1105.5k, avg=126.20, stdev=6419.22
after : clat (usec): min=4, max=183956, avg=120.60, stdev=2957.28
before : lat (usec): min=5, max=1105.5k, avg=129.45, stdev=6419.29
after : lat (usec): min=5, max=183958, avg=124.40, stdev=2958.18
* [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-15 1:23 [RFC PATCH 0/2] introduce precised blk-throttle control zhaoyang.huang
@ 2024-05-15 1:23 ` zhaoyang.huang
2024-05-15 4:09 ` Matthew Wilcox
2024-05-15 7:40 ` Tejun Heo
0 siblings, 2 replies; 17+ messages in thread
From: zhaoyang.huang @ 2024-05-15 1:23 UTC (permalink / raw)
To: Andrew Morton, Matthew Wilcox, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
Zhaoyang Huang, steve.kang
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
Currently, the readahead size is decided mainly by the page cache's
status, such as hit/miss or hole size, which can lead to suspension of a
subsequent bio that exceeds the blk-throttle allowed size when
BLK_DEV_THROTTLING is on. Introduce the budget value here to keep the
bio's size within the allowed limit.
Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
mm/readahead.c | 33 ++++++++++++++++++++++++---------
1 file changed, 24 insertions(+), 9 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index 130c0e7df99f..2b6120ced6f9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,6 +128,7 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
+#include <linux/minmax.h>
#include "internal.h"
@@ -358,16 +359,23 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
* Get the previous window size, ramp it up, and
* return it as the new window size.
*/
-static unsigned long get_next_ra_size(struct file_ra_state *ra,
+static unsigned long get_next_ra_size(struct readahead_control *ractl,
unsigned long max)
{
- unsigned long cur = ra->size;
+ unsigned long cur = ractl->ra->size;
+ struct inode *inode = ractl->mapping->host;
+ unsigned long budgt = inode->i_sb->s_bdev ?
+ blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
+ unsigned long val = max;
if (cur < max / 16)
- return 4 * cur;
+ val = 4 * cur;
if (cur <= max / 2)
- return 2 * cur;
- return max;
+ val = 2 * cur;
+
+ val = budgt ? min(budgt / PAGE_SIZE, val) : val;
+
+ return val;
}
/*
@@ -437,6 +445,8 @@ static int try_context_readahead(struct address_space *mapping,
unsigned long max)
{
pgoff_t size;
+ unsigned long budgt = mapping->host->i_sb->s_bdev ?
+ blk_throttle_budgt(mapping->host->i_sb->s_bdev) : 0;
size = count_history_pages(mapping, index, max);
@@ -455,7 +465,7 @@ static int try_context_readahead(struct address_space *mapping,
size *= 2;
ra->start = index;
- ra->size = min(size + req_size, max);
+ ra->size = min3(budgt / PAGE_SIZE, size + req_size, max);
ra->async_size = 1;
return 1;
@@ -552,6 +562,8 @@ static void ondemand_readahead(struct readahead_control *ractl,
pgoff_t index = readahead_index(ractl);
pgoff_t expected, prev_index;
unsigned int order = folio ? folio_order(folio) : 0;
+ unsigned long budgt = ractl->mapping->host->i_sb->s_bdev ?
+ blk_throttle_budgt(ractl->mapping->host->i_sb->s_bdev) : 0;
/*
* If the request exceeds the readahead window, allow the read to
@@ -574,7 +586,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
1UL << order);
if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
- ra->size = get_next_ra_size(ra, max_pages);
+ ra->size = get_next_ra_size(ractl, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -599,7 +611,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
ra->start = start;
ra->size = start - index; /* old async_size */
ra->size += req_size;
- ra->size = get_next_ra_size(ra, max_pages);
+ ra->size = get_next_ra_size(ractl, max_pages);
ra->async_size = ra->size;
goto readit;
}
@@ -631,6 +643,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
+ if (budgt)
+ req_size = min(budgt / PAGE_SIZE, req_size);
+
do_page_cache_ra(ractl, req_size, 0);
return;
@@ -647,7 +662,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
* Take care of maximum IO pages as above.
*/
if (index == ra->start && ra->size == ra->async_size) {
- add_pages = get_next_ra_size(ra, max_pages);
+ add_pages = get_next_ra_size(ractl, max_pages);
if (ra->size + add_pages <= max_pages) {
ra->async_size = add_pages;
ra->size += add_pages;
--
2.25.1
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-15 1:23 ` [RFC PATCH 2/2] mm: introduce budgt control in readahead zhaoyang.huang
@ 2024-05-15 4:09 ` Matthew Wilcox
2024-05-15 6:31 ` Zhaoyang Huang
2024-05-15 7:40 ` Tejun Heo
1 sibling, 1 reply; 17+ messages in thread
From: Matthew Wilcox @ 2024-05-15 4:09 UTC (permalink / raw)
To: zhaoyang.huang
Cc: Andrew Morton, Jens Axboe, Tejun Heo, Josef Bacik, Baolin Wang,
linux-mm, linux-block, linux-kernel, cgroups, Zhaoyang Huang,
steve.kang
On Wed, May 15, 2024 at 09:23:50AM +0800, zhaoyang.huang wrote:
> + unsigned long budgt = inode->i_sb->s_bdev ?
> + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
NAK as previously explained.
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-15 4:09 ` Matthew Wilcox
@ 2024-05-15 6:31 ` Zhaoyang Huang
0 siblings, 0 replies; 17+ messages in thread
From: Zhaoyang Huang @ 2024-05-15 6:31 UTC (permalink / raw)
To: Matthew Wilcox
Cc: zhaoyang.huang, Andrew Morton, Jens Axboe, Tejun Heo, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
steve.kang
On Wed, May 15, 2024 at 12:09 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, May 15, 2024 at 09:23:50AM +0800, zhaoyang.huang wrote:
> > + unsigned long budgt = inode->i_sb->s_bdev ?
> > + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
>
> NAK as previously explained.
OK. But this commit could work by following the blk-throttle
configuration, as long as that works on btrfs with internal RAID
enabled. Furthermore, this will help the blkcg meet the desired bps
value precisely.
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-15 1:23 ` [RFC PATCH 2/2] mm: introduce budgt control in readahead zhaoyang.huang
2024-05-15 4:09 ` Matthew Wilcox
@ 2024-05-15 7:40 ` Tejun Heo
2024-05-15 8:17 ` Zhaoyang Huang
1 sibling, 1 reply; 17+ messages in thread
From: Tejun Heo @ 2024-05-15 7:40 UTC (permalink / raw)
To: zhaoyang.huang
Cc: Andrew Morton, Matthew Wilcox, Jens Axboe, Josef Bacik,
Baolin Wang, linux-mm, linux-block, linux-kernel, cgroups,
Zhaoyang Huang, steve.kang
Hello,
On Wed, May 15, 2024 at 09:23:50AM +0800, zhaoyang.huang wrote:
> +static unsigned long get_next_ra_size(struct readahead_control *ractl,
> unsigned long max)
> {
> + unsigned long cur = ractl->ra->size;
> + struct inode *inode = ractl->mapping->host;
> + unsigned long budgt = inode->i_sb->s_bdev ?
> + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
Technical correctness aside, I'm not convinced it's generally a good
idea to bubble up one specific IO control mechanism's details all the
way up to the RA layer. Besides, what's the gain here? For a continuous
IO stream, whether some RA bios are oversized or not shouldn't matter,
no? Doesn't this just affect the accuracy of the last RA IO of a finite
read stream?
Thanks.
--
tejun
* Re: [RFC PATCH 2/2] mm: introduce budgt control in readahead
2024-05-15 7:40 ` Tejun Heo
@ 2024-05-15 8:17 ` Zhaoyang Huang
0 siblings, 0 replies; 17+ messages in thread
From: Zhaoyang Huang @ 2024-05-15 8:17 UTC (permalink / raw)
To: Tejun Heo
Cc: zhaoyang.huang, Andrew Morton, Matthew Wilcox, Jens Axboe,
Josef Bacik, Baolin Wang, linux-mm, linux-block, linux-kernel,
cgroups, steve.kang
On Wed, May 15, 2024 at 3:40 PM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Wed, May 15, 2024 at 09:23:50AM +0800, zhaoyang.huang wrote:
> > +static unsigned long get_next_ra_size(struct readahead_control *ractl,
> > unsigned long max)
> > {
> > + unsigned long cur = ractl->ra->size;
> > + struct inode *inode = ractl->mapping->host;
> > + unsigned long budgt = inode->i_sb->s_bdev ?
> > + blk_throttle_budgt(inode->i_sb->s_bdev) : 0;
>
> Technical correctness aside, I'm not convinced it's generally a good idea to
> bubble up one specific IO control mechanism's detail all the way upto RA
> layer. Besides what's the gain here? For continuous IO stream, whether some
> RA bios are oversized or not shouldn't matter, no? Doesn't this just affect
> the accuracy of the last RA IO of a finite read stream?
Thanks for the feedback. If I understand right, the oversized RA bios
of a finite read will fail to dispatch by being queued to the tg's
queue, which should be deemed as introducing a drop in IOPS.
submit_bio
  blk_throtl_bio
    if (!tg_may_dispatch) /* failed, queue the bio to tg's queue */
What we get here is a more precise BW for the throttled blkcg, as
below, where the 'after' result exactly meets the configured bps value,
with a small enhancement since there are no hung (oversized) bios any
more.
blkio.throttle.read_bps_device = 20MB/s
fio ... -numjobs=8 ...
before : IOPS=37.9k, BW=148MiB/s (155MB/s)(11.6GiB/80333msec)
after : IOPS=39.0k, BW=153MiB/s (160MB/s)(15.6GiB/104914msec)
before : clat (usec): min=4, max=1056.6k, avg=197.23, stdev=10080.69
after : clat (usec): min=4, max=193481, avg=188.83, stdev=4651.29
before : lat (usec): min=5, max=1056.6k, avg=200.48, stdev=10080.76
after : lat (usec): min=5, max=193483, avg=192.68, stdev=4651.87
blkio.throttle.read_bps_device = 30MB/s
fio ... -numjobs=8 ...
before : IOPS=57.2k, BW=224MiB/s (234MB/s)(15.6GiB/71561msec)
after : IOPS=58.5k, BW=229MiB/s (240MB/s)(15.6GiB/69996msec)
before : clat (usec): min=4, max=1105.5k, avg=126.20, stdev=6419.22
after : clat (usec): min=4, max=183956, avg=120.60, stdev=2957.28
before : lat (usec): min=5, max=1105.5k, avg=129.45, stdev=6419.29
after : lat (usec): min=5, max=183958, avg=124.40, stdev=2958.18
>
> Thanks.
>
> --
> tejun